llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/core/library_client.py +80 -3
- llama_stack/core/routing_tables/common.py +11 -0
- llama_stack/core/routing_tables/vector_stores.py +4 -0
- llama_stack/core/stack.py +16 -1
- llama_stack/core/storage/kvstore/kvstore.py +11 -0
- llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
- llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
- llama_stack/core/storage/kvstore/redis/redis.py +5 -0
- llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
- llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
- llama_stack/distributions/dell/doc_template.md +209 -0
- llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
- llama_stack/distributions/nvidia/doc_template.md +170 -0
- llama_stack/distributions/oci/doc_template.md +140 -0
- llama_stack/models/llama/llama3/dog.jpg +0 -0
- llama_stack/models/llama/llama3/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/dog.jpg +0 -0
- llama_stack/models/llama/resources/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/small_dog.jpg +0 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
- llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
- llama_stack/providers/remote/eval/nvidia/README.md +134 -0
- llama_stack/providers/remote/files/s3/README.md +266 -0
- llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
- llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
- llama_stack/providers/remote/safety/nvidia/README.md +78 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
- llama_stack/providers/utils/responses/responses_store.py +34 -0
- llama_stack/providers/utils/tools/mcp.py +258 -16
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
- llama_stack-0.4.4.dist-info/top_level.txt +1 -0
- llama_stack-0.4.2.dist-info/top_level.txt +0 -2
- llama_stack_api/__init__.py +0 -945
- llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/admin/api.py +0 -72
- llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/admin/models.py +0 -113
- llama_stack_api/agents.py +0 -173
- llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/batches/api.py +0 -53
- llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/batches/models.py +0 -78
- llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/common/errors.py +0 -95
- llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/common/responses.py +0 -77
- llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/connectors.py +0 -146
- llama_stack_api/conversations.py +0 -270
- llama_stack_api/datasetio.py +0 -55
- llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/datatypes.py +0 -373
- llama_stack_api/eval.py +0 -137
- llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/files/api.py +0 -51
- llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/files/models.py +0 -107
- llama_stack_api/inference.py +0 -1169
- llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/__init__.py +0 -945
- llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/llama_stack_api/admin/api.py +0 -72
- llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/llama_stack_api/admin/models.py +0 -113
- llama_stack_api/llama_stack_api/agents.py +0 -173
- llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/llama_stack_api/batches/api.py +0 -53
- llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/llama_stack_api/batches/models.py +0 -78
- llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/llama_stack_api/common/errors.py +0 -95
- llama_stack_api/llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/llama_stack_api/common/responses.py +0 -77
- llama_stack_api/llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/llama_stack_api/connectors.py +0 -146
- llama_stack_api/llama_stack_api/conversations.py +0 -270
- llama_stack_api/llama_stack_api/datasetio.py +0 -55
- llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/llama_stack_api/datatypes.py +0 -373
- llama_stack_api/llama_stack_api/eval.py +0 -137
- llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/llama_stack_api/files/api.py +0 -51
- llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/llama_stack_api/files/models.py +0 -107
- llama_stack_api/llama_stack_api/inference.py +0 -1169
- llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/models.py +0 -171
- llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/llama_stack_api/post_training.py +0 -370
- llama_stack_api/llama_stack_api/prompts.py +0 -203
- llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/llama_stack_api/providers/api.py +0 -16
- llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/llama_stack_api/providers/models.py +0 -24
- llama_stack_api/llama_stack_api/py.typed +0 -0
- llama_stack_api/llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/llama_stack_api/resource.py +0 -37
- llama_stack_api/llama_stack_api/router_utils.py +0 -160
- llama_stack_api/llama_stack_api/safety.py +0 -132
- llama_stack_api/llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/llama_stack_api/scoring.py +0 -93
- llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/llama_stack_api/shields.py +0 -93
- llama_stack_api/llama_stack_api/tools.py +0 -226
- llama_stack_api/llama_stack_api/vector_io.py +0 -941
- llama_stack_api/llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/llama_stack_api/version.py +0 -9
- llama_stack_api/models.py +0 -171
- llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/post_training.py +0 -370
- llama_stack_api/prompts.py +0 -203
- llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/providers/api.py +0 -16
- llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/providers/models.py +0 -24
- llama_stack_api/py.typed +0 -0
- llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/resource.py +0 -37
- llama_stack_api/router_utils.py +0 -160
- llama_stack_api/safety.py +0 -132
- llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/scoring.py +0 -93
- llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/shields.py +0 -93
- llama_stack_api/tools.py +0 -226
- llama_stack_api/vector_io.py +0 -941
- llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/version.py +0 -9
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,170 @@
+---
+orphan: true
+---
+# NVIDIA Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+
+## Prerequisites
+### NVIDIA API Keys
+
+Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
+## Deploying models
+To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding backend (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+  }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
+
+## Running Llama Stack with NVIDIA
+
+You can do this via a venv (built from source) or Docker (which has a pre-built image).
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+llama stack list-deps nvidia | xargs -L1 uv pip install
+NVIDIA_API_KEY=$NVIDIA_API_KEY \
+INFERENCE_MODEL=$INFERENCE_MODEL \
+llama stack run ./config.yaml \
+  --port 8321
+```
+
+## Example Notebooks
+For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
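The template above maps each NeMo service to its own environment variable (`NVIDIA_BASE_URL`, `NVIDIA_DATASETS_URL`, `NVIDIA_EVALUATOR_URL`, `NVIDIA_CUSTOMIZER_URL`, `GUARDRAILS_SERVICE_URL`, plus `NVIDIA_API_KEY`). A minimal sketch that collects them in one place before launching the stack; the endpoint values are placeholders (the template's own examples use `http://nemo.test`), so substitute your actual deployments:

```python
import os

# Placeholder endpoints -- adjust to your NIM Proxy / NeMo deployments.
nvidia_env = {
    "NVIDIA_API_KEY": "nvapi-...",                 # key from https://build.nvidia.com/
    "NVIDIA_BASE_URL": "http://nemo.test",         # NIM Proxy (inference)
    "NVIDIA_DATASETS_URL": "http://nemo.test",     # NeMo Data Store (datasetio)
    "NVIDIA_EVALUATOR_URL": "http://nemo.test",    # NeMo Evaluator (eval)
    "NVIDIA_CUSTOMIZER_URL": "http://nemo.test",   # NeMo Customizer (post-training)
    "GUARDRAILS_SERVICE_URL": "http://nemo.test",  # NeMo Guardrails (safety)
}

# Export them for a subsequent `llama stack run` in the same environment.
for key, value in nvidia_env.items():
    os.environ.setdefault(key, value)
```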
@@ -0,0 +1,140 @@
+---
+orphan: true
+---
+# OCI Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+## Prerequisites
+### Oracle Cloud Infrastructure Setup
+
+Before using the OCI Generative AI distribution, ensure you have:
+
+1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+4. **Authentication**: Configure authentication using either:
+   - **Instance Principal** (recommended for cloud-hosted deployments)
+   - **API Key** (for on-premises or development environments)
+
+### Authentication Methods
+
+#### Instance Principal Authentication (Recommended)
+Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+Requirements:
+- Instance must be running in an Oracle Cloud Infrastructure compartment
+- Instance must have appropriate IAM policies to access Generative AI services
+
+#### API Key Authentication
+For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
+### Required IAM Policies
+
+Ensure your OCI user or instance has the following policy statements:
+
+```
+Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+```
+
+## Supported Services
+
+### Inference: OCI Generative AI
+Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+- **Chat Completions**: Conversational AI with context awareness
+- **Text Generation**: Complete prompts and generate text content
+
+#### Available Models
+Commonly available OCI Generative AI models include offerings from Meta, Cohere, OpenAI, Grok, and more.
+
+### Safety: Llama Guard
+For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
+- Content filtering and moderation
+- Policy compliance checking
+- Harmful content detection
+
+### Vector Storage: Multiple Options
+The distribution supports several vector storage providers:
+- **FAISS**: Local in-memory vector search
+- **ChromaDB**: Distributed vector database
+- **PGVector**: PostgreSQL with vector extensions
+
+### Additional Services
+- **Dataset I/O**: Local filesystem and Hugging Face integration
+- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+- **Evaluation**: Meta reference evaluation framework
+
+## Running Llama Stack with OCI
+
+You can run the OCI distribution via Docker or a local virtual environment.
+
+### Via venv
+
+If you've set up your local development environment, you can run the distribution using your local virtual environment.
+
+```bash
+OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+```
+
+### Configuration Examples
+
+#### Using Instance Principal (Recommended for Production)
+```bash
+export OCI_AUTH_TYPE=instance_principal
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+```
+
+#### Using API Key Authentication (Development)
+```bash
+export OCI_AUTH_TYPE=config_file
+export OCI_CONFIG_FILE_PATH=~/.oci/config
+export OCI_CLI_PROFILE=DEFAULT
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+```
+
+## Regional Endpoints
+
+OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+3. **Permission Denied**: Check compartment permissions and Generative AI service access
+4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+### Getting Help
+
+For additional support:
+- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
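As a companion to the configuration examples above, a small sketch that selects one of the two documented auth modes and launches the distribution. The variable names and the `llama stack run --port 8321 oci` command come from the template; the region, compartment value, and the subprocess wrapper are illustrative:

```python
import os
import subprocess

USE_INSTANCE_PRINCIPAL = False  # flip to True on an OCI compute instance

env = dict(os.environ)
if USE_INSTANCE_PRINCIPAL:
    env["OCI_AUTH_TYPE"] = "instance_principal"
else:
    env.update(
        {
            "OCI_AUTH_TYPE": "config_file",
            "OCI_CONFIG_FILE_PATH": os.path.expanduser("~/.oci/config"),
            "OCI_CLI_PROFILE": "DEFAULT",
        }
    )
env.update(
    {
        "OCI_REGION": "us-chicago-1",
        "OCI_COMPARTMENT_OCID": "ocid1.compartment.oc1..your-compartment-id",
    }
)

# Same command as the "Via venv" section above.
subprocess.run(["llama", "stack", "run", "--port", "8321", "oci"], env=env, check=True)
```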
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import asyncio
 import re
 import time
 import uuid
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.responses.responses_store import (
     ResponsesStore,
     _OpenAIResponseObjectWithInputAndMessages,
 )
+from llama_stack.providers.utils.tools.mcp import MCPSessionManager
 from llama_stack_api import (
     ConversationItem,
     Conversations,
@@ -322,6 +324,125 @@ class OpenAIResponsesImpl:
             messages=messages,
         )
 
+    def _prepare_input_items_for_storage(
+        self,
+        input: str | list[OpenAIResponseInput],
+    ) -> list[OpenAIResponseInput]:
+        """Prepare input items for storage, adding IDs where needed.
+
+        This method is called once at the start of streaming to prepare input items
+        that will be reused across multiple persistence calls during streaming.
+        """
+        new_input_id = f"msg_{uuid.uuid4()}"
+        input_items_data: list[OpenAIResponseInput] = []
+
+        if isinstance(input, str):
+            input_content = OpenAIResponseInputMessageContentText(text=input)
+            input_content_item = OpenAIResponseMessage(
+                role="user",
+                content=[input_content],
+                id=new_input_id,
+            )
+            input_items_data = [input_content_item]
+        else:
+            for input_item in input:
+                if isinstance(input_item, OpenAIResponseMessage):
+                    input_item_dict = input_item.model_dump()
+                    if "id" not in input_item_dict:
+                        input_item_dict["id"] = new_input_id
+                    input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                else:
+                    input_items_data.append(input_item)
+
+        return input_items_data
+
+    async def _persist_streaming_state(
+        self,
+        stream_chunk: OpenAIResponseObjectStream,
+        orchestrator,
+        input_items: list[OpenAIResponseInput],
+        output_items: list,
+    ) -> None:
+        """Persist response state at significant streaming events.
+
+        This enables clients to poll GET /v1/responses/{response_id} during streaming
+        to see in-progress turn state instead of empty results.
+
+        Persistence occurs at:
+        - response.in_progress: Initial INSERT with empty output
+        - response.output_item.done: UPDATE with accumulated output items
+        - response.completed/response.incomplete: Final UPDATE with complete state
+        - response.failed: UPDATE with error state
+
+        :param stream_chunk: The current streaming event.
+        :param orchestrator: The streaming orchestrator (for snapshotting response).
+        :param input_items: Pre-prepared input items for storage.
+        :param output_items: Accumulated output items so far.
+        """
+        try:
+            match stream_chunk.type:
+                case "response.in_progress":
+                    # Initial persistence when response starts
+                    in_progress_response = stream_chunk.response
+                    await self.responses_store.upsert_response_object(
+                        response_object=in_progress_response,
+                        input=input_items,
+                        messages=[],
+                    )
+
+                case "response.output_item.done":
+                    # Incremental update when an output item completes (tool call, message)
+                    current_snapshot = orchestrator._snapshot_response(
+                        status="in_progress",
+                        outputs=output_items,
+                    )
+                    # Get current messages (filter out system messages)
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages or orchestrator.ctx.messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=current_snapshot,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+
+                case "response.completed" | "response.incomplete":
+                    # Final persistence when response finishes
+                    final_response = stream_chunk.response
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=final_response,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+
+                case "response.failed":
+                    # Persist failed state so GET shows error
+                    failed_response = stream_chunk.response
+                    # Preserve any accumulated non-system messages for failed responses
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages or orchestrator.ctx.messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=failed_response,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+        except Exception as e:
+            # Best-effort persistence: log error but don't fail the stream
+            logger.warning(f"Failed to persist streaming state for {stream_chunk.type}: {e}")
+
     async def create_openai_response(
         self,
         input: str | list[OpenAIResponseInput],
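The new `_persist_streaming_state` helper above writes in-progress snapshots to the responses store specifically so that `GET /v1/responses/{response_id}` returns useful data while a stream is still running. A minimal client-side sketch of that polling pattern; the base URL and the use of `requests` are assumptions, not part of this change:

```python
import time

import requests  # assumed to be available in the client environment

BASE_URL = "http://localhost:8321/v1"  # assumption: point this at your Llama Stack server


def poll_response(response_id: str, interval: float = 1.0, timeout: float = 120.0) -> dict:
    """Poll an in-progress response until it reaches a terminal status."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = requests.get(f"{BASE_URL}/responses/{response_id}", timeout=10)
        resp.raise_for_status()
        body = resp.json()
        # While streaming, the store now holds an "in_progress" snapshot containing
        # whatever output items have completed so far, instead of nothing at all.
        if body.get("status") in {"completed", "incomplete", "failed"}:
            return body
        time.sleep(interval)
    raise TimeoutError(f"response {response_id} did not finish within {timeout}s")
```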
@@ -489,6 +610,19 @@ class OpenAIResponsesImpl:
         response_id = f"resp_{uuid.uuid4()}"
         created_at = int(time.time())
 
+        # Create a per-request MCP session manager for session reuse (fix for #4452)
+        # This avoids redundant tools/list calls when making multiple MCP tool invocations
+        mcp_session_manager = MCPSessionManager()
+
+        # Create a per-request ToolExecutor with the session manager
+        request_tool_executor = ToolExecutor(
+            tool_groups_api=self.tool_groups_api,
+            tool_runtime_api=self.tool_runtime_api,
+            vector_io_api=self.vector_io_api,
+            vector_stores_config=self.tool_executor.vector_stores_config,
+            mcp_session_manager=mcp_session_manager,
+        )
+
         orchestrator = StreamingResponseOrchestrator(
             inference_api=self.inference_api,
             ctx=ctx,
@@ -498,7 +632,7 @@ class OpenAIResponsesImpl:
             text=text,
             max_infer_iters=max_infer_iters,
             parallel_tool_calls=parallel_tool_calls,
-            tool_executor=
+            tool_executor=request_tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
             instructions=instructions,
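The per-request `ToolExecutor` above exists so that a single `MCPSessionManager` is shared by every MCP call made while serving one response, rather than re-opening a session (and re-issuing `tools/list`) per call. A rough sketch of that lifecycle using the keyword arguments visible in this diff; the tool names and arguments are made up, and the assumption that `invoke_mcp_tool` lives in the same module as `MCPSessionManager` is mine, not the source's:

```python
# Sketch only: mirrors the request-scoped session-reuse pattern introduced in this diff.
import asyncio

from llama_stack.providers.utils.tools.mcp import MCPSessionManager, invoke_mcp_tool


async def run_two_tool_calls(endpoint: str, headers: dict, authorization: str | None):
    session_manager = MCPSessionManager()  # one manager per request
    try:
        # Both invocations reuse the same underlying MCP session.
        for tool_name, kwargs in [("search", {"q": "llama"}), ("fetch", {"id": "42"})]:
            await invoke_mcp_tool(
                endpoint=endpoint,
                tool_name=tool_name,
                kwargs=kwargs,
                headers=headers,
                authorization=authorization,
                session_manager=session_manager,
            )
    finally:
        # Always tear the sessions down at the end of the request.
        await asyncio.shield(session_manager.close_all())
```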
@@ -513,41 +647,58 @@ class OpenAIResponsesImpl:
 
         # Type as ConversationItem to avoid list invariance issues
         output_items: list[ConversationItem] = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        messages_to_store = list(
-            filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
-        )
+
+        # Prepare input items for storage once (used by all persistence calls)
+        input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+
+        try:
+            async for stream_chunk in orchestrator.create_response():
+                match stream_chunk.type:
+                    case "response.completed" | "response.incomplete":
+                        final_response = stream_chunk.response
+                    case "response.failed":
+                        failed_response = stream_chunk.response
+                    case "response.output_item.done":
+                        item = stream_chunk.item
+                        output_items.append(item)
+                    case _:
+                        pass  # Other event types
+
+                # Incremental persistence: persist on significant state changes
+                # This enables clients to poll GET /v1/responses/{response_id} during streaming
                 if store:
-
-
-
-
-
+                    await self._persist_streaming_state(
+                        stream_chunk=stream_chunk,
+                        orchestrator=orchestrator,
+                        input_items=input_items_for_storage,
+                        output_items=output_items,
                     )
 
-
-
-
-
-
+                # Store and sync before yielding terminal events
+                # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
+                if (
+                    stream_chunk.type in {"response.completed", "response.incomplete"}
+                    and final_response
+                    and failed_response is None
+                ):
+                    if conversation:
+                        messages_to_store = list(
+                            filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
+                        )
+                        await self._sync_response_to_conversation(conversation, input, output_items)
+                        await self.responses_store.store_conversation_messages(conversation, messages_to_store)
+
+                yield stream_chunk
+        finally:
+            # Clean up MCP sessions at the end of the request (fix for #4452)
+            # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
+            # Wrap in try/except as cleanup errors should not mask the original response
+            try:
+                await asyncio.shield(mcp_session_manager.close_all())
+            except BaseException as e:
+                # Debug level - cleanup errors are expected in streaming scenarios where
+                # anyio cancel scopes may be in a different task context
+                logger.debug(f"Error during MCP session cleanup: {e}")
 
     async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
         return await self.responses_store.delete_response_object(response_id)
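The `finally` block above wraps cleanup in `asyncio.shield()` so that a cancelled or abandoned stream still closes its MCP sessions. A self-contained illustration of that pattern (no Llama Stack imports; `FakeSessionManager` is purely hypothetical):

```python
import asyncio


class FakeSessionManager:
    """Hypothetical stand-in for a per-request resource holder."""

    async def close_all(self) -> None:
        await asyncio.sleep(0.1)  # pretend to tear down network sessions
        print("sessions closed")


async def stream_chunks():
    manager = FakeSessionManager()
    try:
        for i in range(10):
            await asyncio.sleep(0.05)
            yield f"chunk {i}"
    finally:
        # shield() keeps close_all() running even if the surrounding task is cancelled;
        # swallowing BaseException keeps cleanup errors from masking the original outcome.
        try:
            await asyncio.shield(manager.close_all())
        except BaseException:
            pass


async def main():
    gen = stream_chunks()
    async for chunk in gen:
        if chunk == "chunk 2":
            break  # consumer abandons the stream early
    await gen.aclose()  # triggers the finally block, which still closes the sessions


asyncio.run(main())
```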
@@ -1200,6 +1200,9 @@ class StreamingResponseOrchestrator:
             "mcp_list_tools_id": list_id,
         }
 
+        # Get session manager from tool_executor if available (fix for #4452)
+        session_manager = getattr(self.tool_executor, "mcp_session_manager", None)
+
         # TODO: follow semantic conventions for Open Telemetry tool spans
         # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
         with tracer.start_as_current_span("list_mcp_tools", attributes=attributes):
@@ -1207,6 +1210,7 @@ class StreamingResponseOrchestrator:
                 endpoint=mcp_tool.server_url,
                 headers=mcp_tool.headers,
                 authorization=mcp_tool.authorization,
+                session_manager=session_manager,
             )
 
             # Create the MCP list tools message
@@ -54,11 +54,14 @@ class ToolExecutor:
         tool_runtime_api: ToolRuntime,
         vector_io_api: VectorIO,
         vector_stores_config=None,
+        mcp_session_manager=None,
     ):
         self.tool_groups_api = tool_groups_api
         self.tool_runtime_api = tool_runtime_api
         self.vector_io_api = vector_io_api
         self.vector_stores_config = vector_stores_config
+        # Optional MCPSessionManager for session reuse within a request (fix for #4452)
+        self.mcp_session_manager = mcp_session_manager
 
     async def execute_tool_call(
         self,
@@ -233,6 +236,7 @@ class ToolExecutor:
                 "document_ids": [r.file_id for r in search_results],
                 "chunks": [r.content[0].text if r.content else "" for r in search_results],
                 "scores": [r.score for r in search_results],
+                "attributes": [r.attributes or {} for r in search_results],
                 "citation_files": citation_files,
             },
         )
@@ -327,12 +331,14 @@ class ToolExecutor:
            # TODO: follow semantic conventions for Open Telemetry tool spans
            # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
            with tracer.start_as_current_span("invoke_mcp_tool", attributes=attributes):
+                # Pass session_manager for session reuse within request (fix for #4452)
                result = await invoke_mcp_tool(
                    endpoint=mcp_tool.server_url,
                    tool_name=function_name,
                    kwargs=tool_kwargs,
                    headers=mcp_tool.headers,
                    authorization=mcp_tool.authorization,
+                    session_manager=self.mcp_session_manager,
                )
         elif function_name == "knowledge_search":
             response_file_search_tool = (
@@ -464,16 +470,18 @@ class ToolExecutor:
             )
             if result and (metadata := getattr(result, "metadata", None)) and "document_ids" in metadata:
                 message.results = []
+                attributes_list = metadata.get("attributes", [])
                 for i, doc_id in enumerate(metadata["document_ids"]):
                     text = metadata["chunks"][i] if "chunks" in metadata else None
                     score = metadata["scores"][i] if "scores" in metadata else None
+                    attrs = attributes_list[i] if i < len(attributes_list) else {}
                     message.results.append(
                         OpenAIResponseOutputMessageFileSearchToolCallResults(
                             file_id=doc_id,
                             filename=doc_id,
                             text=text if text is not None else "",
                             score=score if score is not None else 0.0,
-                            attributes=
+                            attributes=attrs,
                         )
                     )
             if has_error:
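The `attributes` plumbing above is deliberately defensive: metadata written before this change has no `attributes` list, so each result falls back to `{}`. A standalone sketch of the same backward-compatible pattern (the metadata dicts here are made up):

```python
def zip_search_results(metadata: dict) -> list[dict]:
    """Pair each document id with its chunk, score, and (optionally) attributes."""
    attributes_list = metadata.get("attributes", [])
    results = []
    for i, doc_id in enumerate(metadata["document_ids"]):
        results.append(
            {
                "file_id": doc_id,
                "text": metadata["chunks"][i] if "chunks" in metadata else "",
                "score": metadata["scores"][i] if "scores" in metadata else 0.0,
                # Older metadata without "attributes" still works: fall back to {}.
                "attributes": attributes_list[i] if i < len(attributes_list) else {},
            }
        )
    return results


old_style = {"document_ids": ["doc-1"], "chunks": ["hello"], "scores": [0.9]}
new_style = {**old_style, "attributes": [{"author": "me"}]}
assert zip_search_results(old_style)[0]["attributes"] == {}
assert zip_search_results(new_style)[0]["attributes"] == {"author": "me"}
```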
@@ -0,0 +1,9 @@
+#import <Foundation/Foundation.h>
+
+//! Project version number for LocalInference.
+FOUNDATION_EXPORT double LocalInferenceVersionNumber;
+
+//! Project version string for LocalInference.
+FOUNDATION_EXPORT const unsigned char LocalInferenceVersionString[];
+
+// In this header, you should import all the public headers of your framework using statements like #import <LocalInference/PublicHeader.h>