llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff compares publicly available package versions that were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- llama_stack/core/library_client.py +80 -3
- llama_stack/core/routing_tables/common.py +11 -0
- llama_stack/core/routing_tables/vector_stores.py +4 -0
- llama_stack/core/stack.py +16 -1
- llama_stack/core/storage/kvstore/kvstore.py +11 -0
- llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
- llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
- llama_stack/core/storage/kvstore/redis/redis.py +5 -0
- llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
- llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
- llama_stack/distributions/dell/doc_template.md +209 -0
- llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
- llama_stack/distributions/nvidia/doc_template.md +170 -0
- llama_stack/distributions/oci/doc_template.md +140 -0
- llama_stack/models/llama/llama3/dog.jpg +0 -0
- llama_stack/models/llama/llama3/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/dog.jpg +0 -0
- llama_stack/models/llama/resources/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/small_dog.jpg +0 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
- llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
- llama_stack/providers/remote/eval/nvidia/README.md +134 -0
- llama_stack/providers/remote/files/s3/README.md +266 -0
- llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
- llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
- llama_stack/providers/remote/safety/nvidia/README.md +78 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
- llama_stack/providers/utils/responses/responses_store.py +34 -0
- llama_stack/providers/utils/tools/mcp.py +258 -16
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
- llama_stack-0.4.4.dist-info/top_level.txt +1 -0
- llama_stack-0.4.2.dist-info/top_level.txt +0 -2
- llama_stack_api/__init__.py +0 -945
- llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/admin/api.py +0 -72
- llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/admin/models.py +0 -113
- llama_stack_api/agents.py +0 -173
- llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/batches/api.py +0 -53
- llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/batches/models.py +0 -78
- llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/common/errors.py +0 -95
- llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/common/responses.py +0 -77
- llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/connectors.py +0 -146
- llama_stack_api/conversations.py +0 -270
- llama_stack_api/datasetio.py +0 -55
- llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/datatypes.py +0 -373
- llama_stack_api/eval.py +0 -137
- llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/files/api.py +0 -51
- llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/files/models.py +0 -107
- llama_stack_api/inference.py +0 -1169
- llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/__init__.py +0 -945
- llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/llama_stack_api/admin/api.py +0 -72
- llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/llama_stack_api/admin/models.py +0 -113
- llama_stack_api/llama_stack_api/agents.py +0 -173
- llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/llama_stack_api/batches/api.py +0 -53
- llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/llama_stack_api/batches/models.py +0 -78
- llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/llama_stack_api/common/errors.py +0 -95
- llama_stack_api/llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/llama_stack_api/common/responses.py +0 -77
- llama_stack_api/llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/llama_stack_api/connectors.py +0 -146
- llama_stack_api/llama_stack_api/conversations.py +0 -270
- llama_stack_api/llama_stack_api/datasetio.py +0 -55
- llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/llama_stack_api/datatypes.py +0 -373
- llama_stack_api/llama_stack_api/eval.py +0 -137
- llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/llama_stack_api/files/api.py +0 -51
- llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/llama_stack_api/files/models.py +0 -107
- llama_stack_api/llama_stack_api/inference.py +0 -1169
- llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/models.py +0 -171
- llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/llama_stack_api/post_training.py +0 -370
- llama_stack_api/llama_stack_api/prompts.py +0 -203
- llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/llama_stack_api/providers/api.py +0 -16
- llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/llama_stack_api/providers/models.py +0 -24
- llama_stack_api/llama_stack_api/py.typed +0 -0
- llama_stack_api/llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/llama_stack_api/resource.py +0 -37
- llama_stack_api/llama_stack_api/router_utils.py +0 -160
- llama_stack_api/llama_stack_api/safety.py +0 -132
- llama_stack_api/llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/llama_stack_api/scoring.py +0 -93
- llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/llama_stack_api/shields.py +0 -93
- llama_stack_api/llama_stack_api/tools.py +0 -226
- llama_stack_api/llama_stack_api/vector_io.py +0 -941
- llama_stack_api/llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/llama_stack_api/version.py +0 -9
- llama_stack_api/models.py +0 -171
- llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/post_training.py +0 -370
- llama_stack_api/prompts.py +0 -203
- llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/providers/api.py +0 -16
- llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/providers/models.py +0 -24
- llama_stack_api/py.typed +0 -0
- llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/resource.py +0 -37
- llama_stack_api/router_utils.py +0 -160
- llama_stack_api/safety.py +0 -132
- llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/scoring.py +0 -93
- llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/shields.py +0 -93
- llama_stack_api/tools.py +0 -226
- llama_stack_api/vector_io.py +0 -941
- llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/version.py +0 -9
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0

llama_stack/providers/remote/post_training/nvidia/README.md

@@ -0,0 +1,151 @@

# NVIDIA Post-Training Provider for LlamaStack

This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service.

## Features

- Supervised fine-tuning of Llama models
- LoRA fine-tuning support
- Job management and status tracking

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to Hosted NVIDIA NeMo Customizer service
- Dataset registered in the Hosted NVIDIA NeMo Customizer service
- Base model downloaded and available in the Hosted NVIDIA NeMo Customizer service

### Setup

Build the NVIDIA environment:

```bash
uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
```

### Basic Usage using the LlamaStack Python Client

### Create Customization Job

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = "your-api-key"
os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
os.environ["NVIDIA_PROJECT_ID"] = "test-project"
os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = "test-example-model@v1"

from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

#### Configure fine-tuning parameters

```python
from llama_stack_client.types.post_training_supervised_fine_tune_params import (
    TrainingConfig,
    TrainingConfigDataConfig,
    TrainingConfigOptimizerConfig,
)
from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
```

#### Set up LoRA configuration

```python
algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16)
```

#### Configure training data

```python
data_config = TrainingConfigDataConfig(
    dataset_id="your-dataset-id",  # Use client.datasets.list() to see available datasets
    batch_size=16,
)
```

#### Configure optimizer

```python
optimizer_config = TrainingConfigOptimizerConfig(
    lr=0.0001,
)
```

#### Set up training configuration

```python
training_config = TrainingConfig(
    n_epochs=2,
    data_config=data_config,
    optimizer_config=optimizer_config,
)
```

#### Start fine-tuning job

```python
training_job = client.post_training.supervised_fine_tune(
    job_uuid="unique-job-id",
    model="meta-llama/Llama-3.1-8B-Instruct",
    checkpoint_dir="",
    algorithm_config=algorithm_config,
    training_config=training_config,
    logger_config={},
    hyperparam_search_config={},
)
```

### List all jobs

```python
jobs = client.post_training.job.list()
```

### Check job status

```python
job_status = client.post_training.job.status(job_uuid="your-job-id")
```
```
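
Customization jobs run asynchronously, so the status call above is typically wrapped in a polling loop. Below is a minimal sketch; the `status` attribute name and the terminal state values are assumptions about the job-status object returned by the client, not something this README guarantees, so adjust them to your llama-stack-client version.

```python
import time

# Illustrative polling loop around the status call shown above.
TERMINAL_STATES = {"completed", "failed", "cancelled"}

while True:
    job_status = client.post_training.job.status(job_uuid="unique-job-id")
    state = getattr(job_status, "status", None)  # field name assumed
    print(f"Job state: {state}")
    if state in TERMINAL_STATES:
        break
    time.sleep(30)  # fine-tuning jobs take a while; poll at a modest interval
```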

### Cancel a job

```python
client.post_training.job.cancel(job_uuid="your-job-id")
```

### Inference with the fine-tuned model

#### 1. Register the model

```python
from llama_stack_api.models import Model, ModelType

client.models.register(
    model_id="test-example-model@v1",
    provider_id="nvidia",
    provider_model_id="test-example-model@v1",
    model_type=ModelType.llm,
)
```

#### 2. Inference with the fine-tuned model

```python
response = client.completions.create(
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    model="test-example-model@v1",
    max_tokens=50,
)
print(response.choices[0].text)
```
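
The registered model can also be called through the OpenAI-compatible chat completions surface instead of the plain completions endpoint. The sketch below is illustrative and assumes the `nvidia` distribution exposes chat completions for the fine-tuned model; it is not taken from this README.

```python
# Chat-style inference with the fine-tuned model (illustrative; same client as above).
chat_response = client.chat.completions.create(
    model="test-example-model@v1",
    messages=[
        {
            "role": "user",
            "content": "Complete the sentence using one word: Roses are red, violets are ",
        }
    ],
    max_tokens=50,
    stream=False,
)
print(chat_response.choices[0].message.content)
```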

llama_stack/providers/remote/safety/nvidia/README.md

@@ -0,0 +1,78 @@

# NVIDIA Safety Provider for LlamaStack

This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.

## Features

- Run safety checks for messages

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to NVIDIA NeMo Guardrails service
- NIM for model to use for safety check is deployed

### Setup

Build the NVIDIA environment:

```bash
uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
```

### Basic Usage using the LlamaStack Python Client

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = "your-api-key"
os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"

from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

#### Create a safety shield

```python
from llama_stack_api.safety import Shield
from llama_stack_api.inference import Message

# Create a safety shield
shield = Shield(
    shield_id="your-shield-id",
    provider_resource_id="safety-model-id",  # The model to use for safety checks
    description="Safety checks for content moderation",
)

# Register the shield
await client.safety.register_shield(shield)
```

#### Run safety checks

```python
# Messages to check
messages = [Message(role="user", content="Your message to check")]

# Run safety check
response = await client.safety.run_shield(
    shield_id="your-shield-id",
    messages=messages,
)

# Check for violations
if response.violation:
    print(f"Safety violation detected: {response.violation.user_message}")
    print(f"Violation level: {response.violation.violation_level}")
    print(f"Metadata: {response.violation.metadata}")
else:
    print("No safety violations detected")
```
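
In practice the shield check is run against user input before the request is forwarded to a model (and optionally against the model's output as well). Below is a minimal sketch of that pattern; it assumes the same `nvidia` distribution client also serves inference, and the `guarded_completion` helper name and model argument are illustrative, not part of this provider's API.

```python
async def guarded_completion(prompt: str, model: str) -> str:
    """Check the prompt with the registered shield before running inference (illustrative)."""
    check = await client.safety.run_shield(
        shield_id="your-shield-id",
        messages=[Message(role="user", content=prompt)],
    )
    if check.violation:
        # Refuse unsafe input instead of forwarding it to the model.
        return f"Request blocked: {check.violation.user_message}"

    response = client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=50,
        stream=False,
    )
    return response.choices[0].text
```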

llama_stack/providers/remote/vector_io/pgvector/pgvector.py

```diff
@@ -10,6 +10,7 @@ from typing import Any
 import psycopg2
 from numpy.typing import NDArray
 from psycopg2 import sql
+from psycopg2.extensions import cursor
 from psycopg2.extras import Json, execute_values
 from pydantic import BaseModel, TypeAdapter
 
@@ -54,6 +55,17 @@ def check_extension_version(cur):
     return result[0] if result else None
 
 
+def create_vector_extension(cur: cursor) -> None:
+    try:
+        log.info("Vector extension not found, creating...")
+        cur.execute("CREATE EXTENSION vector;")
+        log.info("Vector extension created successfully")
+        log.info(f"Vector extension version: {check_extension_version(cur)}")
+
+    except psycopg2.Error as e:
+        raise RuntimeError(f"Failed to create vector extension for PGVector: {e}") from e
+
+
 def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
     with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
         query = sql.SQL(
@@ -364,7 +376,7 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
         if version:
             log.info(f"Vector extension version: {version}")
         else:
-
+            create_vector_extension(cur)
 
         cur.execute(
             """
```
llama_stack/providers/utils/inference/embedding_mixin.py

```diff
@@ -25,7 +25,8 @@ from llama_stack_api (
     OpenAIEmbeddingUsage,
 )
 
-EMBEDDING_MODELS = {}
+EMBEDDING_MODELS: dict[str, "SentenceTransformer"] = {}
+EMBEDDING_MODELS_LOCK = asyncio.Lock()
 
 DARWIN = "Darwin"
 
@@ -76,26 +77,29 @@ class SentenceTransformerEmbeddingMixin:
         )
 
     async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
-        global EMBEDDING_MODELS
-
         loaded_model = EMBEDDING_MODELS.get(model)
         if loaded_model is not None:
             return loaded_model
 
-
+        async with EMBEDDING_MODELS_LOCK:
+            loaded_model = EMBEDDING_MODELS.get(model)
+            if loaded_model is not None:
+                return loaded_model
+
+            log.info(f"Loading sentence transformer for {model}...")
 
-
-
+            def _load_model():
+                from sentence_transformers import SentenceTransformer
 
-
-
-
-
-
+                platform_name = platform.system()
+                if platform_name == DARWIN:
+                    # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
+                    # threads with the default parallel settings, so force a single-threaded CPU run.
+                    log.debug(f"Constraining torch threads on {platform_name} to a single worker")
+                    torch.set_num_threads(1)
 
-
+                return SentenceTransformer(model, trust_remote_code=True)
 
-
-
-
+            loaded_model = await asyncio.to_thread(_load_model)
+            EMBEDDING_MODELS[model] = loaded_model
+            return loaded_model
```
llama_stack/providers/utils/memory/openai_vector_store_mixin.py

```diff
@@ -122,6 +122,39 @@ class OpenAIVectorStoreMixin(ABC):
         # update in-memory cache
         self.openai_vector_stores[store_id] = store_info
 
+    async def _ensure_openai_metadata_exists(self, vector_store: VectorStore, name: str | None = None) -> None:
+        """
+        Ensure OpenAI-compatible metadata exists for a vector store.
+        """
+        if vector_store.identifier not in self.openai_vector_stores:
+            store_info = {
+                "id": vector_store.identifier,
+                "object": "vector_store",
+                "created_at": int(time.time()),
+                "name": name or vector_store.vector_store_name or vector_store.identifier,
+                "usage_bytes": 0,
+                "file_counts": VectorStoreFileCounts(
+                    cancelled=0,
+                    completed=0,
+                    failed=0,
+                    in_progress=0,
+                    total=0,
+                ).model_dump(),
+                "status": "completed",
+                "expires_after": None,
+                "expires_at": None,
+                "last_active_at": int(time.time()),
+                "file_ids": [],
+                "chunking_strategy": None,
+                "metadata": {
+                    "provider_id": vector_store.provider_id,
+                    "provider_vector_store_id": vector_store.provider_resource_id,
+                    "embedding_model": vector_store.embedding_model,
+                    "embedding_dimension": str(vector_store.embedding_dimension),
+                },
+            }
+            await self._save_openai_vector_store(vector_store.identifier, store_info)
+
     async def _load_openai_vector_stores(self) -> dict[str, dict[str, Any]]:
         """Load all vector store metadata from persistent storage."""
         assert self.kvstore
```
llama_stack/providers/utils/responses/responses_store.py

```diff
@@ -89,6 +89,40 @@ class ResponsesStore:
     ) -> None:
         await self._write_response_object(response_object, input, messages)
 
+    async def upsert_response_object(
+        self,
+        response_object: OpenAIResponseObject,
+        input: list[OpenAIResponseInput],
+        messages: list[OpenAIMessageParam],
+    ) -> None:
+        """Upsert response object using INSERT on first call, UPDATE on subsequent calls.
+
+        This method enables incremental persistence during streaming, allowing clients
+        to poll GET /v1/responses/{response_id} and see in-progress turn state.
+
+        :param response_object: The response object to store/update.
+        :param input: The input items for the response.
+        :param messages: The chat completion messages (for conversation continuity).
+        """
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
+        data = response_object.model_dump()
+        data["input"] = [input_item.model_dump() for input_item in input]
+        data["messages"] = [msg.model_dump() for msg in messages]
+
+        await self.sql_store.upsert(
+            table="openai_responses",
+            data={
+                "id": data["id"],
+                "created_at": data["created_at"],
+                "model": data["model"],
+                "response_object": data,
+            },
+            conflict_columns=["id"],
+            update_columns=["response_object"],
+        )
+
     async def _write_response_object(
         self,
         response_object: OpenAIResponseObject,
```