llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. llama_stack/core/library_client.py +80 -3
  2. llama_stack/core/routing_tables/common.py +11 -0
  3. llama_stack/core/routing_tables/vector_stores.py +4 -0
  4. llama_stack/core/stack.py +16 -1
  5. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  6. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  7. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  8. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  9. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  10. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  11. llama_stack/distributions/dell/doc_template.md +209 -0
  12. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  13. llama_stack/distributions/nvidia/doc_template.md +170 -0
  14. llama_stack/distributions/oci/doc_template.md +140 -0
  15. llama_stack/models/llama/llama3/dog.jpg +0 -0
  16. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  17. llama_stack/models/llama/resources/dog.jpg +0 -0
  18. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  19. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  20. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
  21. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  22. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  23. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  24. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  25. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  26. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  27. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  28. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  29. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  30. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  31. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  32. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  33. llama_stack/providers/remote/files/s3/README.md +266 -0
  34. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  35. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  36. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  37. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  38. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  39. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  40. llama_stack/providers/utils/responses/responses_store.py +34 -0
  41. llama_stack/providers/utils/tools/mcp.py +258 -16
  42. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  43. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
  44. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
  45. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  46. llama_stack-0.4.2.dist-info/top_level.txt +0 -2
  47. llama_stack_api/__init__.py +0 -945
  48. llama_stack_api/admin/__init__.py +0 -45
  49. llama_stack_api/admin/api.py +0 -72
  50. llama_stack_api/admin/fastapi_routes.py +0 -117
  51. llama_stack_api/admin/models.py +0 -113
  52. llama_stack_api/agents.py +0 -173
  53. llama_stack_api/batches/__init__.py +0 -40
  54. llama_stack_api/batches/api.py +0 -53
  55. llama_stack_api/batches/fastapi_routes.py +0 -113
  56. llama_stack_api/batches/models.py +0 -78
  57. llama_stack_api/benchmarks/__init__.py +0 -43
  58. llama_stack_api/benchmarks/api.py +0 -39
  59. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  60. llama_stack_api/benchmarks/models.py +0 -109
  61. llama_stack_api/common/__init__.py +0 -5
  62. llama_stack_api/common/content_types.py +0 -101
  63. llama_stack_api/common/errors.py +0 -95
  64. llama_stack_api/common/job_types.py +0 -38
  65. llama_stack_api/common/responses.py +0 -77
  66. llama_stack_api/common/training_types.py +0 -47
  67. llama_stack_api/common/type_system.py +0 -146
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/datasets/__init__.py +0 -61
  72. llama_stack_api/datasets/api.py +0 -35
  73. llama_stack_api/datasets/fastapi_routes.py +0 -104
  74. llama_stack_api/datasets/models.py +0 -152
  75. llama_stack_api/datatypes.py +0 -373
  76. llama_stack_api/eval.py +0 -137
  77. llama_stack_api/file_processors/__init__.py +0 -27
  78. llama_stack_api/file_processors/api.py +0 -64
  79. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  80. llama_stack_api/file_processors/models.py +0 -42
  81. llama_stack_api/files/__init__.py +0 -35
  82. llama_stack_api/files/api.py +0 -51
  83. llama_stack_api/files/fastapi_routes.py +0 -124
  84. llama_stack_api/files/models.py +0 -107
  85. llama_stack_api/inference.py +0 -1169
  86. llama_stack_api/inspect_api/__init__.py +0 -37
  87. llama_stack_api/inspect_api/api.py +0 -25
  88. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  89. llama_stack_api/inspect_api/models.py +0 -28
  90. llama_stack_api/internal/__init__.py +0 -9
  91. llama_stack_api/internal/kvstore.py +0 -26
  92. llama_stack_api/internal/sqlstore.py +0 -79
  93. llama_stack_api/llama_stack_api/__init__.py +0 -945
  94. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  95. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  96. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  97. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  98. llama_stack_api/llama_stack_api/agents.py +0 -173
  99. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  100. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  101. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  102. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  103. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  104. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  105. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  106. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  107. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  108. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  109. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  110. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  111. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  112. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  113. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  114. llama_stack_api/llama_stack_api/connectors.py +0 -146
  115. llama_stack_api/llama_stack_api/conversations.py +0 -270
  116. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  117. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  118. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  119. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  120. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  121. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  122. llama_stack_api/llama_stack_api/eval.py +0 -137
  123. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  124. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  125. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  126. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  127. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  128. llama_stack_api/llama_stack_api/files/api.py +0 -51
  129. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  130. llama_stack_api/llama_stack_api/files/models.py +0 -107
  131. llama_stack_api/llama_stack_api/inference.py +0 -1169
  132. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  133. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  134. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  135. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  136. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  137. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
  138. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
  139. llama_stack_api/llama_stack_api/models.py +0 -171
  140. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  141. llama_stack_api/llama_stack_api/post_training.py +0 -370
  142. llama_stack_api/llama_stack_api/prompts.py +0 -203
  143. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  144. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  145. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  146. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  147. llama_stack_api/llama_stack_api/py.typed +0 -0
  148. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  149. llama_stack_api/llama_stack_api/resource.py +0 -37
  150. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  151. llama_stack_api/llama_stack_api/safety.py +0 -132
  152. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  153. llama_stack_api/llama_stack_api/scoring.py +0 -93
  154. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  155. llama_stack_api/llama_stack_api/shields.py +0 -93
  156. llama_stack_api/llama_stack_api/tools.py +0 -226
  157. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  158. llama_stack_api/llama_stack_api/vector_stores.py +0 -51
  159. llama_stack_api/llama_stack_api/version.py +0 -9
  160. llama_stack_api/models.py +0 -171
  161. llama_stack_api/openai_responses.py +0 -1468
  162. llama_stack_api/post_training.py +0 -370
  163. llama_stack_api/prompts.py +0 -203
  164. llama_stack_api/providers/__init__.py +0 -33
  165. llama_stack_api/providers/api.py +0 -16
  166. llama_stack_api/providers/fastapi_routes.py +0 -57
  167. llama_stack_api/providers/models.py +0 -24
  168. llama_stack_api/py.typed +0 -0
  169. llama_stack_api/rag_tool.py +0 -168
  170. llama_stack_api/resource.py +0 -37
  171. llama_stack_api/router_utils.py +0 -160
  172. llama_stack_api/safety.py +0 -132
  173. llama_stack_api/schema_utils.py +0 -208
  174. llama_stack_api/scoring.py +0 -93
  175. llama_stack_api/scoring_functions.py +0 -211
  176. llama_stack_api/shields.py +0 -93
  177. llama_stack_api/tools.py +0 -226
  178. llama_stack_api/vector_io.py +0 -941
  179. llama_stack_api/vector_stores.py +0 -51
  180. llama_stack_api/version.py +0 -9
  181. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  182. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,151 @@
+ # NVIDIA Post-Training Provider for LlamaStack
+
+ This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service.
+
+ ## Features
+
+ - Supervised fine-tuning of Llama models
+ - LoRA fine-tuning support
+ - Job management and status tracking
+
+ ## Getting Started
+
+ ### Prerequisites
+
+ - LlamaStack with NVIDIA configuration
+ - Access to the hosted NVIDIA NeMo Customizer service
+ - Dataset registered in the hosted NVIDIA NeMo Customizer service
+ - Base model downloaded and available in the hosted NVIDIA NeMo Customizer service
+
+ ### Setup
+
+ Build the NVIDIA environment:
+
+ ```bash
+ uv pip install llama-stack-client
+ uv run llama stack list-deps nvidia | xargs -L1 uv pip install
+ ```
+
+ ### Basic Usage with the LlamaStack Python Client
+
+ ### Create Customization Job
+
+ #### Initialize the client
+
+ ```python
+ import os
+
+ os.environ["NVIDIA_API_KEY"] = "your-api-key"
+ os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
+ os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
+ os.environ["NVIDIA_PROJECT_ID"] = "test-project"
+ os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = "test-example-model@v1"
+
+ from llama_stack.core.library_client import LlamaStackAsLibraryClient
+
+ client = LlamaStackAsLibraryClient("nvidia")
+ client.initialize()
+ ```
+
+ #### Configure fine-tuning parameters
+
+ ```python
+ from llama_stack_client.types.post_training_supervised_fine_tune_params import (
+     TrainingConfig,
+     TrainingConfigDataConfig,
+     TrainingConfigOptimizerConfig,
+ )
+ from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
+ ```
+
+ #### Set up LoRA configuration
+
+ ```python
+ algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16)
+ ```
+
+ #### Configure training data
+
+ ```python
+ data_config = TrainingConfigDataConfig(
+     dataset_id="your-dataset-id",  # Use client.datasets.list() to see available datasets
+     batch_size=16,
+ )
+ ```
+
+ #### Configure optimizer
+
+ ```python
+ optimizer_config = TrainingConfigOptimizerConfig(
+     lr=0.0001,
+ )
+ ```
+
+ #### Set up training configuration
+
+ ```python
+ training_config = TrainingConfig(
+     n_epochs=2,
+     data_config=data_config,
+     optimizer_config=optimizer_config,
+ )
+ ```
+
+ #### Start fine-tuning job
+
+ ```python
+ training_job = client.post_training.supervised_fine_tune(
+     job_uuid="unique-job-id",
+     model="meta-llama/Llama-3.1-8B-Instruct",
+     checkpoint_dir="",
+     algorithm_config=algorithm_config,
+     training_config=training_config,
+     logger_config={},
+     hyperparam_search_config={},
+ )
+ ```
+
+ ### List all jobs
+
+ ```python
+ jobs = client.post_training.job.list()
+ ```
+
+ ### Check job status
+
+ ```python
+ job_status = client.post_training.job.status(job_uuid="your-job-id")
+ ```
+
+ ### Cancel a job
+
+ ```python
+ client.post_training.job.cancel(job_uuid="your-job-id")
+ ```
+
+ ### Inference with the fine-tuned model
+
+ #### 1. Register the model
+
+ ```python
+ from llama_stack_api.models import Model, ModelType
+
+ client.models.register(
+     model_id="test-example-model@v1",
+     provider_id="nvidia",
+     provider_model_id="test-example-model@v1",
+     model_type=ModelType.llm,
+ )
+ ```
+
+ #### 2. Inference with the fine-tuned model
+
+ ```python
+ response = client.completions.create(
+     prompt="Complete the sentence using one word: Roses are red, violets are ",
+     stream=False,
+     model="test-example-model@v1",
+     max_tokens=50,
+ )
+ print(response.choices[0].text)
+ ```
@@ -0,0 +1,78 @@
+ # NVIDIA Safety Provider for LlamaStack
+
+ This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.
+
+ ## Features
+
+ - Run safety checks for messages
+
+ ## Getting Started
+
+ ### Prerequisites
+
+ - LlamaStack with NVIDIA configuration
+ - Access to the NVIDIA NeMo Guardrails service
+ - A deployed NIM for the model used in safety checks
+
+ ### Setup
+
+ Build the NVIDIA environment:
+
+ ```bash
+ uv pip install llama-stack-client
+ uv run llama stack list-deps nvidia | xargs -L1 uv pip install
+ ```
+
+ ### Basic Usage with the LlamaStack Python Client
+
+ #### Initialize the client
+
+ ```python
+ import os
+
+ os.environ["NVIDIA_API_KEY"] = "your-api-key"
+ os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"
+
+ from llama_stack.core.library_client import LlamaStackAsLibraryClient
+
+ client = LlamaStackAsLibraryClient("nvidia")
+ client.initialize()
+ ```
+
+ #### Create a safety shield
+
+ ```python
+ from llama_stack_api.safety import Shield
+ from llama_stack_api.inference import Message
+
+ # Create a safety shield
+ shield = Shield(
+     shield_id="your-shield-id",
+     provider_resource_id="safety-model-id",  # The model to use for safety checks
+     description="Safety checks for content moderation",
+ )
+
+ # Register the shield
+ await client.safety.register_shield(shield)
+ ```
+
+ #### Run safety checks
+
+ ```python
+ # Messages to check
+ messages = [Message(role="user", content="Your message to check")]
+
+ # Run safety check
+ response = await client.safety.run_shield(
+     shield_id="your-shield-id",
+     messages=messages,
+ )
+
+ # Check for violations
+ if response.violation:
+     print(f"Safety violation detected: {response.violation.user_message}")
+     print(f"Violation level: {response.violation.violation_level}")
+     print(f"Metadata: {response.violation.metadata}")
+ else:
+     print("No safety violations detected")
+ ```
@@ -10,6 +10,7 @@ from typing import Any
  import psycopg2
  from numpy.typing import NDArray
  from psycopg2 import sql
+ from psycopg2.extensions import cursor
  from psycopg2.extras import Json, execute_values
  from pydantic import BaseModel, TypeAdapter
 
@@ -54,6 +55,17 @@ def check_extension_version(cur):
      return result[0] if result else None
 
 
+ def create_vector_extension(cur: cursor) -> None:
+     try:
+         log.info("Vector extension not found, creating...")
+         cur.execute("CREATE EXTENSION vector;")
+         log.info("Vector extension created successfully")
+         log.info(f"Vector extension version: {check_extension_version(cur)}")
+
+     except psycopg2.Error as e:
+         raise RuntimeError(f"Failed to create vector extension for PGVector: {e}") from e
+
+
  def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
      with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
          query = sql.SQL(
@@ -364,7 +376,7 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
              if version:
                  log.info(f"Vector extension version: {version}")
              else:
-                 raise RuntimeError("Vector extension is not installed.")
+                 create_vector_extension(cur)
 
              cur.execute(
                  """
@@ -25,7 +25,8 @@ from llama_stack_api import (
      OpenAIEmbeddingUsage,
  )
 
- EMBEDDING_MODELS = {}
+ EMBEDDING_MODELS: dict[str, "SentenceTransformer"] = {}
+ EMBEDDING_MODELS_LOCK = asyncio.Lock()
 
  DARWIN = "Darwin"
 
@@ -76,26 +77,29 @@ class SentenceTransformerEmbeddingMixin:
          )
 
      async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
-         global EMBEDDING_MODELS
-
          loaded_model = EMBEDDING_MODELS.get(model)
          if loaded_model is not None:
              return loaded_model
 
-         log.info(f"Loading sentence transformer for {model}...")
+         async with EMBEDDING_MODELS_LOCK:
+             loaded_model = EMBEDDING_MODELS.get(model)
+             if loaded_model is not None:
+                 return loaded_model
+
+             log.info(f"Loading sentence transformer for {model}...")
 
-         def _load_model():
-             from sentence_transformers import SentenceTransformer
+             def _load_model():
+                 from sentence_transformers import SentenceTransformer
 
-             platform_name = platform.system()
-             if platform_name == DARWIN:
-                 # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
-                 # threads with the default parallel settings, so force a single-threaded CPU run.
-                 log.debug(f"Constraining torch threads on {platform_name} to a single worker")
-                 torch.set_num_threads(1)
+                 platform_name = platform.system()
+                 if platform_name == DARWIN:
+                     # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
+                     # threads with the default parallel settings, so force a single-threaded CPU run.
+                     log.debug(f"Constraining torch threads on {platform_name} to a single worker")
+                     torch.set_num_threads(1)
 
-             return SentenceTransformer(model, trust_remote_code=True)
+                 return SentenceTransformer(model, trust_remote_code=True)
 
-         loaded_model = await asyncio.to_thread(_load_model)
-         EMBEDDING_MODELS[model] = loaded_model
-         return loaded_model
+             loaded_model = await asyncio.to_thread(_load_model)
+             EMBEDDING_MODELS[model] = loaded_model
+             return loaded_model
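
The embedding mixin diff above replaces the unguarded module-level cache with a double-checked pattern: a lock-free fast path for models that are already loaded, then a re-check under a module-level `asyncio.Lock` so concurrent requests for the same model trigger exactly one load. A self-contained sketch of that pattern (the loader and names are stand-ins, not the provider's code):

```python
import asyncio
import time

_CACHE: dict[str, object] = {}
_LOCK = asyncio.Lock()
_LOAD_COUNT = 0


def _expensive_load(name: str) -> object:
    # Stand-in for SentenceTransformer(name): slow, CPU-bound work kept off the event loop.
    global _LOAD_COUNT
    _LOAD_COUNT += 1
    time.sleep(0.2)
    return object()


async def get_model(name: str) -> object:
    # Fast path: skip the lock entirely when the model is already cached.
    cached = _CACHE.get(name)
    if cached is not None:
        return cached
    async with _LOCK:
        # Double-check inside the lock so concurrent callers load the model only once.
        cached = _CACHE.get(name)
        if cached is not None:
            return cached
        loaded = await asyncio.to_thread(_expensive_load, name)
        _CACHE[name] = loaded
        return loaded


async def main() -> None:
    await asyncio.gather(*(get_model("all-MiniLM-L6-v2") for _ in range(10)))
    print(f"loads performed: {_LOAD_COUNT}")  # -> 1


asyncio.run(main())
```

Because the actual load runs via `asyncio.to_thread`, the event loop stays responsive while the first caller waits; everyone after that hits the fast path.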
@@ -122,6 +122,39 @@ class OpenAIVectorStoreMixin(ABC):
          # update in-memory cache
          self.openai_vector_stores[store_id] = store_info
 
+     async def _ensure_openai_metadata_exists(self, vector_store: VectorStore, name: str | None = None) -> None:
+         """
+         Ensure OpenAI-compatible metadata exists for a vector store.
+         """
+         if vector_store.identifier not in self.openai_vector_stores:
+             store_info = {
+                 "id": vector_store.identifier,
+                 "object": "vector_store",
+                 "created_at": int(time.time()),
+                 "name": name or vector_store.vector_store_name or vector_store.identifier,
+                 "usage_bytes": 0,
+                 "file_counts": VectorStoreFileCounts(
+                     cancelled=0,
+                     completed=0,
+                     failed=0,
+                     in_progress=0,
+                     total=0,
+                 ).model_dump(),
+                 "status": "completed",
+                 "expires_after": None,
+                 "expires_at": None,
+                 "last_active_at": int(time.time()),
+                 "file_ids": [],
+                 "chunking_strategy": None,
+                 "metadata": {
+                     "provider_id": vector_store.provider_id,
+                     "provider_vector_store_id": vector_store.provider_resource_id,
+                     "embedding_model": vector_store.embedding_model,
+                     "embedding_dimension": str(vector_store.embedding_dimension),
+                 },
+             }
+             await self._save_openai_vector_store(vector_store.identifier, store_info)
+
      async def _load_openai_vector_stores(self) -> dict[str, dict[str, Any]]:
          """Load all vector store metadata from persistent storage."""
          assert self.kvstore
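
The new `_ensure_openai_metadata_exists` helper backfills an OpenAI-style `vector_store` record, keyed by the store's identifier, for stores registered without one, persisting it and keeping the in-memory cache in sync. A toy sketch of the same backfill pattern over a dict-backed store (class and field names here are illustrative, not the mixin's API):

```python
import asyncio
import time
from typing import Any


class MetadataBackfill:
    def __init__(self) -> None:
        self.cache: dict[str, dict[str, Any]] = {}    # in-memory view
        self.kvstore: dict[str, dict[str, Any]] = {}  # stand-in for persistent storage

    async def save(self, store_id: str, info: dict[str, Any]) -> None:
        self.kvstore[store_id] = info
        self.cache[store_id] = info

    async def ensure_metadata(self, store_id: str, name: str | None = None) -> None:
        # Create a default record only when one does not already exist.
        if store_id in self.cache:
            return
        info = {
            "id": store_id,
            "object": "vector_store",
            "created_at": int(time.time()),
            "name": name or store_id,
            "file_counts": {"cancelled": 0, "completed": 0, "failed": 0, "in_progress": 0, "total": 0},
            "status": "completed",
            "file_ids": [],
        }
        await self.save(store_id, info)


async def main() -> None:
    backfill = MetadataBackfill()
    await backfill.ensure_metadata("vs_legacy_store")
    print(backfill.kvstore["vs_legacy_store"]["status"])  # -> completed


asyncio.run(main())
```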
@@ -89,6 +89,40 @@ class ResponsesStore:
      ) -> None:
          await self._write_response_object(response_object, input, messages)
 
+     async def upsert_response_object(
+         self,
+         response_object: OpenAIResponseObject,
+         input: list[OpenAIResponseInput],
+         messages: list[OpenAIMessageParam],
+     ) -> None:
+         """Upsert response object using INSERT on first call, UPDATE on subsequent calls.
+
+         This method enables incremental persistence during streaming, allowing clients
+         to poll GET /v1/responses/{response_id} and see in-progress turn state.
+
+         :param response_object: The response object to store/update.
+         :param input: The input items for the response.
+         :param messages: The chat completion messages (for conversation continuity).
+         """
+         if self.sql_store is None:
+             raise ValueError("Responses store is not initialized")
+
+         data = response_object.model_dump()
+         data["input"] = [input_item.model_dump() for input_item in input]
+         data["messages"] = [msg.model_dump() for msg in messages]
+
+         await self.sql_store.upsert(
+             table="openai_responses",
+             data={
+                 "id": data["id"],
+                 "created_at": data["created_at"],
+                 "model": data["model"],
+                 "response_object": data,
+             },
+             conflict_columns=["id"],
+             update_columns=["response_object"],
+         )
+
      async def _write_response_object(
          self,
          response_object: OpenAIResponseObject,
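
As the docstring notes, upserting on every step means a client can poll the response while the turn is still streaming. A minimal polling sketch against the `GET /v1/responses/{response_id}` endpoint mentioned above (the base URL, response id, and the `in_progress` status value are assumptions about a running Llama Stack server, not guarantees from this diff):

```python
import time

import httpx

BASE_URL = "http://localhost:8321"  # assumed server address
response_id = "resp_123"            # placeholder id of a previously created response

with httpx.Client(base_url=BASE_URL) as client:
    while True:
        r = client.get(f"/v1/responses/{response_id}")
        r.raise_for_status()
        body = r.json()
        print(body.get("status"))
        # Keep polling while the turn is still being written incrementally.
        if body.get("status") == "in_progress":
            time.sleep(1.0)
            continue
        break
```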