llama-stack-api 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. llama_stack_api/__init__.py +1100 -0
  2. llama_stack_api/admin/__init__.py +45 -0
  3. llama_stack_api/admin/api.py +72 -0
  4. llama_stack_api/admin/fastapi_routes.py +117 -0
  5. llama_stack_api/admin/models.py +113 -0
  6. llama_stack_api/agents/__init__.py +38 -0
  7. llama_stack_api/agents/api.py +52 -0
  8. llama_stack_api/agents/fastapi_routes.py +268 -0
  9. llama_stack_api/agents/models.py +181 -0
  10. llama_stack_api/batches/__init__.py +40 -0
  11. llama_stack_api/batches/api.py +53 -0
  12. llama_stack_api/batches/fastapi_routes.py +113 -0
  13. llama_stack_api/batches/models.py +78 -0
  14. llama_stack_api/benchmarks/__init__.py +43 -0
  15. llama_stack_api/benchmarks/api.py +39 -0
  16. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  17. llama_stack_api/benchmarks/models.py +109 -0
  18. llama_stack_api/common/__init__.py +5 -0
  19. llama_stack_api/common/content_types.py +101 -0
  20. llama_stack_api/common/errors.py +110 -0
  21. llama_stack_api/common/job_types.py +38 -0
  22. llama_stack_api/common/responses.py +77 -0
  23. llama_stack_api/common/training_types.py +47 -0
  24. llama_stack_api/common/type_system.py +146 -0
  25. llama_stack_api/connectors/__init__.py +38 -0
  26. llama_stack_api/connectors/api.py +50 -0
  27. llama_stack_api/connectors/fastapi_routes.py +103 -0
  28. llama_stack_api/connectors/models.py +103 -0
  29. llama_stack_api/conversations/__init__.py +61 -0
  30. llama_stack_api/conversations/api.py +44 -0
  31. llama_stack_api/conversations/fastapi_routes.py +177 -0
  32. llama_stack_api/conversations/models.py +245 -0
  33. llama_stack_api/datasetio/__init__.py +34 -0
  34. llama_stack_api/datasetio/api.py +42 -0
  35. llama_stack_api/datasetio/fastapi_routes.py +94 -0
  36. llama_stack_api/datasetio/models.py +48 -0
  37. llama_stack_api/datasets/__init__.py +61 -0
  38. llama_stack_api/datasets/api.py +35 -0
  39. llama_stack_api/datasets/fastapi_routes.py +104 -0
  40. llama_stack_api/datasets/models.py +152 -0
  41. llama_stack_api/datatypes.py +373 -0
  42. llama_stack_api/eval/__init__.py +55 -0
  43. llama_stack_api/eval/api.py +51 -0
  44. llama_stack_api/eval/compat.py +300 -0
  45. llama_stack_api/eval/fastapi_routes.py +126 -0
  46. llama_stack_api/eval/models.py +141 -0
  47. llama_stack_api/file_processors/__init__.py +27 -0
  48. llama_stack_api/file_processors/api.py +64 -0
  49. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  50. llama_stack_api/file_processors/models.py +42 -0
  51. llama_stack_api/files/__init__.py +35 -0
  52. llama_stack_api/files/api.py +51 -0
  53. llama_stack_api/files/fastapi_routes.py +124 -0
  54. llama_stack_api/files/models.py +107 -0
  55. llama_stack_api/inference/__init__.py +207 -0
  56. llama_stack_api/inference/api.py +93 -0
  57. llama_stack_api/inference/fastapi_routes.py +243 -0
  58. llama_stack_api/inference/models.py +1035 -0
  59. llama_stack_api/inspect_api/__init__.py +37 -0
  60. llama_stack_api/inspect_api/api.py +25 -0
  61. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  62. llama_stack_api/inspect_api/models.py +28 -0
  63. llama_stack_api/internal/__init__.py +9 -0
  64. llama_stack_api/internal/kvstore.py +28 -0
  65. llama_stack_api/internal/sqlstore.py +81 -0
  66. llama_stack_api/models/__init__.py +47 -0
  67. llama_stack_api/models/api.py +38 -0
  68. llama_stack_api/models/fastapi_routes.py +104 -0
  69. llama_stack_api/models/models.py +157 -0
  70. llama_stack_api/openai_responses.py +1494 -0
  71. llama_stack_api/post_training/__init__.py +73 -0
  72. llama_stack_api/post_training/api.py +36 -0
  73. llama_stack_api/post_training/fastapi_routes.py +116 -0
  74. llama_stack_api/post_training/models.py +339 -0
  75. llama_stack_api/prompts/__init__.py +47 -0
  76. llama_stack_api/prompts/api.py +44 -0
  77. llama_stack_api/prompts/fastapi_routes.py +163 -0
  78. llama_stack_api/prompts/models.py +177 -0
  79. llama_stack_api/providers/__init__.py +33 -0
  80. llama_stack_api/providers/api.py +16 -0
  81. llama_stack_api/providers/fastapi_routes.py +57 -0
  82. llama_stack_api/providers/models.py +24 -0
  83. llama_stack_api/rag_tool.py +168 -0
  84. llama_stack_api/resource.py +36 -0
  85. llama_stack_api/router_utils.py +160 -0
  86. llama_stack_api/safety/__init__.py +37 -0
  87. llama_stack_api/safety/api.py +29 -0
  88. llama_stack_api/safety/datatypes.py +83 -0
  89. llama_stack_api/safety/fastapi_routes.py +55 -0
  90. llama_stack_api/safety/models.py +38 -0
  91. llama_stack_api/schema_utils.py +251 -0
  92. llama_stack_api/scoring/__init__.py +66 -0
  93. llama_stack_api/scoring/api.py +35 -0
  94. llama_stack_api/scoring/fastapi_routes.py +67 -0
  95. llama_stack_api/scoring/models.py +81 -0
  96. llama_stack_api/scoring_functions/__init__.py +50 -0
  97. llama_stack_api/scoring_functions/api.py +39 -0
  98. llama_stack_api/scoring_functions/fastapi_routes.py +108 -0
  99. llama_stack_api/scoring_functions/models.py +214 -0
  100. llama_stack_api/shields/__init__.py +41 -0
  101. llama_stack_api/shields/api.py +39 -0
  102. llama_stack_api/shields/fastapi_routes.py +104 -0
  103. llama_stack_api/shields/models.py +74 -0
  104. llama_stack_api/tools.py +226 -0
  105. llama_stack_api/validators.py +46 -0
  106. llama_stack_api/vector_io/__init__.py +88 -0
  107. llama_stack_api/vector_io/api.py +234 -0
  108. llama_stack_api/vector_io/fastapi_routes.py +447 -0
  109. llama_stack_api/vector_io/models.py +663 -0
  110. llama_stack_api/vector_stores.py +53 -0
  111. llama_stack_api/version.py +9 -0
  112. {llama_stack_api-0.4.3.dist-info → llama_stack_api-0.5.0rc1.dist-info}/METADATA +1 -1
  113. llama_stack_api-0.5.0rc1.dist-info/RECORD +115 -0
  114. llama_stack_api-0.5.0rc1.dist-info/top_level.txt +1 -0
  115. llama_stack_api-0.4.3.dist-info/RECORD +0 -4
  116. llama_stack_api-0.4.3.dist-info/top_level.txt +0 -1
  117. {llama_stack_api-0.4.3.dist-info → llama_stack_api-0.5.0rc1.dist-info}/WHEEL +0 -0
llama_stack_api/inference/api.py (new file, +93)
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+from typing import Protocol, runtime_checkable
+
+from llama_stack_api.models import Model
+
+from .models import (
+    GetChatCompletionRequest,
+    ListChatCompletionsRequest,
+    ListOpenAIChatCompletionResponse,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletion,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAICompletionWithInputMessages,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIEmbeddingsResponse,
+    RerankRequest,
+    RerankResponse,
+)
+
+
+class ModelStore(Protocol):
+    async def get_model(self, identifier: str) -> Model: ...
+
+
+@runtime_checkable
+class InferenceProvider(Protocol):
+    """
+    This protocol defines the interface that should be implemented by all inference providers.
+    """
+
+    API_NAMESPACE: str = "Inference"
+
+    model_store: ModelStore | None = None
+
+    async def rerank(
+        self,
+        request: RerankRequest,
+    ) -> RerankResponse:
+        """Rerank a list of documents based on their relevance to a query."""
+        raise NotImplementedError("Reranking is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
+
+    async def openai_completion(
+        self,
+        params: OpenAICompletionRequestWithExtraBody,
+    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
+        """Generate an OpenAI-compatible completion for the given prompt using the specified model."""
+        ...
+
+    async def openai_chat_completion(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        """Generate an OpenAI-compatible chat completion for the given messages using the specified model."""
+        ...
+
+    async def openai_embeddings(
+        self,
+        params: OpenAIEmbeddingsRequestWithExtraBody,
+    ) -> OpenAIEmbeddingsResponse:
+        """Generate OpenAI-compatible embeddings for the given input using the specified model."""
+        ...
+
+
+class Inference(InferenceProvider):
+    """Inference
+
+    Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Three kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    - Rerank models: these models reorder the documents based on their relevance to a query.
+    """
+
+    async def list_chat_completions(
+        self,
+        request: ListChatCompletionsRequest,
+    ) -> ListOpenAIChatCompletionResponse:
+        """List stored chat completions."""
+        raise NotImplementedError("List chat completions is not implemented")
+
+    async def get_chat_completion(self, request: GetChatCompletionRequest) -> OpenAICompletionWithInputMessages:
+        """Retrieve a stored chat completion by its ID."""
+        raise NotImplementedError("Get chat completion is not implemented")
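
InferenceProvider is a runtime_checkable Protocol, so a provider does not need to inherit from anything in this package; it only needs to expose the same attributes and async methods. The following is a minimal sketch, not part of the diff, assuming the hunk above is llama_stack_api/inference/api.py (the +93 entry in the file list); the StubProvider name and its NotImplementedError bodies are illustrative placeholders.

from llama_stack_api.inference.api import InferenceProvider


class StubProvider:
    # Structural typing: no subclassing of InferenceProvider is required.
    API_NAMESPACE = "Inference"
    model_store = None

    async def rerank(self, request):
        raise NotImplementedError("stub: no rerank model wired up")

    async def openai_completion(self, params):
        raise NotImplementedError("stub: no completion model wired up")

    async def openai_chat_completion(self, params):
        raise NotImplementedError("stub: no chat model wired up")

    async def openai_embeddings(self, params):
        raise NotImplementedError("stub: no embedding model wired up")


# runtime_checkable protocols only verify that the members exist, not their signatures or types.
assert isinstance(StubProvider(), InferenceProvider)
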
llama_stack_api/inference/fastapi_routes.py (new file, +243)
@@ -0,0 +1,243 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""FastAPI router for the Inference API.
+
+This module defines the FastAPI router for the Inference API using standard
+FastAPI route decorators. The router is defined in the API package to keep
+all API-related code together.
+"""
+
+import asyncio
+import contextvars
+import json
+import logging  # allow-direct-logging
+from collections.abc import AsyncIterator
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Body, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+
+from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
+from llama_stack_api.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+
+from .api import Inference
+from .models import (
+    GetChatCompletionRequest,
+    ListChatCompletionsRequest,
+    ListOpenAIChatCompletionResponse,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletion,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAICompletionWithInputMessages,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIEmbeddingsResponse,
+    RerankRequest,
+    RerankResponse,
+)
+
+logger = logging.LoggerAdapter(logging.getLogger(__name__), {"category": "inference"})
+
+
+def _create_sse_event(data: Any) -> str:
+    """Create a Server-Sent Event string from data."""
+    if isinstance(data, BaseModel):
+        data = data.model_dump_json()
+    else:
+        data = json.dumps(data)
+    return f"data: {data}\n\n"
+
+
+async def _sse_generator(event_gen: AsyncIterator[Any], context: str = "inference") -> AsyncIterator[str]:
+    """Convert an async generator to SSE format."""
+    try:
+        async for item in event_gen:
+            yield _create_sse_event(item)
+    except asyncio.CancelledError:
+        if hasattr(event_gen, "aclose"):
+            await event_gen.aclose()
+        raise
+    except Exception as e:
+        logger.exception(f"Error in SSE generator ({context})")
+        exc = _http_exception_from_sse_error(e)
+        yield _create_sse_event({"error": {"status_code": exc.status_code, "message": exc.detail}})
+
+
+def _http_exception_from_value_error(exc: ValueError) -> HTTPException:
+    """Convert a ValueError to an HTTPException."""
+    detail = str(exc) or "Invalid value"
+    return HTTPException(status_code=400, detail=detail)
+
+
+def _http_exception_from_sse_error(exc: Exception) -> HTTPException:
+    """Convert an exception to an HTTPException."""
+    if isinstance(exc, HTTPException):
+        return exc
+    if isinstance(exc, ValueError):
+        return _http_exception_from_value_error(exc)
+    status_code = getattr(exc, "status_code", None)
+    if isinstance(status_code, int):
+        return HTTPException(status_code=status_code, detail=str(exc))
+    return HTTPException(status_code=500, detail="Internal server error: An unexpected error occurred.")
+
+
+def _preserve_context_for_sse(event_gen):
+    """Preserve request context for SSE streaming.
+
+    StreamingResponse runs in a different task, losing request contextvars.
+    This wrapper captures and restores the context.
+    """
+    context = contextvars.copy_context()
+
+    async def wrapper():
+        try:
+            while True:
+                try:
+                    task = context.run(asyncio.create_task, event_gen.__anext__())
+                    item = await task
+                except StopAsyncIteration:
+                    break
+                yield item
+        except (asyncio.CancelledError, GeneratorExit):
+            if hasattr(event_gen, "aclose"):
+                await event_gen.aclose()
+            raise
+
+    return wrapper()
+
+
+# Automatically generate dependency functions from Pydantic models
+# This ensures the models are the single source of truth for descriptions
+get_list_chat_completions_request = create_query_dependency(ListChatCompletionsRequest)
+get_chat_completion_request = create_path_dependency(GetChatCompletionRequest)
+
+
+def create_router(impl: Inference) -> APIRouter:
+    """Create a FastAPI router for the Inference API.
+
+    Args:
+        impl: The Inference implementation instance
+
+    Returns:
+        APIRouter configured for the Inference API
+    """
+    # Use no prefix - specify full paths for each route to support both v1 and v1alpha endpoints
+    router = APIRouter(
+        tags=["Inference"],
+        responses=standard_responses,
+    )
+
+    @router.post(
+        f"/{LLAMA_STACK_API_V1}/chat/completions",
+        response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
+        summary="Create chat completions.",
+        description="Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
+        responses={
+            200: {
+                "description": "An OpenAIChatCompletion. When streaming, returns Server-Sent Events (SSE) with OpenAIChatCompletionChunk objects.",
+                "content": {
+                    "application/json": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletion"}},
+                    "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletionChunk"}},
+                },
+            },
+        },
+    )
+    async def openai_chat_completion(
+        params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)],
+    ) -> OpenAIChatCompletion | StreamingResponse:
+        result = await impl.openai_chat_completion(params)
+        if isinstance(result, AsyncIterator):
+            return StreamingResponse(
+                _preserve_context_for_sse(_sse_generator(result, context="chat_completion")),
+                media_type="text/event-stream",
+            )
+        return result
+
+    @router.get(
+        f"/{LLAMA_STACK_API_V1}/chat/completions",
+        response_model=ListOpenAIChatCompletionResponse,
+        summary="List chat completions.",
+        description="List chat completions.",
+        responses={
+            200: {"description": "A ListOpenAIChatCompletionResponse."},
+        },
+    )
+    async def list_chat_completions(
+        request: Annotated[ListChatCompletionsRequest, Depends(get_list_chat_completions_request)],
+    ) -> ListOpenAIChatCompletionResponse:
+        return await impl.list_chat_completions(request)
+
+    @router.get(
+        f"/{LLAMA_STACK_API_V1}/chat/completions/{{completion_id}}",
+        response_model=OpenAICompletionWithInputMessages,
+        summary="Get chat completion.",
+        description="Describe a chat completion by its ID.",
+        responses={
+            200: {"description": "A OpenAICompletionWithInputMessages."},
+        },
+    )
+    async def get_chat_completion(
+        request: Annotated[GetChatCompletionRequest, Depends(get_chat_completion_request)],
+    ) -> OpenAICompletionWithInputMessages:
+        return await impl.get_chat_completion(request)
+
+    @router.post(
+        f"/{LLAMA_STACK_API_V1}/completions",
+        response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
+        summary="Create completion.",
+        description="Generate an OpenAI-compatible completion for the given prompt using the specified model.",
+        responses={
+            200: {
+                "description": "An OpenAICompletion. When streaming, returns Server-Sent Events (SSE) with OpenAICompletion chunks.",
+                "content": {
+                    "application/json": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
+                    "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
+                },
+            },
+        },
+    )
+    async def openai_completion(
+        params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
+    ) -> OpenAICompletion | StreamingResponse:
+        result = await impl.openai_completion(params)
+        if isinstance(result, AsyncIterator):
+            return StreamingResponse(
+                _preserve_context_for_sse(_sse_generator(result, context="completion")),
+                media_type="text/event-stream",
+            )
+        return result
+
+    @router.post(
+        f"/{LLAMA_STACK_API_V1}/embeddings",
+        response_model=OpenAIEmbeddingsResponse,
+        summary="Create embeddings.",
+        description="Generate OpenAI-compatible embeddings for the given input using the specified model.",
+        responses={
+            200: {"description": "An OpenAIEmbeddingsResponse containing the embeddings."},
+        },
+    )
+    async def openai_embeddings(
+        params: Annotated[OpenAIEmbeddingsRequestWithExtraBody, Body(...)],
+    ) -> OpenAIEmbeddingsResponse:
+        return await impl.openai_embeddings(params)
+
+    @router.post(
+        f"/{LLAMA_STACK_API_V1ALPHA}/inference/rerank",
+        response_model=RerankResponse,
+        summary="Rerank documents based on relevance to a query.",
+        description="Rerank a list of documents based on their relevance to a query.",
+        responses={
+            200: {"description": "RerankResponse with indices sorted by relevance score (descending)."},
+        },
+    )
+    async def rerank(
+        request: Annotated[RerankRequest, Body(...)],
+    ) -> RerankResponse:
+        return await impl.rerank(request)
+
+    return router
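
create_router only needs an object that implements the Inference surface; the SSE framing, error mapping, and context preservation are handled inside the router itself. Below is a hedged wiring sketch, not part of the diff, assuming the package is installed and that the hunk above is llama_stack_api/inference/fastapi_routes.py (the +243 entry in the file list); the PlaceholderInference class, app title, and module name are illustrative placeholders.

from fastapi import FastAPI

from llama_stack_api.inference.api import Inference
from llama_stack_api.inference.fastapi_routes import create_router


class PlaceholderInference(Inference):
    # Inherits the stub bodies defined on the protocol; a real provider would
    # override openai_chat_completion, openai_completion, openai_embeddings, and rerank.
    pass


app = FastAPI(title="Inference demo")
app.include_router(create_router(PlaceholderInference()))

# Run with e.g. `uvicorn demo:app` if this sketch is saved as demo.py (assumes uvicorn is installed).
# The chat-completions route returns JSON for non-streaming requests and switches to
# text/event-stream (SSE) whenever the provider yields an async iterator of chunks.
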