llama-stack-api 0.4.4__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. llama_stack_api/__init__.py +175 -20
  2. llama_stack_api/agents/__init__.py +38 -0
  3. llama_stack_api/agents/api.py +52 -0
  4. llama_stack_api/agents/fastapi_routes.py +268 -0
  5. llama_stack_api/agents/models.py +181 -0
  6. llama_stack_api/common/errors.py +15 -0
  7. llama_stack_api/connectors/__init__.py +38 -0
  8. llama_stack_api/connectors/api.py +50 -0
  9. llama_stack_api/connectors/fastapi_routes.py +103 -0
  10. llama_stack_api/connectors/models.py +103 -0
  11. llama_stack_api/conversations/__init__.py +61 -0
  12. llama_stack_api/conversations/api.py +44 -0
  13. llama_stack_api/conversations/fastapi_routes.py +177 -0
  14. llama_stack_api/conversations/models.py +245 -0
  15. llama_stack_api/datasetio/__init__.py +34 -0
  16. llama_stack_api/datasetio/api.py +42 -0
  17. llama_stack_api/datasetio/fastapi_routes.py +94 -0
  18. llama_stack_api/datasetio/models.py +48 -0
  19. llama_stack_api/eval/__init__.py +55 -0
  20. llama_stack_api/eval/api.py +51 -0
  21. llama_stack_api/eval/compat.py +300 -0
  22. llama_stack_api/eval/fastapi_routes.py +126 -0
  23. llama_stack_api/eval/models.py +141 -0
  24. llama_stack_api/inference/__init__.py +207 -0
  25. llama_stack_api/inference/api.py +93 -0
  26. llama_stack_api/inference/fastapi_routes.py +243 -0
  27. llama_stack_api/inference/models.py +1035 -0
  28. llama_stack_api/models/__init__.py +47 -0
  29. llama_stack_api/models/api.py +38 -0
  30. llama_stack_api/models/fastapi_routes.py +104 -0
  31. llama_stack_api/{models.py → models/models.py} +65 -79
  32. llama_stack_api/openai_responses.py +32 -6
  33. llama_stack_api/post_training/__init__.py +73 -0
  34. llama_stack_api/post_training/api.py +36 -0
  35. llama_stack_api/post_training/fastapi_routes.py +116 -0
  36. llama_stack_api/{post_training.py → post_training/models.py} +55 -86
  37. llama_stack_api/prompts/__init__.py +47 -0
  38. llama_stack_api/prompts/api.py +44 -0
  39. llama_stack_api/prompts/fastapi_routes.py +163 -0
  40. llama_stack_api/prompts/models.py +177 -0
  41. llama_stack_api/resource.py +0 -1
  42. llama_stack_api/safety/__init__.py +37 -0
  43. llama_stack_api/safety/api.py +29 -0
  44. llama_stack_api/safety/datatypes.py +83 -0
  45. llama_stack_api/safety/fastapi_routes.py +55 -0
  46. llama_stack_api/safety/models.py +38 -0
  47. llama_stack_api/schema_utils.py +47 -4
  48. llama_stack_api/scoring/__init__.py +66 -0
  49. llama_stack_api/scoring/api.py +35 -0
  50. llama_stack_api/scoring/fastapi_routes.py +67 -0
  51. llama_stack_api/scoring/models.py +81 -0
  52. llama_stack_api/scoring_functions/__init__.py +50 -0
  53. llama_stack_api/scoring_functions/api.py +39 -0
  54. llama_stack_api/scoring_functions/fastapi_routes.py +108 -0
  55. llama_stack_api/{scoring_functions.py → scoring_functions/models.py} +67 -64
  56. llama_stack_api/shields/__init__.py +41 -0
  57. llama_stack_api/shields/api.py +39 -0
  58. llama_stack_api/shields/fastapi_routes.py +104 -0
  59. llama_stack_api/shields/models.py +74 -0
  60. llama_stack_api/validators.py +46 -0
  61. llama_stack_api/vector_io/__init__.py +88 -0
  62. llama_stack_api/vector_io/api.py +234 -0
  63. llama_stack_api/vector_io/fastapi_routes.py +447 -0
  64. llama_stack_api/{vector_io.py → vector_io/models.py} +99 -377
  65. {llama_stack_api-0.4.4.dist-info → llama_stack_api-0.5.0rc1.dist-info}/METADATA +1 -1
  66. llama_stack_api-0.5.0rc1.dist-info/RECORD +115 -0
  67. llama_stack_api/agents.py +0 -173
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/eval.py +0 -137
  72. llama_stack_api/inference.py +0 -1169
  73. llama_stack_api/prompts.py +0 -203
  74. llama_stack_api/safety.py +0 -132
  75. llama_stack_api/scoring.py +0 -93
  76. llama_stack_api/shields.py +0 -93
  77. llama_stack_api-0.4.4.dist-info/RECORD +0 -70
  78. {llama_stack_api-0.4.4.dist-info → llama_stack_api-0.5.0rc1.dist-info}/WHEEL +0 -0
  79. {llama_stack_api-0.4.4.dist-info → llama_stack_api-0.5.0rc1.dist-info}/top_level.txt +0 -0
llama_stack_api/inference/fastapi_routes.py
@@ -0,0 +1,243 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the terms described in the LICENSE file in
+ # the root directory of this source tree.
+
+ """FastAPI router for the Inference API.
+
+ This module defines the FastAPI router for the Inference API using standard
+ FastAPI route decorators. The router is defined in the API package to keep
+ all API-related code together.
+ """
+
+ import asyncio
+ import contextvars
+ import json
+ import logging  # allow-direct-logging
+ from collections.abc import AsyncIterator
+ from typing import Annotated, Any
+
+ from fastapi import APIRouter, Body, Depends, HTTPException
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+
+ from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
+ from llama_stack_api.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+
+ from .api import Inference
+ from .models import (
+     GetChatCompletionRequest,
+     ListChatCompletionsRequest,
+     ListOpenAIChatCompletionResponse,
+     OpenAIChatCompletion,
+     OpenAIChatCompletionRequestWithExtraBody,
+     OpenAICompletion,
+     OpenAICompletionRequestWithExtraBody,
+     OpenAICompletionWithInputMessages,
+     OpenAIEmbeddingsRequestWithExtraBody,
+     OpenAIEmbeddingsResponse,
+     RerankRequest,
+     RerankResponse,
+ )
+
+ logger = logging.LoggerAdapter(logging.getLogger(__name__), {"category": "inference"})
+
+
+ def _create_sse_event(data: Any) -> str:
+     """Create a Server-Sent Event string from data."""
+     if isinstance(data, BaseModel):
+         data = data.model_dump_json()
+     else:
+         data = json.dumps(data)
+     return f"data: {data}\n\n"
+
+
+ async def _sse_generator(event_gen: AsyncIterator[Any], context: str = "inference") -> AsyncIterator[str]:
+     """Convert an async generator to SSE format."""
+     try:
+         async for item in event_gen:
+             yield _create_sse_event(item)
+     except asyncio.CancelledError:
+         if hasattr(event_gen, "aclose"):
+             await event_gen.aclose()
+         raise
+     except Exception as e:
+         logger.exception(f"Error in SSE generator ({context})")
+         exc = _http_exception_from_sse_error(e)
+         yield _create_sse_event({"error": {"status_code": exc.status_code, "message": exc.detail}})
+
+
+ def _http_exception_from_value_error(exc: ValueError) -> HTTPException:
+     """Convert a ValueError to an HTTPException."""
+     detail = str(exc) or "Invalid value"
+     return HTTPException(status_code=400, detail=detail)
+
+
+ def _http_exception_from_sse_error(exc: Exception) -> HTTPException:
+     """Convert an exception to an HTTPException."""
+     if isinstance(exc, HTTPException):
+         return exc
+     if isinstance(exc, ValueError):
+         return _http_exception_from_value_error(exc)
+     status_code = getattr(exc, "status_code", None)
+     if isinstance(status_code, int):
+         return HTTPException(status_code=status_code, detail=str(exc))
+     return HTTPException(status_code=500, detail="Internal server error: An unexpected error occurred.")
+
+
+ def _preserve_context_for_sse(event_gen):
+     """Preserve request context for SSE streaming.
+
+     StreamingResponse runs in a different task, losing request contextvars.
+     This wrapper captures and restores the context.
+     """
+     context = contextvars.copy_context()
+
+     async def wrapper():
+         try:
+             while True:
+                 try:
+                     task = context.run(asyncio.create_task, event_gen.__anext__())
+                     item = await task
+                 except StopAsyncIteration:
+                     break
+                 yield item
+         except (asyncio.CancelledError, GeneratorExit):
+             if hasattr(event_gen, "aclose"):
+                 await event_gen.aclose()
+             raise
+
+     return wrapper()
+
+
+ # Automatically generate dependency functions from Pydantic models
+ # This ensures the models are the single source of truth for descriptions
+ get_list_chat_completions_request = create_query_dependency(ListChatCompletionsRequest)
+ get_chat_completion_request = create_path_dependency(GetChatCompletionRequest)
+
+
+ def create_router(impl: Inference) -> APIRouter:
+     """Create a FastAPI router for the Inference API.
+
+     Args:
+         impl: The Inference implementation instance
+
+     Returns:
+         APIRouter configured for the Inference API
+     """
+     # Use no prefix - specify full paths for each route to support both v1 and v1alpha endpoints
+     router = APIRouter(
+         tags=["Inference"],
+         responses=standard_responses,
+     )
+
+     @router.post(
+         f"/{LLAMA_STACK_API_V1}/chat/completions",
+         response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
+         summary="Create chat completions.",
+         description="Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
+         responses={
+             200: {
+                 "description": "An OpenAIChatCompletion. When streaming, returns Server-Sent Events (SSE) with OpenAIChatCompletionChunk objects.",
+                 "content": {
+                     "application/json": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletion"}},
+                     "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletionChunk"}},
+                 },
+             },
+         },
+     )
+     async def openai_chat_completion(
+         params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)],
+     ) -> OpenAIChatCompletion | StreamingResponse:
+         result = await impl.openai_chat_completion(params)
+         if isinstance(result, AsyncIterator):
+             return StreamingResponse(
+                 _preserve_context_for_sse(_sse_generator(result, context="chat_completion")),
+                 media_type="text/event-stream",
+             )
+         return result
+
+     @router.get(
+         f"/{LLAMA_STACK_API_V1}/chat/completions",
+         response_model=ListOpenAIChatCompletionResponse,
+         summary="List chat completions.",
+         description="List chat completions.",
+         responses={
+             200: {"description": "A ListOpenAIChatCompletionResponse."},
+         },
+     )
+     async def list_chat_completions(
+         request: Annotated[ListChatCompletionsRequest, Depends(get_list_chat_completions_request)],
+     ) -> ListOpenAIChatCompletionResponse:
+         return await impl.list_chat_completions(request)
+
+     @router.get(
+         f"/{LLAMA_STACK_API_V1}/chat/completions/{{completion_id}}",
+         response_model=OpenAICompletionWithInputMessages,
+         summary="Get chat completion.",
+         description="Describe a chat completion by its ID.",
+         responses={
+             200: {"description": "A OpenAICompletionWithInputMessages."},
+         },
+     )
+     async def get_chat_completion(
+         request: Annotated[GetChatCompletionRequest, Depends(get_chat_completion_request)],
+     ) -> OpenAICompletionWithInputMessages:
+         return await impl.get_chat_completion(request)
+
+     @router.post(
+         f"/{LLAMA_STACK_API_V1}/completions",
+         response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
+         summary="Create completion.",
+         description="Generate an OpenAI-compatible completion for the given prompt using the specified model.",
+         responses={
+             200: {
+                 "description": "An OpenAICompletion. When streaming, returns Server-Sent Events (SSE) with OpenAICompletion chunks.",
+                 "content": {
+                     "application/json": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
+                     "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
+                 },
+             },
+         },
+     )
+     async def openai_completion(
+         params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
+     ) -> OpenAICompletion | StreamingResponse:
+         result = await impl.openai_completion(params)
+         if isinstance(result, AsyncIterator):
+             return StreamingResponse(
+                 _preserve_context_for_sse(_sse_generator(result, context="completion")),
+                 media_type="text/event-stream",
+             )
+         return result
+
+     @router.post(
+         f"/{LLAMA_STACK_API_V1}/embeddings",
+         response_model=OpenAIEmbeddingsResponse,
+         summary="Create embeddings.",
+         description="Generate OpenAI-compatible embeddings for the given input using the specified model.",
+         responses={
+             200: {"description": "An OpenAIEmbeddingsResponse containing the embeddings."},
+         },
+     )
+     async def openai_embeddings(
+         params: Annotated[OpenAIEmbeddingsRequestWithExtraBody, Body(...)],
+     ) -> OpenAIEmbeddingsResponse:
+         return await impl.openai_embeddings(params)
+
+     @router.post(
+         f"/{LLAMA_STACK_API_V1ALPHA}/inference/rerank",
+         response_model=RerankResponse,
+         summary="Rerank documents based on relevance to a query.",
+         description="Rerank a list of documents based on their relevance to a query.",
+         responses={
+             200: {"description": "RerankResponse with indices sorted by relevance score (descending)."},
+         },
+     )
+     async def rerank(
+         request: Annotated[RerankRequest, Body(...)],
+     ) -> RerankResponse:
+         return await impl.rerank(request)
+
+     return router
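
For orientation, a minimal sketch of how the new create_router factory might be mounted on a FastAPI application. This is not part of the diff: the import path follows the file listing above, and StubInference is a hypothetical placeholder that stubs only the methods the router actually calls on impl; a real deployment would pass a full Inference implementation.

# Hypothetical usage sketch (not part of the package diff).
from fastapi import FastAPI, HTTPException

from llama_stack_api.inference.fastapi_routes import create_router


class StubInference:
    """Placeholder standing in for a real Inference implementation."""

    async def openai_chat_completion(self, params):
        raise HTTPException(status_code=501, detail="Not implemented")

    async def list_chat_completions(self, request):
        raise HTTPException(status_code=501, detail="Not implemented")

    async def get_chat_completion(self, request):
        raise HTTPException(status_code=501, detail="Not implemented")

    async def openai_completion(self, params):
        raise HTTPException(status_code=501, detail="Not implemented")

    async def openai_embeddings(self, params):
        raise HTTPException(status_code=501, detail="Not implemented")

    async def rerank(self, request):
        raise HTTPException(status_code=501, detail="Not implemented")


app = FastAPI()
# The router already embeds the versioned paths (LLAMA_STACK_API_V1 / LLAMA_STACK_API_V1ALPHA),
# so it is included without an additional prefix.
app.include_router(create_router(StubInference()))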