llama-stack-api 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- llama_stack_api/__init__.py +1100 -0
- llama_stack_api/admin/__init__.py +45 -0
- llama_stack_api/admin/api.py +72 -0
- llama_stack_api/admin/fastapi_routes.py +117 -0
- llama_stack_api/admin/models.py +113 -0
- llama_stack_api/agents/__init__.py +38 -0
- llama_stack_api/agents/api.py +52 -0
- llama_stack_api/agents/fastapi_routes.py +268 -0
- llama_stack_api/agents/models.py +181 -0
- llama_stack_api/batches/__init__.py +40 -0
- llama_stack_api/batches/api.py +53 -0
- llama_stack_api/batches/fastapi_routes.py +113 -0
- llama_stack_api/batches/models.py +78 -0
- llama_stack_api/benchmarks/__init__.py +43 -0
- llama_stack_api/benchmarks/api.py +39 -0
- llama_stack_api/benchmarks/fastapi_routes.py +109 -0
- llama_stack_api/benchmarks/models.py +109 -0
- llama_stack_api/common/__init__.py +5 -0
- llama_stack_api/common/content_types.py +101 -0
- llama_stack_api/common/errors.py +110 -0
- llama_stack_api/common/job_types.py +38 -0
- llama_stack_api/common/responses.py +77 -0
- llama_stack_api/common/training_types.py +47 -0
- llama_stack_api/common/type_system.py +146 -0
- llama_stack_api/connectors/__init__.py +38 -0
- llama_stack_api/connectors/api.py +50 -0
- llama_stack_api/connectors/fastapi_routes.py +103 -0
- llama_stack_api/connectors/models.py +103 -0
- llama_stack_api/conversations/__init__.py +61 -0
- llama_stack_api/conversations/api.py +44 -0
- llama_stack_api/conversations/fastapi_routes.py +177 -0
- llama_stack_api/conversations/models.py +245 -0
- llama_stack_api/datasetio/__init__.py +34 -0
- llama_stack_api/datasetio/api.py +42 -0
- llama_stack_api/datasetio/fastapi_routes.py +94 -0
- llama_stack_api/datasetio/models.py +48 -0
- llama_stack_api/datasets/__init__.py +61 -0
- llama_stack_api/datasets/api.py +35 -0
- llama_stack_api/datasets/fastapi_routes.py +104 -0
- llama_stack_api/datasets/models.py +152 -0
- llama_stack_api/datatypes.py +373 -0
- llama_stack_api/eval/__init__.py +55 -0
- llama_stack_api/eval/api.py +51 -0
- llama_stack_api/eval/compat.py +300 -0
- llama_stack_api/eval/fastapi_routes.py +126 -0
- llama_stack_api/eval/models.py +141 -0
- llama_stack_api/file_processors/__init__.py +27 -0
- llama_stack_api/file_processors/api.py +64 -0
- llama_stack_api/file_processors/fastapi_routes.py +78 -0
- llama_stack_api/file_processors/models.py +42 -0
- llama_stack_api/files/__init__.py +35 -0
- llama_stack_api/files/api.py +51 -0
- llama_stack_api/files/fastapi_routes.py +124 -0
- llama_stack_api/files/models.py +107 -0
- llama_stack_api/inference/__init__.py +207 -0
- llama_stack_api/inference/api.py +93 -0
- llama_stack_api/inference/fastapi_routes.py +243 -0
- llama_stack_api/inference/models.py +1035 -0
- llama_stack_api/inspect_api/__init__.py +37 -0
- llama_stack_api/inspect_api/api.py +25 -0
- llama_stack_api/inspect_api/fastapi_routes.py +76 -0
- llama_stack_api/inspect_api/models.py +28 -0
- llama_stack_api/internal/__init__.py +9 -0
- llama_stack_api/internal/kvstore.py +28 -0
- llama_stack_api/internal/sqlstore.py +81 -0
- llama_stack_api/models/__init__.py +47 -0
- llama_stack_api/models/api.py +38 -0
- llama_stack_api/models/fastapi_routes.py +104 -0
- llama_stack_api/models/models.py +157 -0
- llama_stack_api/openai_responses.py +1494 -0
- llama_stack_api/post_training/__init__.py +73 -0
- llama_stack_api/post_training/api.py +36 -0
- llama_stack_api/post_training/fastapi_routes.py +116 -0
- llama_stack_api/post_training/models.py +339 -0
- llama_stack_api/prompts/__init__.py +47 -0
- llama_stack_api/prompts/api.py +44 -0
- llama_stack_api/prompts/fastapi_routes.py +163 -0
- llama_stack_api/prompts/models.py +177 -0
- llama_stack_api/providers/__init__.py +33 -0
- llama_stack_api/providers/api.py +16 -0
- llama_stack_api/providers/fastapi_routes.py +57 -0
- llama_stack_api/providers/models.py +24 -0
- llama_stack_api/rag_tool.py +168 -0
- llama_stack_api/resource.py +36 -0
- llama_stack_api/router_utils.py +160 -0
- llama_stack_api/safety/__init__.py +37 -0
- llama_stack_api/safety/api.py +29 -0
- llama_stack_api/safety/datatypes.py +83 -0
- llama_stack_api/safety/fastapi_routes.py +55 -0
- llama_stack_api/safety/models.py +38 -0
- llama_stack_api/schema_utils.py +251 -0
- llama_stack_api/scoring/__init__.py +66 -0
- llama_stack_api/scoring/api.py +35 -0
- llama_stack_api/scoring/fastapi_routes.py +67 -0
- llama_stack_api/scoring/models.py +81 -0
- llama_stack_api/scoring_functions/__init__.py +50 -0
- llama_stack_api/scoring_functions/api.py +39 -0
- llama_stack_api/scoring_functions/fastapi_routes.py +108 -0
- llama_stack_api/scoring_functions/models.py +214 -0
- llama_stack_api/shields/__init__.py +41 -0
- llama_stack_api/shields/api.py +39 -0
- llama_stack_api/shields/fastapi_routes.py +104 -0
- llama_stack_api/shields/models.py +74 -0
- llama_stack_api/tools.py +226 -0
- llama_stack_api/validators.py +46 -0
- llama_stack_api/vector_io/__init__.py +88 -0
- llama_stack_api/vector_io/api.py +234 -0
- llama_stack_api/vector_io/fastapi_routes.py +447 -0
- llama_stack_api/vector_io/models.py +663 -0
- llama_stack_api/vector_stores.py +53 -0
- llama_stack_api/version.py +9 -0
- {llama_stack_api-0.4.3.dist-info → llama_stack_api-0.5.0rc1.dist-info}/METADATA +1 -1
- llama_stack_api-0.5.0rc1.dist-info/RECORD +115 -0
- llama_stack_api-0.5.0rc1.dist-info/top_level.txt +1 -0
- llama_stack_api-0.4.3.dist-info/RECORD +0 -4
- llama_stack_api-0.4.3.dist-info/top_level.txt +0 -1
- {llama_stack_api-0.4.3.dist-info → llama_stack_api-0.5.0rc1.dist-info}/WHEEL +0 -0
llama_stack_api/inference/api.py (new file, +93 lines)

@@ -0,0 +1,93 @@

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from collections.abc import AsyncIterator
from typing import Protocol, runtime_checkable

from llama_stack_api.models import Model

from .models import (
    GetChatCompletionRequest,
    ListChatCompletionsRequest,
    ListOpenAIChatCompletionResponse,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAICompletion,
    OpenAICompletionRequestWithExtraBody,
    OpenAICompletionWithInputMessages,
    OpenAIEmbeddingsRequestWithExtraBody,
    OpenAIEmbeddingsResponse,
    RerankRequest,
    RerankResponse,
)


class ModelStore(Protocol):
    async def get_model(self, identifier: str) -> Model: ...


@runtime_checkable
class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.
    """

    API_NAMESPACE: str = "Inference"

    model_store: ModelStore | None = None

    async def rerank(
        self,
        request: RerankRequest,
    ) -> RerankResponse:
        """Rerank a list of documents based on their relevance to a query."""
        raise NotImplementedError("Reranking is not implemented")
        return  # this is so mypy's safe-super rule will consider the method concrete

    async def openai_completion(
        self,
        params: OpenAICompletionRequestWithExtraBody,
    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
        """Generate an OpenAI-compatible completion for the given prompt using the specified model."""
        ...

    async def openai_chat_completion(
        self,
        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        """Generate an OpenAI-compatible chat completion for the given messages using the specified model."""
        ...

    async def openai_embeddings(
        self,
        params: OpenAIEmbeddingsRequestWithExtraBody,
    ) -> OpenAIEmbeddingsResponse:
        """Generate OpenAI-compatible embeddings for the given input using the specified model."""
        ...


class Inference(InferenceProvider):
    """Inference

    Llama Stack Inference API for generating completions, chat completions, and embeddings.

    This API provides the raw interface to the underlying models. Three kinds of models are supported:
    - LLM models: these models generate "raw" and "chat" (conversational) completions.
    - Embedding models: these models generate embeddings to be used for semantic search.
    - Rerank models: these models reorder the documents based on their relevance to a query.
    """

    async def list_chat_completions(
        self,
        request: ListChatCompletionsRequest,
    ) -> ListOpenAIChatCompletionResponse:
        """List stored chat completions."""
        raise NotImplementedError("List chat completions is not implemented")

    async def get_chat_completion(self, request: GetChatCompletionRequest) -> OpenAICompletionWithInputMessages:
        """Retrieve a stored chat completion by its ID."""
        raise NotImplementedError("Get chat completion is not implemented")
```
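
For orientation, here is a minimal sketch (not part of the diff) of a provider skeleton written against the protocol above. The import paths follow the package layout shown in the file list (`inference/api.py`, `inference/models.py`); the class name and the delegation comments are hypothetical, and a real provider would forward these calls to an actual backend.

```python
# Hypothetical sketch: a provider skeleton against the Inference API above.
# Import paths assume the layout in the file list; the package __init__
# may also re-export these names.
from collections.abc import AsyncIterator

from llama_stack_api.inference.api import Inference
from llama_stack_api.inference.models import (
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAICompletion,
    OpenAICompletionRequestWithExtraBody,
    OpenAIEmbeddingsRequestWithExtraBody,
    OpenAIEmbeddingsResponse,
)


class MyProvider(Inference):
    """Hypothetical provider. Subclassing Inference keeps the default
    rerank/list/get methods, which raise NotImplementedError until overridden."""

    async def openai_completion(
        self, params: OpenAICompletionRequestWithExtraBody
    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
        raise NotImplementedError  # delegate to the backing model server here

    async def openai_chat_completion(
        self, params: OpenAIChatCompletionRequestWithExtraBody
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        raise NotImplementedError  # return a chunk iterator for streaming requests

    async def openai_embeddings(
        self, params: OpenAIEmbeddingsRequestWithExtraBody
    ) -> OpenAIEmbeddingsResponse:
        raise NotImplementedError  # call the embedding model here
```

Because `InferenceProvider` is decorated with `@runtime_checkable`, a duck-typed object can also be verified with `isinstance(obj, InferenceProvider)` without explicit subclassing; that check only confirms the presence of the protocol's members, not their signatures.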
llama_stack_api/inference/fastapi_routes.py (new file, +243 lines)

@@ -0,0 +1,243 @@

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""FastAPI router for the Inference API.

This module defines the FastAPI router for the Inference API using standard
FastAPI route decorators. The router is defined in the API package to keep
all API-related code together.
"""

import asyncio
import contextvars
import json
import logging  # allow-direct-logging
from collections.abc import AsyncIterator
from typing import Annotated, Any

from fastapi import APIRouter, Body, Depends, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
from llama_stack_api.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA

from .api import Inference
from .models import (
    GetChatCompletionRequest,
    ListChatCompletionsRequest,
    ListOpenAIChatCompletionResponse,
    OpenAIChatCompletion,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAICompletion,
    OpenAICompletionRequestWithExtraBody,
    OpenAICompletionWithInputMessages,
    OpenAIEmbeddingsRequestWithExtraBody,
    OpenAIEmbeddingsResponse,
    RerankRequest,
    RerankResponse,
)

logger = logging.LoggerAdapter(logging.getLogger(__name__), {"category": "inference"})


def _create_sse_event(data: Any) -> str:
    """Create a Server-Sent Event string from data."""
    if isinstance(data, BaseModel):
        data = data.model_dump_json()
    else:
        data = json.dumps(data)
    return f"data: {data}\n\n"


async def _sse_generator(event_gen: AsyncIterator[Any], context: str = "inference") -> AsyncIterator[str]:
    """Convert an async generator to SSE format."""
    try:
        async for item in event_gen:
            yield _create_sse_event(item)
    except asyncio.CancelledError:
        if hasattr(event_gen, "aclose"):
            await event_gen.aclose()
        raise
    except Exception as e:
        logger.exception(f"Error in SSE generator ({context})")
        exc = _http_exception_from_sse_error(e)
        yield _create_sse_event({"error": {"status_code": exc.status_code, "message": exc.detail}})


def _http_exception_from_value_error(exc: ValueError) -> HTTPException:
    """Convert a ValueError to an HTTPException."""
    detail = str(exc) or "Invalid value"
    return HTTPException(status_code=400, detail=detail)


def _http_exception_from_sse_error(exc: Exception) -> HTTPException:
    """Convert an exception to an HTTPException."""
    if isinstance(exc, HTTPException):
        return exc
    if isinstance(exc, ValueError):
        return _http_exception_from_value_error(exc)
    status_code = getattr(exc, "status_code", None)
    if isinstance(status_code, int):
        return HTTPException(status_code=status_code, detail=str(exc))
    return HTTPException(status_code=500, detail="Internal server error: An unexpected error occurred.")


def _preserve_context_for_sse(event_gen):
    """Preserve request context for SSE streaming.

    StreamingResponse runs in a different task, losing request contextvars.
    This wrapper captures and restores the context.
    """
    context = contextvars.copy_context()

    async def wrapper():
        try:
            while True:
                try:
                    task = context.run(asyncio.create_task, event_gen.__anext__())
                    item = await task
                except StopAsyncIteration:
                    break
                yield item
        except (asyncio.CancelledError, GeneratorExit):
            if hasattr(event_gen, "aclose"):
                await event_gen.aclose()
            raise

    return wrapper()


# Automatically generate dependency functions from Pydantic models
# This ensures the models are the single source of truth for descriptions
get_list_chat_completions_request = create_query_dependency(ListChatCompletionsRequest)
get_chat_completion_request = create_path_dependency(GetChatCompletionRequest)


def create_router(impl: Inference) -> APIRouter:
    """Create a FastAPI router for the Inference API.

    Args:
        impl: The Inference implementation instance

    Returns:
        APIRouter configured for the Inference API
    """
    # Use no prefix - specify full paths for each route to support both v1 and v1alpha endpoints
    router = APIRouter(
        tags=["Inference"],
        responses=standard_responses,
    )

    @router.post(
        f"/{LLAMA_STACK_API_V1}/chat/completions",
        response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
        summary="Create chat completions.",
        description="Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
        responses={
            200: {
                "description": "An OpenAIChatCompletion. When streaming, returns Server-Sent Events (SSE) with OpenAIChatCompletionChunk objects.",
                "content": {
                    "application/json": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletion"}},
                    "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAIChatCompletionChunk"}},
                },
            },
        },
    )
    async def openai_chat_completion(
        params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)],
    ) -> OpenAIChatCompletion | StreamingResponse:
        result = await impl.openai_chat_completion(params)
        if isinstance(result, AsyncIterator):
            return StreamingResponse(
                _preserve_context_for_sse(_sse_generator(result, context="chat_completion")),
                media_type="text/event-stream",
            )
        return result

    @router.get(
        f"/{LLAMA_STACK_API_V1}/chat/completions",
        response_model=ListOpenAIChatCompletionResponse,
        summary="List chat completions.",
        description="List chat completions.",
        responses={
            200: {"description": "A ListOpenAIChatCompletionResponse."},
        },
    )
    async def list_chat_completions(
        request: Annotated[ListChatCompletionsRequest, Depends(get_list_chat_completions_request)],
    ) -> ListOpenAIChatCompletionResponse:
        return await impl.list_chat_completions(request)

    @router.get(
        f"/{LLAMA_STACK_API_V1}/chat/completions/{{completion_id}}",
        response_model=OpenAICompletionWithInputMessages,
        summary="Get chat completion.",
        description="Describe a chat completion by its ID.",
        responses={
            200: {"description": "A OpenAICompletionWithInputMessages."},
        },
    )
    async def get_chat_completion(
        request: Annotated[GetChatCompletionRequest, Depends(get_chat_completion_request)],
    ) -> OpenAICompletionWithInputMessages:
        return await impl.get_chat_completion(request)

    @router.post(
        f"/{LLAMA_STACK_API_V1}/completions",
        response_model=None,  # Dynamic response: non-streaming (JSON) or streaming (SSE)
        summary="Create completion.",
        description="Generate an OpenAI-compatible completion for the given prompt using the specified model.",
        responses={
            200: {
                "description": "An OpenAICompletion. When streaming, returns Server-Sent Events (SSE) with OpenAICompletion chunks.",
                "content": {
                    "application/json": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
                    "text/event-stream": {"schema": {"$ref": "#/components/schemas/OpenAICompletion"}},
                },
            },
        },
    )
    async def openai_completion(
        params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
    ) -> OpenAICompletion | StreamingResponse:
        result = await impl.openai_completion(params)
        if isinstance(result, AsyncIterator):
            return StreamingResponse(
                _preserve_context_for_sse(_sse_generator(result, context="completion")),
                media_type="text/event-stream",
            )
        return result

    @router.post(
        f"/{LLAMA_STACK_API_V1}/embeddings",
        response_model=OpenAIEmbeddingsResponse,
        summary="Create embeddings.",
        description="Generate OpenAI-compatible embeddings for the given input using the specified model.",
        responses={
            200: {"description": "An OpenAIEmbeddingsResponse containing the embeddings."},
        },
    )
    async def openai_embeddings(
        params: Annotated[OpenAIEmbeddingsRequestWithExtraBody, Body(...)],
    ) -> OpenAIEmbeddingsResponse:
        return await impl.openai_embeddings(params)

    @router.post(
        f"/{LLAMA_STACK_API_V1ALPHA}/inference/rerank",
        response_model=RerankResponse,
        summary="Rerank documents based on relevance to a query.",
        description="Rerank a list of documents based on their relevance to a query.",
        responses={
            200: {"description": "RerankResponse with indices sorted by relevance score (descending)."},
        },
    )
    async def rerank(
        request: Annotated[RerankRequest, Body(...)],
    ) -> RerankResponse:
        return await impl.rerank(request)

    return router
```
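
To show how `create_router` is meant to be consumed, here is a minimal sketch (not from the diff) that mounts the router on a FastAPI app. `StubInference` is hypothetical and simply inherits the NotImplementedError defaults; a real deployment would pass a concrete provider. The paths in the closing comment assume `LLAMA_STACK_API_V1` and `LLAMA_STACK_API_V1ALPHA` resolve to `v1` and `v1alpha`.

```python
# Hypothetical sketch: wiring the generated Inference router into an application.
from fastapi import FastAPI

from llama_stack_api.inference.api import Inference
from llama_stack_api.inference.fastapi_routes import create_router


class StubInference(Inference):
    """Placeholder implementation; real providers override the openai_* methods."""


app = FastAPI()
app.include_router(create_router(StubInference()))

# Assuming the version constants resolve to "v1" / "v1alpha", this registers:
#   POST /v1/chat/completions        (JSON, or SSE stream of chat completion chunks)
#   GET  /v1/chat/completions
#   GET  /v1/chat/completions/{completion_id}
#   POST /v1/completions             (JSON, or SSE stream)
#   POST /v1/embeddings
#   POST /v1alpha/inference/rerank
```

When an implementation returns an async iterator, the handler wraps it in a `StreamingResponse`, so streaming clients receive `text/event-stream` with one `data: {...}` line per chunk, matching the SSE helpers defined earlier in the module.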