guidellm 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of guidellm has been flagged as potentially problematic.
- guidellm/__init__.py +38 -6
- guidellm/__main__.py +294 -0
- guidellm/backend/__init__.py +19 -6
- guidellm/backend/backend.py +238 -0
- guidellm/backend/openai.py +532 -122
- guidellm/backend/response.py +132 -0
- guidellm/benchmark/__init__.py +73 -0
- guidellm/benchmark/aggregator.py +760 -0
- guidellm/benchmark/benchmark.py +838 -0
- guidellm/benchmark/benchmarker.py +334 -0
- guidellm/benchmark/entrypoints.py +141 -0
- guidellm/benchmark/output.py +946 -0
- guidellm/benchmark/profile.py +409 -0
- guidellm/benchmark/progress.py +720 -0
- guidellm/config.py +34 -56
- guidellm/data/__init__.py +4 -0
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +22 -0
- guidellm/dataset/creator.py +213 -0
- guidellm/dataset/entrypoints.py +42 -0
- guidellm/dataset/file.py +90 -0
- guidellm/dataset/hf_datasets.py +62 -0
- guidellm/dataset/in_memory.py +132 -0
- guidellm/dataset/synthetic.py +262 -0
- guidellm/objects/__init__.py +18 -0
- guidellm/objects/pydantic.py +60 -0
- guidellm/objects/statistics.py +947 -0
- guidellm/request/__init__.py +12 -10
- guidellm/request/loader.py +281 -0
- guidellm/request/request.py +79 -0
- guidellm/scheduler/__init__.py +51 -3
- guidellm/scheduler/result.py +137 -0
- guidellm/scheduler/scheduler.py +382 -0
- guidellm/scheduler/strategy.py +493 -0
- guidellm/scheduler/types.py +7 -0
- guidellm/scheduler/worker.py +511 -0
- guidellm/utils/__init__.py +16 -29
- guidellm/utils/colors.py +8 -0
- guidellm/utils/hf_transformers.py +35 -0
- guidellm/utils/random.py +43 -0
- guidellm/utils/text.py +118 -357
- {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/METADATA +96 -79
- guidellm-0.2.0.dist-info/RECORD +48 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/WHEEL +1 -1
- guidellm-0.2.0.dist-info/entry_points.txt +2 -0
- guidellm/backend/base.py +0 -320
- guidellm/core/__init__.py +0 -24
- guidellm/core/distribution.py +0 -190
- guidellm/core/report.py +0 -321
- guidellm/core/request.py +0 -44
- guidellm/core/result.py +0 -545
- guidellm/core/serializable.py +0 -169
- guidellm/executor/__init__.py +0 -10
- guidellm/executor/base.py +0 -213
- guidellm/executor/profile_generator.py +0 -343
- guidellm/main.py +0 -336
- guidellm/request/base.py +0 -194
- guidellm/request/emulated.py +0 -391
- guidellm/request/file.py +0 -76
- guidellm/request/transformers.py +0 -100
- guidellm/scheduler/base.py +0 -374
- guidellm/scheduler/load_generator.py +0 -196
- guidellm/utils/injector.py +0 -70
- guidellm/utils/progress.py +0 -196
- guidellm/utils/transformers.py +0 -151
- guidellm-0.1.0.dist-info/RECORD +0 -35
- guidellm-0.1.0.dist-info/entry_points.txt +0 -3
- {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info/licenses}/LICENSE +0 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/top_level.txt +0 -0
guidellm/backend/openai.py
CHANGED
@@ -1,168 +1,578 @@
+import base64
+import json
+import time
+from collections.abc import AsyncGenerator
+from pathlib import Path
+from typing import Any, Literal, Optional, Union

+import httpx
 from loguru import logger
-from
+from PIL import Image

-from guidellm.backend.
+from guidellm.backend.backend import Backend
+from guidellm.backend.response import (
+    RequestArgs,
+    ResponseSummary,
+    StreamingTextResponse,
+)
 from guidellm.config import settings
-from guidellm.core import TextGenerationRequest

-__all__ = ["
+__all__ = ["OpenAIHTTPBackend", "TEXT_COMPLETIONS_PATH", "CHAT_COMPLETIONS_PATH"]


+TEXT_COMPLETIONS_PATH = "/v1/completions"
+CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+
+
+@Backend.register("openai_http")
+class OpenAIHTTPBackend(Backend):
     """
-    :param
+    A HTTP-based backend implementation for requests to an OpenAI compatible server.
+    For example, a vLLM server instance or requests to OpenAI's API.
+
+    :param target: The target URL string for the OpenAI server. ex: http://0.0.0.0:8000
+    :param model: The model to use for all requests on the target server.
+        If none is provided, the first available model will be used.
+    :param api_key: The API key to use for requests to the OpenAI server.
+        If provided, adds an Authorization header with the value
+        "Authorization: Bearer {api_key}".
+        If not provided, no Authorization header is added.
+    :param organization: The organization to use for requests to the OpenAI server.
+        For example, if set to "org_123", adds an OpenAI-Organization header with the
+        value "OpenAI-Organization: org_123".
+        If not provided, no OpenAI-Organization header is added.
+    :param project: The project to use for requests to the OpenAI server.
+        For example, if set to "project_123", adds an OpenAI-Project header with the
+        value "OpenAI-Project: project_123".
+        If not provided, no OpenAI-Project header is added.
+    :param timeout: The timeout to use for requests to the OpenAI server.
+        If not provided, the default timeout provided from settings is used.
+    :param http2: If True, uses HTTP/2 for requests to the OpenAI server.
+        Defaults to True.
+    :param max_output_tokens: The maximum number of tokens to request for completions.
+        If not provided, the default maximum tokens provided from settings is used.
     """

     def __init__(
         self,
-        openai_api_key: Optional[str] = None,
         target: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        organization: Optional[str] = None,
+        project: Optional[str] = None,
+        timeout: Optional[float] = None,
+        http2: Optional[bool] = True,
+        max_output_tokens: Optional[int] = None,
     ):
-        if not
+        super().__init__(type_="openai_http")
+        self._target = target or settings.openai.base_url
+
+        if not self._target:
+            raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
+
+        if self._target.endswith("/v1") or self._target.endswith("/v1/"):
+            # backwards compatability, strip v1 off
+            self._target = self._target[:-3]
+
+        if self._target.endswith("/"):
+            self._target = self._target[:-1]
+
+        self._model = model
+
+        api_key = api_key or settings.openai.api_key
+        self.authorization = (
+            f"Bearer {api_key}" if api_key else settings.openai.bearer_token
+        )
+
+        self.organization = organization or settings.openai.organization
+        self.project = project or settings.openai.project
+        self.timeout = timeout if timeout is not None else settings.request_timeout
+        self.http2 = http2 if http2 is not None else settings.request_http2
+        self.max_output_tokens = (
+            max_output_tokens
+            if max_output_tokens is not None
+            else settings.openai.max_output_tokens
+        )
+        self._async_client: Optional[httpx.AsyncClient] = None
+
+    @property
+    def target(self) -> str:
+        """
+        :return: The target URL string for the OpenAI server.
+        """
+        return self._target
+
+    @property
+    def model(self) -> Optional[str]:
+        """
+        :return: The model to use for all requests on the target server.
+            If validate hasn't been called yet and no model was passed in,
+            this will be None until validate is called to set the default.
+        """
+        return self._model
+
+    @property
+    def info(self) -> dict[str, Any]:
+        """
+        :return: The information about the backend.
+        """
+        return {
+            "max_output_tokens": self.max_output_tokens,
+            "timeout": self.timeout,
+            "http2": self.http2,
+            "authorization": bool(self.authorization),
+            "organization": self.organization,
+            "project": self.project,
+            "text_completions_path": TEXT_COMPLETIONS_PATH,
+            "chat_completions_path": CHAT_COMPLETIONS_PATH,
+        }

+    async def check_setup(self):
+        """
+        Check if the backend is setup correctly and can be used for requests.
+        Specifically, if a model is not provided, it grabs the first available model.
+        If no models are available, raises a ValueError.
+        If a model is provided and not available, raises a ValueError.
+
+        :raises ValueError: If no models or the provided model is not available.
+        """
+        models = await self.available_models()
+        if not models:
+            raise ValueError(f"No models available for target: {self.target}")

-        if not
+        if not self.model:
+            self._model = models[0]
+        elif self.model not in models:
+            raise ValueError(
+                f"Model {self.model} not found in available models:"
+                "{models} for target: {self.target}"
             )
-            logger.error("{}", err)
-            raise err

+    async def prepare_multiprocessing(self):
+        """
+        Prepare the backend for use in a multiprocessing environment.
+        Clears out the sync and async clients to ensure they are re-initialized
+        for each process.
+        """
+        if self._async_client is not None:
+            await self._async_client.aclose()
+            self._async_client = None
+
+    async def available_models(self) -> list[str]:
+        """
+        Get the available models for the target server using the OpenAI models endpoint:
+        /v1/models
+        """
+        target = f"{self.target}/v1/models"
+        headers = self._headers()
+        response = await self._get_async_client().get(target, headers=headers)
+        response.raise_for_status()
+
+        models = []
+
+        for item in response.json()["data"]:
+            models.append(item["id"])
+
+        return models

+    async def text_completions(  # type: ignore[override]
+        self,
+        prompt: Union[str, list[str]],
+        request_id: Optional[str] = None,
+        prompt_token_count: Optional[int] = None,
+        output_token_count: Optional[int] = None,
+        **kwargs,
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
+        """
+        Generate text completions for the given prompt using the OpenAI
+        completions endpoint: /v1/completions.
+
+        :param prompt: The prompt (or list of prompts) to generate a completion for.
+            If a list is supplied, these are concatenated and run through the model
+            for a single prompt.
+        :param request_id: The unique identifier for the request, if any.
+            Added to logging statements and the response for tracking purposes.
+        :param prompt_token_count: The number of tokens measured in the prompt, if any.
+            Returned in the response stats for later analysis, if applicable.
+        :param output_token_count: If supplied, the number of tokens to enforce
+            generation of for the output for this request.
+        :param kwargs: Additional keyword arguments to pass with the request.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
+        """
+        logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
+        headers = self._headers()
+        payload = self._completions_payload(
+            orig_kwargs=kwargs,
+            max_output_tokens=output_token_count,
+            prompt=prompt,
+        )

+        try:
+            async for resp in self._iterative_completions_request(
+                type_="text_completions",
+                request_id=request_id,
+                request_prompt_tokens=prompt_token_count,
+                request_output_tokens=output_token_count,
+                headers=headers,
+                payload=payload,
+            ):
+                yield resp
+        except Exception as ex:
+            logger.error(
+                "{} request with headers: {} and payload: {} failed: {}",
+                self.__class__.__name__,
+                headers,
+                payload,
+                ex,
+            )
+            raise ex
+
+    async def chat_completions(  # type: ignore[override]
         self,
+        content: Union[
+            str,
+            list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
+            Any,
+        ],
+        request_id: Optional[str] = None,
+        prompt_token_count: Optional[int] = None,
+        output_token_count: Optional[int] = None,
+        raw_content: bool = False,
+        **kwargs,
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         """
+        Generate chat completions for the given content using the OpenAI
+        chat completions endpoint: /v1/chat/completions.
+
+        :param content: The content (or list of content) to generate a completion for.
+            This supports any combination of text, images, and audio (model dependent).
+            Supported text only request examples:
+                content="Sample prompt", content=["Sample prompt", "Second prompt"],
+                content=[{"type": "text", "value": "Sample prompt"}.
+            Supported text and image request examples:
+                content=["Describe the image", PIL.Image.open("image.jpg")],
+                content=["Describe the image", Path("image.jpg")],
+                content=["Describe the image", {"type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}].
+            Supported text and audio request examples:
+                content=["Transcribe the audio", Path("audio.wav")],
+                content=["Transcribe the audio", {"type": "input_audio",
+                "input_audio": {"data": f"{base64_bytes}", "format": "wav}].
+            Additionally, if raw_content=True then the content is passed directly to the
+            backend without any processing.
+        :param request_id: The unique identifier for the request, if any.
+            Added to logging statements and the response for tracking purposes.
+        :param prompt_token_count: The number of tokens measured in the prompt, if any.
+            Returned in the response stats for later analysis, if applicable.
+        :param output_token_count: If supplied, the number of tokens to enforce
+            generation of for the output for this request.
+        :param kwargs: Additional keyword arguments to pass with the request.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
+        """
+        logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
+        headers = self._headers()
+        messages = (
+            content if raw_content else self._create_chat_messages(content=content)
+        )
+        payload = self._completions_payload(
+            orig_kwargs=kwargs,
+            max_output_tokens=output_token_count,
+            messages=messages,
+        )
+
+        try:
+            async for resp in self._iterative_completions_request(
+                type_="chat_completions",
+                request_id=request_id,
+                request_prompt_tokens=prompt_token_count,
+                request_output_tokens=output_token_count,
+                headers=headers,
+                payload=payload,
+            ):
+                yield resp
+        except Exception as ex:
+            logger.error(
+                "{} request with headers: {} and payload: {} failed: {}",
+                self.__class__.__name__,
+                headers,
+                payload,
+                ex,
+            )
+            raise ex

+    def _get_async_client(self) -> httpx.AsyncClient:
+        """
+        Get the async HTTP client for making requests.
+        If the client has not been created yet, it will create one.

-        :
-        :type request: TextGenerationRequest
-        :yield: A stream of GenerativeResponse objects.
-        :rtype: AsyncGenerator[GenerativeResponse, None]
+        :return: The async HTTP client.
         """
+        if self._async_client is None:
+            client = httpx.AsyncClient(http2=self.http2, timeout=self.timeout)
+            self._async_client = client
+        else:
+            client = self._async_client
+
+        return client
+
+    def _headers(self) -> dict[str, str]:
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        if self.authorization:
+            headers["Authorization"] = self.authorization
+
+        if self.organization:
+            headers["OpenAI-Organization"] = self.organization
+
+        if self.project:
+            headers["OpenAI-Project"] = self.project

+        return headers

+    def _completions_payload(
+        self, orig_kwargs: Optional[dict], max_output_tokens: Optional[int], **kwargs
+    ) -> dict:
+        payload = orig_kwargs or {}
+        payload.update(kwargs)
+        payload["model"] = self.model
+        payload["stream"] = True
+        payload["stream_options"] = {
+            "include_usage": True,
         }

-        if
+        if max_output_tokens or self.max_output_tokens:
+            logger.debug(
+                "{} adding payload args for setting output_token_count: {}",
+                self.__class__.__name__,
+                max_output_tokens or self.max_output_tokens,
+            )
+            payload["max_tokens"] = max_output_tokens or self.max_output_tokens
+            payload["max_completion_tokens"] = payload["max_tokens"]
+
+            if max_output_tokens:
+                # only set stop and ignore_eos if max_output_tokens set at request level
+                # otherwise the instance value is just the max to enforce we stay below
+                payload["stop"] = None
+                payload["ignore_eos"] = True
+
+        return payload
+
+    @staticmethod
+    def _create_chat_messages(
+        content: Union[
+            str,
+            list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
+            Any,
+        ],
+    ) -> list[dict]:
+        if isinstance(content, str):
+            return [
                 {
-                    "
-                    "
+                    "role": "user",
+                    "content": content,
                 }
+            ]
+
+        if isinstance(content, list):
+            resolved_content = []
+
+            for item in content:
+                if isinstance(item, dict):
+                    resolved_content.append(item)
+                elif isinstance(item, str):
+                    resolved_content.append({"type": "text", "text": item})
+                elif isinstance(item, Image.Image) or (
+                    isinstance(item, Path) and item.suffix.lower() in [".jpg", ".jpeg"]
+                ):
+                    image = item if isinstance(item, Image.Image) else Image.open(item)
+                    encoded = base64.b64encode(image.tobytes()).decode("utf-8")
+                    resolved_content.append(
+                        {
+                            "type": "image",
+                            "image": {
+                                "url": f"data:image/jpeg;base64,{encoded}",
+                            },
+                        }
+                    )
+                elif isinstance(item, Path) and item.suffix.lower() in [".wav"]:
+                    encoded = base64.b64encode(item.read_bytes()).decode("utf-8")
+                    resolved_content.append(
+                        {
+                            "type": "input_audio",
+                            "input_audio": {
+                                "data": f"{encoded}",
+                                "format": "wav",
+                            },
+                        }
+                    )
+                else:
+                    raise ValueError(
+                        f"Unsupported content item type: {item} in list: {content}"
+                    )
+
+            return [
                 {
-                    "
+                    "role": "user",
+                    "content": resolved_content,
                 }
+            ]
+
+        raise ValueError(f"Unsupported content type: {content}")
+
+    async def _iterative_completions_request(
+        self,
+        type_: Literal["text_completions", "chat_completions"],
+        request_id: Optional[str],
+        request_prompt_tokens: Optional[int],
+        request_output_tokens: Optional[int],
+        headers: dict,
+        payload: dict,
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
+        if type_ == "text_completions":
+            target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
+        elif type_ == "chat_completions":
+            target = f"{self.target}{CHAT_COMPLETIONS_PATH}"
+        else:
+            raise ValueError(f"Unsupported type: {type_}")
+
+        logger.info(
+            "{} making request: {} to target: {} using http2: {} for "
+            "timeout: {} with headers: {} and payload: {}",
+            self.__class__.__name__,
+            request_id,
+            target,
+            self.http2,
+            self.timeout,
+            headers,
+            payload,
+        )

+        response_value = ""
+        response_prompt_count: Optional[int] = None
+        response_output_count: Optional[int] = None
+        iter_count = 0
+        start_time = time.time()
+        iter_time = start_time
+        first_iter_time: Optional[float] = None
+        last_iter_time: Optional[float] = None

+        yield StreamingTextResponse(
+            type_="start",
+            value="",
+            start_time=start_time,
+            first_iter_time=None,
+            iter_count=iter_count,
+            delta="",
+            time=start_time,
+            request_id=request_id,
         )
+
+        # reset start time after yielding start response to ensure accurate timing
+        start_time = time.time()
+
+        async with self._get_async_client().stream(
+            "POST", target, headers=headers, json=payload
+        ) as stream:
+            stream.raise_for_status()
+
+            async for line in stream.aiter_lines():
+                iter_time = time.time()
+                logger.debug(
+                    "{} request: {} recieved iter response line: {}",
+                    self.__class__.__name__,
+                    request_id,
+                    line,
                 )
-                    break
-                token_count += 1
-                yield GenerativeResponse(
-                    type_="token_iter",
-                    add_token=token,
-                    prompt=request.prompt,
-                    prompt_token_count=request.prompt_token_count,
-                    output_token_count=token_count,
-                )

-        Get the available models for the backend.
+                if not line or not line.strip().startswith("data:"):
+                    continue

+                if line.strip() == "data: [DONE]":
+                    break

+                data = json.loads(line.strip()[len("data: ") :])
+                if delta := self._extract_completions_delta_content(type_, data):
+                    if first_iter_time is None:
+                        first_iter_time = iter_time
+                    last_iter_time = iter_time

-        except Exception as error:
-            logger.error("Failed to retrieve available models: {}", error)
-            raise error
+                    iter_count += 1
+                    response_value += delta

+                    yield StreamingTextResponse(
+                        type_="iter",
+                        value=response_value,
+                        iter_count=iter_count,
+                        start_time=start_time,
+                        first_iter_time=first_iter_time,
+                        delta=delta,
+                        time=iter_time,
+                        request_id=request_id,
+                    )

+                if usage := self._extract_completions_usage(data):
+                    response_prompt_count = usage["prompt"]
+                    response_output_count = usage["output"]

+        logger.info(
+            "{} request: {} with headers: {} and payload: {} completed with: {}",
+            self.__class__.__name__,
+            request_id,
+            headers,
+            payload,
+            response_value,
+        )

+        yield ResponseSummary(
+            value=response_value,
+            request_args=RequestArgs(
+                target=target,
+                headers=headers,
+                payload=payload,
+                timeout=self.timeout,
+                http2=self.http2,
+            ),
+            start_time=start_time,
+            end_time=iter_time,
+            first_iter_time=first_iter_time,
+            last_iter_time=last_iter_time,
+            iterations=iter_count,
+            request_prompt_tokens=request_prompt_tokens,
+            request_output_tokens=request_output_tokens,
+            response_prompt_tokens=response_prompt_count,
+            response_output_tokens=response_output_count,
+            request_id=request_id,
+        )
+
+    @staticmethod
+    def _extract_completions_delta_content(
+        type_: Literal["text_completions", "chat_completions"], data: dict
+    ) -> Optional[str]:
+        if "choices" not in data or not data["choices"]:
+            return None
+
+        if type_ == "text_completions":
+            return data["choices"][0]["text"]
+
+        if type_ == "chat_completions":
+            return data["choices"][0]["delta"]["content"]
+
+        raise ValueError(f"Unsupported type: {type_}")
+
+    @staticmethod
+    def _extract_completions_usage(
+        data: dict,
+    ) -> Optional[dict[Literal["prompt", "output"], int]]:
+        if "usage" not in data or not data["usage"]:
+            return None
+
+        return {
+            "prompt": data["usage"]["prompt_tokens"],
+            "output": data["usage"]["completion_tokens"],
+        }