guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
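
Note the package-layout renames visible in the list above: guidellm/backend/ becomes guidellm/backends/, guidellm/config.py becomes guidellm/settings.py, and the old dataset/, objects/, request/, and presentation/ packages are removed. As a minimal migration sketch for downstream code: the 0.3.1 import paths are taken verbatim from the deleted sources below, while the 0.6.0a5 paths are an unverified assumption inferred only from the renamed files in this list and should be checked against the new modules before use.

    # Hypothetical compatibility shim. The 0.6.0a5 symbol locations are an
    # assumption based solely on the renamed files in this diff.
    try:
        from guidellm.backends.backend import Backend  # 0.6.0a5 layout (assumed)
        from guidellm.settings import settings  # guidellm/config.py -> settings.py
    except ImportError:
        from guidellm.backend.backend import Backend  # 0.3.1 layout (verbatim below)
        from guidellm.config import settings
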
@@ -1,708 +0,0 @@
- import base64
- import copy
- import json
- import time
- from collections.abc import AsyncGenerator
- from pathlib import Path
- from typing import Any, Literal, Optional, Union
-
- import httpx
- from loguru import logger
- from PIL import Image
-
- from guidellm.backend.backend import Backend
- from guidellm.backend.response import (
-     RequestArgs,
-     ResponseSummary,
-     StreamingTextResponse,
- )
- from guidellm.config import settings
-
- __all__ = [
-     "CHAT_COMPLETIONS",
-     "CHAT_COMPLETIONS_PATH",
-     "MODELS",
-     "TEXT_COMPLETIONS",
-     "TEXT_COMPLETIONS_PATH",
-     "OpenAIHTTPBackend",
- ]
-
-
- TEXT_COMPLETIONS_PATH = "/v1/completions"
- CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
-
- CompletionEndpointType = Literal["text_completions", "chat_completions"]
- EndpointType = Union[Literal["models"], CompletionEndpointType]
- CHAT_COMPLETIONS: CompletionEndpointType = "chat_completions"
- MODELS: EndpointType = "models"
- TEXT_COMPLETIONS: CompletionEndpointType = "text_completions"
-
-
- @Backend.register("openai_http")
- class OpenAIHTTPBackend(Backend):
-     """
-     An HTTP-based backend implementation for requests to an OpenAI-compatible server.
-     For example, a vLLM server instance or requests to OpenAI's API.
-
-     :param target: The target URL string for the OpenAI server. ex: http://0.0.0.0:8000
-     :param model: The model to use for all requests on the target server.
-         If none is provided, the first available model will be used.
-     :param api_key: The API key to use for requests to the OpenAI server.
-         If provided, adds an Authorization header with the value
-         "Authorization: Bearer {api_key}".
-         If not provided, no Authorization header is added.
-     :param organization: The organization to use for requests to the OpenAI server.
-         For example, if set to "org_123", adds an OpenAI-Organization header with the
-         value "OpenAI-Organization: org_123".
-         If not provided, no OpenAI-Organization header is added.
-     :param project: The project to use for requests to the OpenAI server.
-         For example, if set to "project_123", adds an OpenAI-Project header with the
-         value "OpenAI-Project: project_123".
-         If not provided, no OpenAI-Project header is added.
-     :param timeout: The timeout to use for requests to the OpenAI server.
-         If not provided, the default timeout provided from settings is used.
-     :param http2: If True, uses HTTP/2 for requests to the OpenAI server.
-         Defaults to True.
-     :param follow_redirects: If True, the HTTP client will follow redirect responses.
-         If not provided, the default value from settings is used.
-     :param max_output_tokens: The maximum number of tokens to request for completions.
-         If not provided, the default maximum tokens provided from settings is used.
-     :param extra_query: Query parameters to include in requests to the OpenAI server.
-         If "chat_completions", "models", or "text_completions" are included as keys,
-         the values of these keys will be used as the parameters for the respective
-         endpoint.
-         If not provided, no extra query parameters are added.
-     :param extra_body: Body parameters to include in requests to the OpenAI server.
-         If "chat_completions", "models", or "text_completions" are included as keys,
-         the values of these keys will be included in the body for the respective
-         endpoint.
-         If not provided, no extra body parameters are added.
-     :param remove_from_body: Parameters that should be removed from the body of each
-         request.
-         If not provided, no parameters are removed from the body.
-     """
-
-     def __init__(
-         self,
-         target: Optional[str] = None,
-         model: Optional[str] = None,
-         api_key: Optional[str] = None,
-         organization: Optional[str] = None,
-         project: Optional[str] = None,
-         timeout: Optional[float] = None,
-         http2: Optional[bool] = True,
-         follow_redirects: Optional[bool] = None,
-         max_output_tokens: Optional[int] = None,
-         extra_query: Optional[dict] = None,
-         extra_body: Optional[dict] = None,
-         remove_from_body: Optional[list[str]] = None,
-         headers: Optional[dict] = None,
-         verify: Optional[bool] = None,
-     ):
-         super().__init__(type_="openai_http")
-         self._target = target or settings.openai.base_url
-
-         if not self._target:
-             raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
-
-         if self._target.endswith("/v1") or self._target.endswith("/v1/"):
-             # backwards compatibility, strip v1 off
-             self._target = self._target[:-3]
-
-         if self._target.endswith("/"):
-             self._target = self._target[:-1]
-
-         self._model = model
-
-         # Start with default headers based on other params
-         default_headers: dict[str, str] = {}
-         api_key = api_key or settings.openai.api_key
-         bearer_token = settings.openai.bearer_token
-         if api_key:
-             default_headers["Authorization"] = f"Bearer {api_key}"
-         elif bearer_token:
-             default_headers["Authorization"] = bearer_token
-
-         self.organization = organization or settings.openai.organization
-         if self.organization:
-             default_headers["OpenAI-Organization"] = self.organization
-
-         self.project = project or settings.openai.project
-         if self.project:
-             default_headers["OpenAI-Project"] = self.project
-
-         # User-provided headers from kwargs or settings override defaults
-         merged_headers = default_headers.copy()
-         merged_headers.update(settings.openai.headers or {})
-         if headers:
-             merged_headers.update(headers)
-
-         # Remove headers with None values for backward compatibility and convenience
-         self.headers = {k: v for k, v in merged_headers.items() if v is not None}
-
-         self.timeout = timeout if timeout is not None else settings.request_timeout
-         self.http2 = http2 if http2 is not None else settings.request_http2
-         self.follow_redirects = (
-             follow_redirects
-             if follow_redirects is not None
-             else settings.request_follow_redirects
-         )
-         self.verify = verify if verify is not None else settings.openai.verify
-         self.max_output_tokens = (
-             max_output_tokens
-             if max_output_tokens is not None
-             else settings.openai.max_output_tokens
-         )
-         self.extra_query = extra_query
-         self.extra_body = extra_body
-         self.remove_from_body = remove_from_body
-         self._async_client: Optional[httpx.AsyncClient] = None
-
-     @property
-     def target(self) -> str:
-         """
-         :return: The target URL string for the OpenAI server.
-         """
-         return self._target
-
-     @property
-     def model(self) -> Optional[str]:
-         """
-         :return: The model to use for all requests on the target server.
-             If validate hasn't been called yet and no model was passed in,
-             this will be None until validate is called to set the default.
-         """
-         return self._model
-
-     @property
-     def info(self) -> dict[str, Any]:
-         """
-         :return: The information about the backend.
-         """
-         return {
-             "max_output_tokens": self.max_output_tokens,
-             "timeout": self.timeout,
-             "http2": self.http2,
-             "follow_redirects": self.follow_redirects,
-             "headers": self.headers,
-             "text_completions_path": TEXT_COMPLETIONS_PATH,
-             "chat_completions_path": CHAT_COMPLETIONS_PATH,
-         }
-
-     async def reset(self) -> None:
-         """
-         Reset the connection object. This is useful for backends that
-         reuse connections or have state that needs to be cleared.
-         For this backend, it closes the async client if it exists.
-         """
-         if self._async_client is not None:
-             await self._async_client.aclose()
-
-     async def check_setup(self):
-         """
-         Check if the backend is set up correctly and can be used for requests.
-         Specifically, if a model is not provided, it grabs the first available model.
-         If no models are available, raises a ValueError.
-         If a model is provided and not available, raises a ValueError.
-
-         :raises ValueError: If no models or the provided model is not available.
-         """
-         models = await self.available_models()
-         if not models:
-             raise ValueError(f"No models available for target: {self.target}")
-
-         if not self.model:
-             self._model = models[0]
-         elif self.model not in models:
-             raise ValueError(
-                 f"Model {self.model} not found in available models: "
-                 f"{models} for target: {self.target}"
-             )
-
-     async def prepare_multiprocessing(self):
-         """
-         Prepare the backend for use in a multiprocessing environment.
-         Clears out the sync and async clients to ensure they are re-initialized
-         for each process.
-         """
-         if self._async_client is not None:
-             await self._async_client.aclose()
-             self._async_client = None
-
-     async def available_models(self) -> list[str]:
-         """
-         Get the available models for the target server using the OpenAI models
-         endpoint: /v1/models
-         """
-         target = f"{self.target}/v1/models"
-         headers = self._headers()
-         params = self._params(MODELS)
-         response = await self._get_async_client().get(
-             target, headers=headers, params=params
-         )
-         response.raise_for_status()
-
-         models = []
-
-         for item in response.json()["data"]:
-             models.append(item["id"])
-
-         return models
-
-     async def text_completions(  # type: ignore[override]
-         self,
-         prompt: Union[str, list[str]],
-         request_id: Optional[str] = None,
-         prompt_token_count: Optional[int] = None,
-         output_token_count: Optional[int] = None,
-         **kwargs,
-     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
-         """
-         Generate text completions for the given prompt using the OpenAI
-         completions endpoint: /v1/completions.
-
-         :param prompt: The prompt to generate a completion for.
-             If a list is supplied, a ValueError is raised; batching is not
-             currently supported for this endpoint.
-         :param request_id: The unique identifier for the request, if any.
-             Added to logging statements and the response for tracking purposes.
-         :param prompt_token_count: The number of tokens measured in the prompt, if any.
-             Returned in the response stats for later analysis, if applicable.
-         :param output_token_count: If supplied, the number of tokens to enforce
-             generation of for the output for this request.
-         :param kwargs: Additional keyword arguments to pass with the request.
-         :return: An async generator that yields a StreamingTextResponse for start,
-             a StreamingTextResponse for each received iteration,
-             and a ResponseSummary for the final response.
-         """
-         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
-
-         if isinstance(prompt, list):
-             raise ValueError(
-                 "List prompts (batching) is currently not supported for "
-                 f"text_completions OpenAI pathways. Received: {prompt}"
-             )
-
-         headers = self._headers()
-         params = self._params(TEXT_COMPLETIONS)
-         payload = self._completions_payload(
-             endpoint_type=TEXT_COMPLETIONS,
-             orig_kwargs=kwargs,
-             max_output_tokens=output_token_count,
-             prompt=prompt,
-         )
-
-         try:
-             async for resp in self._iterative_completions_request(
-                 type_="text_completions",
-                 request_id=request_id,
-                 request_prompt_tokens=prompt_token_count,
-                 request_output_tokens=output_token_count,
-                 headers=headers,
-                 params=params,
-                 payload=payload,
-             ):
-                 yield resp
-         except Exception as ex:
-             logger.error(
-                 "{} request with headers: {} and params: {} and payload: {} failed: {}",
-                 self.__class__.__name__,
-                 headers,
-                 params,
-                 payload,
-                 ex,
-             )
-             raise ex
-
-     async def chat_completions(  # type: ignore[override]
-         self,
-         content: Union[
-             str,
-             list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
-             Any,
-         ],
-         request_id: Optional[str] = None,
-         prompt_token_count: Optional[int] = None,
-         output_token_count: Optional[int] = None,
-         raw_content: bool = False,
-         **kwargs,
-     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
-         """
-         Generate chat completions for the given content using the OpenAI
-         chat completions endpoint: /v1/chat/completions.
-
-         :param content: The content (or list of content) to generate a completion for.
-             This supports any combination of text, images, and audio (model dependent).
-             Supported text-only request examples:
-             content="Sample prompt", content=["Sample prompt", "Second prompt"],
-             content=[{"type": "text", "text": "Sample prompt"}].
-             Supported text and image request examples:
-             content=["Describe the image", PIL.Image.open("image.jpg")],
-             content=["Describe the image", Path("image.jpg")],
-             content=["Describe the image", {"type": "image_url",
-             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}].
-             Supported text and audio request examples:
-             content=["Transcribe the audio", Path("audio.wav")],
-             content=["Transcribe the audio", {"type": "input_audio",
-             "input_audio": {"data": f"{base64_bytes}", "format": "wav"}}].
-             Additionally, if raw_content=True then the content is passed directly to
-             the backend without any processing.
-         :param request_id: The unique identifier for the request, if any.
-             Added to logging statements and the response for tracking purposes.
-         :param prompt_token_count: The number of tokens measured in the prompt, if any.
-             Returned in the response stats for later analysis, if applicable.
-         :param output_token_count: If supplied, the number of tokens to enforce
-             generation of for the output for this request.
-         :param kwargs: Additional keyword arguments to pass with the request.
-         :return: An async generator that yields a StreamingTextResponse for start,
-             a StreamingTextResponse for each received iteration,
-             and a ResponseSummary for the final response.
-         """
-         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
-         headers = self._headers()
-         params = self._params(CHAT_COMPLETIONS)
-         messages = (
-             content if raw_content else self._create_chat_messages(content=content)
-         )
-         payload = self._completions_payload(
-             endpoint_type=CHAT_COMPLETIONS,
-             orig_kwargs=kwargs,
-             max_output_tokens=output_token_count,
-             messages=messages,
-         )
-
-         try:
-             async for resp in self._iterative_completions_request(
-                 type_="chat_completions",
-                 request_id=request_id,
-                 request_prompt_tokens=prompt_token_count,
-                 request_output_tokens=output_token_count,
-                 headers=headers,
-                 params=params,
-                 payload=payload,
-             ):
-                 yield resp
-         except Exception as ex:
-             logger.error(
-                 "{} request with headers: {} and params: {} and payload: {} failed: {}",
-                 self.__class__.__name__,
-                 headers,
-                 params,
-                 payload,
-                 ex,
-             )
-             raise ex
-
-     def _get_async_client(self) -> httpx.AsyncClient:
-         """
-         Get the async HTTP client for making requests.
-         If the client has not been created yet, it will create one.
-
-         :return: The async HTTP client.
-         """
-         if self._async_client is None or self._async_client.is_closed:
-             client = httpx.AsyncClient(
-                 http2=self.http2,
-                 timeout=self.timeout,
-                 follow_redirects=self.follow_redirects,
-                 verify=self.verify,
-             )
-             self._async_client = client
-         else:
-             client = self._async_client
-
-         return client
-
-     def _headers(self) -> dict[str, str]:
-         headers = {
-             "Content-Type": "application/json",
-         }
-         headers.update(self.headers)
-         return headers
-
-     def _params(self, endpoint_type: EndpointType) -> dict[str, str]:
-         if self.extra_query is None:
-             return {}
-
-         if (
-             CHAT_COMPLETIONS in self.extra_query
-             or MODELS in self.extra_query
-             or TEXT_COMPLETIONS in self.extra_query
-         ):
-             return self.extra_query.get(endpoint_type, {})
-
-         return self.extra_query
-
-     def _extra_body(self, endpoint_type: EndpointType) -> dict[str, Any]:
-         if self.extra_body is None:
-             return {}
-
-         if (
-             CHAT_COMPLETIONS in self.extra_body
-             or MODELS in self.extra_body
-             or TEXT_COMPLETIONS in self.extra_body
-         ):
-             return copy.deepcopy(self.extra_body.get(endpoint_type, {}))
-
-         return copy.deepcopy(self.extra_body)
-
-     def _completions_payload(
-         self,
-         endpoint_type: CompletionEndpointType,
-         orig_kwargs: Optional[dict],
-         max_output_tokens: Optional[int],
-         **kwargs,
-     ) -> dict:
-         payload = self._extra_body(endpoint_type)
-         payload.update(orig_kwargs or {})
-         payload.update(kwargs)
-         payload["model"] = self.model
-         payload["stream"] = True
-         payload["stream_options"] = {
-             "include_usage": True,
-         }
-
-         if max_output_tokens or self.max_output_tokens:
-             logger.debug(
-                 "{} adding payload args for setting output_token_count: {}",
-                 self.__class__.__name__,
-                 max_output_tokens or self.max_output_tokens,
-             )
-             max_output_key = settings.openai.max_output_key.get(
-                 endpoint_type, "max_tokens"
-             )
-             payload[max_output_key] = max_output_tokens or self.max_output_tokens
-
-             if max_output_tokens:
-                 # only set stop and ignore_eos if max_output_tokens set at request level
-                 # otherwise the instance value is just the max to enforce we stay below
-                 payload["stop"] = None
-                 payload["ignore_eos"] = True
-
-         if self.remove_from_body:
-             for key in self.remove_from_body:
-                 payload.pop(key, None)
-
-         return payload
-
-     @staticmethod
-     def _create_chat_messages(
-         content: Union[
-             str,
-             list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
-             Any,
-         ],
-     ) -> list[dict]:
-         if isinstance(content, str):
-             return [
-                 {
-                     "role": "user",
-                     "content": content,
-                 }
-             ]
-
-         if isinstance(content, list):
-             resolved_content = []
-
-             for item in content:
-                 if isinstance(item, dict):
-                     resolved_content.append(item)
-                 elif isinstance(item, str):
-                     resolved_content.append({"type": "text", "text": item})
-                 elif isinstance(item, Image.Image) or (
-                     isinstance(item, Path) and item.suffix.lower() in [".jpg", ".jpeg"]
-                 ):
-                     image = item if isinstance(item, Image.Image) else Image.open(item)
-                     encoded = base64.b64encode(image.tobytes()).decode("utf-8")
-                     resolved_content.append(
-                         {
-                             "type": "image",
-                             "image": {
-                                 "url": f"data:image/jpeg;base64,{encoded}",
-                             },
-                         }
-                     )
-                 elif isinstance(item, Path) and item.suffix.lower() in [".wav"]:
-                     encoded = base64.b64encode(item.read_bytes()).decode("utf-8")
-                     resolved_content.append(
-                         {
-                             "type": "input_audio",
-                             "input_audio": {
-                                 "data": f"{encoded}",
-                                 "format": "wav",
-                             },
-                         }
-                     )
-                 else:
-                     raise ValueError(
-                         f"Unsupported content item type: {item} in list: {content}"
-                     )
-
-             return [
-                 {
-                     "role": "user",
-                     "content": resolved_content,
-                 }
-             ]
-
-         raise ValueError(f"Unsupported content type: {content}")
-
-     async def _iterative_completions_request(
-         self,
-         type_: Literal["text_completions", "chat_completions"],
-         request_id: Optional[str],
-         request_prompt_tokens: Optional[int],
-         request_output_tokens: Optional[int],
-         headers: dict[str, str],
-         params: dict[str, str],
-         payload: dict[str, Any],
-     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
-         if type_ == "text_completions":
-             target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
-         elif type_ == "chat_completions":
-             target = f"{self.target}{CHAT_COMPLETIONS_PATH}"
-         else:
-             raise ValueError(f"Unsupported type: {type_}")
-
-         logger.info(
-             "{} making request: {} to target: {} using http2: {} following "
-             "redirects: {} for timeout: {} with headers: {} and params: {} and "
-             "payload: {}",
-             self.__class__.__name__,
-             request_id,
-             target,
-             self.http2,
-             self.follow_redirects,
-             self.timeout,
-             headers,
-             params,
-             payload,
-         )
-
-         response_value = ""
-         response_prompt_count: Optional[int] = None
-         response_output_count: Optional[int] = None
-         iter_count = 0
-         start_time = time.time()
-         iter_time = start_time
-         first_iter_time: Optional[float] = None
-         last_iter_time: Optional[float] = None
-
-         yield StreamingTextResponse(
-             type_="start",
-             value="",
-             start_time=start_time,
-             first_iter_time=None,
-             iter_count=iter_count,
-             delta="",
-             time=start_time,
-             request_id=request_id,
-         )
-
-         # reset start time after yielding start response to ensure accurate timing
-         start_time = time.time()
-
-         async with self._get_async_client().stream(
-             "POST", target, headers=headers, params=params, json=payload
-         ) as stream:
-             stream.raise_for_status()
-
-             async for line in stream.aiter_lines():
-                 iter_time = time.time()
-                 logger.debug(
-                     "{} request: {} received iter response line: {}",
-                     self.__class__.__name__,
-                     request_id,
-                     line,
-                 )
-
-                 if not line or not line.strip().startswith("data:"):
-                     continue
-
-                 if line.strip() == "data: [DONE]":
-                     break
-
-                 data = json.loads(line.strip()[len("data: ") :])
-                 if delta := self._extract_completions_delta_content(type_, data):
-                     if first_iter_time is None:
-                         first_iter_time = iter_time
-                     last_iter_time = iter_time
-
-                     iter_count += 1
-                     response_value += delta
-
-                     yield StreamingTextResponse(
-                         type_="iter",
-                         value=response_value,
-                         iter_count=iter_count,
-                         start_time=start_time,
-                         first_iter_time=first_iter_time,
-                         delta=delta,
-                         time=iter_time,
-                         request_id=request_id,
-                     )
-
-                 if usage := self._extract_completions_usage(data):
-                     response_prompt_count = usage["prompt"]
-                     response_output_count = usage["output"]
-
-         logger.info(
-             "{} request: {} with headers: {} and params: {} and payload: {} "
-             "completed with: {}",
-             self.__class__.__name__,
-             request_id,
-             headers,
-             params,
-             payload,
-             response_value,
-         )
-
-         yield ResponseSummary(
-             value=response_value,
-             request_args=RequestArgs(
-                 target=target,
-                 headers=headers,
-                 params=params,
-                 payload=payload,
-                 timeout=self.timeout,
-                 http2=self.http2,
-                 follow_redirects=self.follow_redirects,
-             ),
-             start_time=start_time,
-             end_time=iter_time,
-             first_iter_time=first_iter_time,
-             last_iter_time=last_iter_time,
-             iterations=iter_count,
-             request_prompt_tokens=request_prompt_tokens,
-             request_output_tokens=request_output_tokens,
-             response_prompt_tokens=response_prompt_count,
-             response_output_tokens=response_output_count,
-             request_id=request_id,
-         )
-
-     @staticmethod
-     def _extract_completions_delta_content(
-         type_: Literal["text_completions", "chat_completions"], data: dict
-     ) -> Optional[str]:
-         if "choices" not in data or not data["choices"]:
-             return None
-
-         if type_ == "text_completions":
-             return data["choices"][0]["text"]
-
-         if type_ == "chat_completions":
-             return data.get("choices", [{}])[0].get("delta", {}).get("content")
-
-         raise ValueError(f"Unsupported type: {type_}")
-
-     @staticmethod
-     def _extract_completions_usage(
-         data: dict,
-     ) -> Optional[dict[Literal["prompt", "output"], int]]:
-         if "usage" not in data or not data["usage"]:
-             return None
-
-         return {
-             "prompt": data["usage"]["prompt_tokens"],
-             "output": data["usage"]["completion_tokens"],
-         }
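
For reference, the deleted backend above was driven as an async generator: one "start" StreamingTextResponse, one "iter" response per received delta, then a final ResponseSummary. A minimal sketch, assuming only the signatures shown in the removed sources; the target URL, request id, and token count are placeholders:

    import asyncio

    from guidellm.backend.openai import OpenAIHTTPBackend  # 0.3.1 module shown above


    async def main() -> None:
        backend = OpenAIHTTPBackend(target="http://0.0.0.0:8000")
        await backend.check_setup()  # resolves the default model if none was given

        async for response in backend.text_completions(
            prompt="Sample prompt",
            request_id="example-request",  # hypothetical id, used for logging only
            output_token_count=64,  # adds the max-tokens key and ignore_eos payload args
        ):
            # Yields a "start" StreamingTextResponse, "iter" responses per delta,
            # and a final ResponseSummary with timing and token counts.
            print(type(response).__name__)

        await backend.reset()  # closes the cached httpx.AsyncClient


    asyncio.run(main())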
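
Two behaviors of the removed helpers are easy to miss: _extra_body() (and _params()) scope their values per endpoint only when one of the endpoint-type keys ("text_completions", "chat_completions", "models") is present, and _create_chat_messages() accepts chat content mixing strings, pre-built message-part dicts, JPEG paths or PIL images, and .wav paths. A sketch under those rules, with placeholder values:

    from pathlib import Path

    from guidellm.backend.openai import OpenAIHTTPBackend

    backend = OpenAIHTTPBackend(
        target="http://0.0.0.0:8000",
        # Endpoint-scoped: because an endpoint-type key is present, only
        # chat_completions requests receive the nested parameters.
        extra_body={"chat_completions": {"temperature": 0.0}},
    )

    # Content forms accepted by chat_completions() via _create_chat_messages():
    text_only = "Sample prompt"
    mixed = [
        "Transcribe the audio",
        Path("audio.wav"),  # read, base64-encoded, and sent as input_audio
    ]

By contrast, extra_body={"temperature": 0.0} (no endpoint-type key) would be merged into every request payload.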