guidellm 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (69)
  1. guidellm/__init__.py +38 -6
  2. guidellm/__main__.py +294 -0
  3. guidellm/backend/__init__.py +19 -6
  4. guidellm/backend/backend.py +238 -0
  5. guidellm/backend/openai.py +532 -122
  6. guidellm/backend/response.py +132 -0
  7. guidellm/benchmark/__init__.py +73 -0
  8. guidellm/benchmark/aggregator.py +760 -0
  9. guidellm/benchmark/benchmark.py +838 -0
  10. guidellm/benchmark/benchmarker.py +334 -0
  11. guidellm/benchmark/entrypoints.py +141 -0
  12. guidellm/benchmark/output.py +946 -0
  13. guidellm/benchmark/profile.py +409 -0
  14. guidellm/benchmark/progress.py +720 -0
  15. guidellm/config.py +34 -56
  16. guidellm/data/__init__.py +4 -0
  17. guidellm/data/prideandprejudice.txt.gz +0 -0
  18. guidellm/dataset/__init__.py +22 -0
  19. guidellm/dataset/creator.py +213 -0
  20. guidellm/dataset/entrypoints.py +42 -0
  21. guidellm/dataset/file.py +90 -0
  22. guidellm/dataset/hf_datasets.py +62 -0
  23. guidellm/dataset/in_memory.py +132 -0
  24. guidellm/dataset/synthetic.py +262 -0
  25. guidellm/objects/__init__.py +18 -0
  26. guidellm/objects/pydantic.py +60 -0
  27. guidellm/objects/statistics.py +947 -0
  28. guidellm/request/__init__.py +12 -10
  29. guidellm/request/loader.py +281 -0
  30. guidellm/request/request.py +79 -0
  31. guidellm/scheduler/__init__.py +51 -3
  32. guidellm/scheduler/result.py +137 -0
  33. guidellm/scheduler/scheduler.py +382 -0
  34. guidellm/scheduler/strategy.py +493 -0
  35. guidellm/scheduler/types.py +7 -0
  36. guidellm/scheduler/worker.py +511 -0
  37. guidellm/utils/__init__.py +16 -29
  38. guidellm/utils/colors.py +8 -0
  39. guidellm/utils/hf_transformers.py +35 -0
  40. guidellm/utils/random.py +43 -0
  41. guidellm/utils/text.py +118 -357
  42. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/METADATA +96 -79
  43. guidellm-0.2.0.dist-info/RECORD +48 -0
  44. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/WHEEL +1 -1
  45. guidellm-0.2.0.dist-info/entry_points.txt +2 -0
  46. guidellm/backend/base.py +0 -320
  47. guidellm/core/__init__.py +0 -24
  48. guidellm/core/distribution.py +0 -190
  49. guidellm/core/report.py +0 -321
  50. guidellm/core/request.py +0 -44
  51. guidellm/core/result.py +0 -545
  52. guidellm/core/serializable.py +0 -169
  53. guidellm/executor/__init__.py +0 -10
  54. guidellm/executor/base.py +0 -213
  55. guidellm/executor/profile_generator.py +0 -343
  56. guidellm/main.py +0 -336
  57. guidellm/request/base.py +0 -194
  58. guidellm/request/emulated.py +0 -391
  59. guidellm/request/file.py +0 -76
  60. guidellm/request/transformers.py +0 -100
  61. guidellm/scheduler/base.py +0 -374
  62. guidellm/scheduler/load_generator.py +0 -196
  63. guidellm/utils/injector.py +0 -70
  64. guidellm/utils/progress.py +0 -196
  65. guidellm/utils/transformers.py +0 -151
  66. guidellm-0.1.0.dist-info/RECORD +0 -35
  67. guidellm-0.1.0.dist-info/entry_points.txt +0 -3
  68. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info/licenses}/LICENSE +0 -0
  69. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/top_level.txt +0 -0
guidellm/backend/openai.py
@@ -1,168 +1,578 @@
- from typing import AsyncGenerator, Dict, List, Optional
+ import base64
+ import json
+ import time
+ from collections.abc import AsyncGenerator
+ from pathlib import Path
+ from typing import Any, Literal, Optional, Union

+ import httpx
  from loguru import logger
- from openai import AsyncOpenAI, OpenAI
+ from PIL import Image

- from guidellm.backend.base import Backend, GenerativeResponse
+ from guidellm.backend.backend import Backend
+ from guidellm.backend.response import (
+     RequestArgs,
+     ResponseSummary,
+     StreamingTextResponse,
+ )
  from guidellm.config import settings
- from guidellm.core import TextGenerationRequest

- __all__ = ["OpenAIBackend"]
+ __all__ = ["OpenAIHTTPBackend", "TEXT_COMPLETIONS_PATH", "CHAT_COMPLETIONS_PATH"]


- @Backend.register("openai_server")
- class OpenAIBackend(Backend):
+ TEXT_COMPLETIONS_PATH = "/v1/completions"
+ CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+
+
+ @Backend.register("openai_http")
+ class OpenAIHTTPBackend(Backend):
      """
-     An OpenAI backend implementation for generative AI results.
-
-     This class provides an interface to communicate with the
-     OpenAI server for generating responses based on given prompts.
-
-     :param openai_api_key: The API key for OpenAI.
-         If not provided, it will default to the key from settings.
-     :type openai_api_key: Optional[str]
-     :param target: The target URL string for the OpenAI server.
-     :type target: Optional[str]
-     :param model: The OpenAI model to use, defaults to the first available model.
-     :type model: Optional[str]
-     :param request_args: Additional arguments for the OpenAI request.
-     :type request_args: Dict[str, Any]
+     A HTTP-based backend implementation for requests to an OpenAI compatible server.
+     For example, a vLLM server instance or requests to OpenAI's API.
+
+     :param target: The target URL string for the OpenAI server. ex: http://0.0.0.0:8000
+     :param model: The model to use for all requests on the target server.
+         If none is provided, the first available model will be used.
+     :param api_key: The API key to use for requests to the OpenAI server.
+         If provided, adds an Authorization header with the value
+         "Authorization: Bearer {api_key}".
+         If not provided, no Authorization header is added.
+     :param organization: The organization to use for requests to the OpenAI server.
+         For example, if set to "org_123", adds an OpenAI-Organization header with the
+         value "OpenAI-Organization: org_123".
+         If not provided, no OpenAI-Organization header is added.
+     :param project: The project to use for requests to the OpenAI server.
+         For example, if set to "project_123", adds an OpenAI-Project header with the
+         value "OpenAI-Project: project_123".
+         If not provided, no OpenAI-Project header is added.
+     :param timeout: The timeout to use for requests to the OpenAI server.
+         If not provided, the default timeout provided from settings is used.
+     :param http2: If True, uses HTTP/2 for requests to the OpenAI server.
+         Defaults to True.
+     :param max_output_tokens: The maximum number of tokens to request for completions.
+         If not provided, the default maximum tokens provided from settings is used.
      """

      def __init__(
          self,
-         openai_api_key: Optional[str] = None,
          target: Optional[str] = None,
          model: Optional[str] = None,
-         **request_args,
+         api_key: Optional[str] = None,
+         organization: Optional[str] = None,
+         project: Optional[str] = None,
+         timeout: Optional[float] = None,
+         http2: Optional[bool] = True,
+         max_output_tokens: Optional[int] = None,
      ):
-         self._request_args: Dict = request_args
-         api_key: str = openai_api_key or settings.openai.api_key
-
-         if not api_key:
-             err = ValueError(
-                 "`GUIDELLM__OPENAI__API_KEY` environment variable or "
-                 "--openai-api-key CLI parameter must be specified for the "
-                 "OpenAI backend."
-             )
-             logger.error("{}", err)
-             raise err
+         super().__init__(type_="openai_http")
+         self._target = target or settings.openai.base_url
+
+         if not self._target:
+             raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
+
+         if self._target.endswith("/v1") or self._target.endswith("/v1/"):
+             # backwards compatability, strip v1 off
+             self._target = self._target[:-3]
+
+         if self._target.endswith("/"):
+             self._target = self._target[:-1]
+
+         self._model = model
+
+         api_key = api_key or settings.openai.api_key
+         self.authorization = (
+             f"Bearer {api_key}" if api_key else settings.openai.bearer_token
+         )
+
+         self.organization = organization or settings.openai.organization
+         self.project = project or settings.openai.project
+         self.timeout = timeout if timeout is not None else settings.request_timeout
+         self.http2 = http2 if http2 is not None else settings.request_http2
+         self.max_output_tokens = (
+             max_output_tokens
+             if max_output_tokens is not None
+             else settings.openai.max_output_tokens
+         )
+         self._async_client: Optional[httpx.AsyncClient] = None
+
+     @property
+     def target(self) -> str:
+         """
+         :return: The target URL string for the OpenAI server.
+         """
+         return self._target
+
+     @property
+     def model(self) -> Optional[str]:
+         """
+         :return: The model to use for all requests on the target server.
+             If validate hasn't been called yet and no model was passed in,
+             this will be None until validate is called to set the default.
+         """
+         return self._model
+
+     @property
+     def info(self) -> dict[str, Any]:
+         """
+         :return: The information about the backend.
+         """
+         return {
+             "max_output_tokens": self.max_output_tokens,
+             "timeout": self.timeout,
+             "http2": self.http2,
+             "authorization": bool(self.authorization),
+             "organization": self.organization,
+             "project": self.project,
+             "text_completions_path": TEXT_COMPLETIONS_PATH,
+             "chat_completions_path": CHAT_COMPLETIONS_PATH,
+         }

-         base_url = target or settings.openai.base_url
+     async def check_setup(self):
+         """
+         Check if the backend is setup correctly and can be used for requests.
+         Specifically, if a model is not provided, it grabs the first available model.
+         If no models are available, raises a ValueError.
+         If a model is provided and not available, raises a ValueError.
+
+         :raises ValueError: If no models or the provided model is not available.
+         """
+         models = await self.available_models()
+         if not models:
+             raise ValueError(f"No models available for target: {self.target}")

-         if not base_url:
-             err = ValueError(
-                 "`GUIDELLM__OPENAI__BASE_URL` environment variable or "
-                 "target parameter must be specified for the OpenAI backend."
+         if not self.model:
+             self._model = models[0]
+         elif self.model not in models:
+             raise ValueError(
+                 f"Model {self.model} not found in available models:"
+                 "{models} for target: {self.target}"
              )
-             logger.error("{}", err)
-             raise err

-         self._async_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-         self._client = OpenAI(api_key=api_key, base_url=base_url)
-         self._model = model or self.default_model
+     async def prepare_multiprocessing(self):
+         """
+         Prepare the backend for use in a multiprocessing environment.
+         Clears out the sync and async clients to ensure they are re-initialized
+         for each process.
+         """
+         if self._async_client is not None:
+             await self._async_client.aclose()
+             self._async_client = None
+
+     async def available_models(self) -> list[str]:
+         """
+         Get the available models for the target server using the OpenAI models endpoint:
+         /v1/models
+         """
+         target = f"{self.target}/v1/models"
+         headers = self._headers()
+         response = await self._get_async_client().get(target, headers=headers)
+         response.raise_for_status()
+
+         models = []
+
+         for item in response.json()["data"]:
+             models.append(item["id"])
+
+         return models

-         super().__init__(type_="openai_server", target=base_url, model=self._model)
-         logger.info("OpenAI {} Backend listening on {}", self._model, base_url)
+     async def text_completions( # type: ignore[override]
+         self,
+         prompt: Union[str, list[str]],
+         request_id: Optional[str] = None,
+         prompt_token_count: Optional[int] = None,
+         output_token_count: Optional[int] = None,
+         **kwargs,
+     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
+         """
+         Generate text completions for the given prompt using the OpenAI
+         completions endpoint: /v1/completions.
+
+         :param prompt: The prompt (or list of prompts) to generate a completion for.
+             If a list is supplied, these are concatenated and run through the model
+             for a single prompt.
+         :param request_id: The unique identifier for the request, if any.
+             Added to logging statements and the response for tracking purposes.
+         :param prompt_token_count: The number of tokens measured in the prompt, if any.
+             Returned in the response stats for later analysis, if applicable.
+         :param output_token_count: If supplied, the number of tokens to enforce
+             generation of for the output for this request.
+         :param kwargs: Additional keyword arguments to pass with the request.
+         :return: An async generator that yields a StreamingTextResponse for start,
+             a StreamingTextResponse for each received iteration,
+             and a ResponseSummary for the final response.
+         """
+         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
+         headers = self._headers()
+         payload = self._completions_payload(
+             orig_kwargs=kwargs,
+             max_output_tokens=output_token_count,
+             prompt=prompt,
+         )

-     async def make_request(
+         try:
+             async for resp in self._iterative_completions_request(
+                 type_="text_completions",
+                 request_id=request_id,
+                 request_prompt_tokens=prompt_token_count,
+                 request_output_tokens=output_token_count,
+                 headers=headers,
+                 payload=payload,
+             ):
+                 yield resp
+         except Exception as ex:
+             logger.error(
+                 "{} request with headers: {} and payload: {} failed: {}",
+                 self.__class__.__name__,
+                 headers,
+                 payload,
+                 ex,
+             )
+             raise ex
+
+     async def chat_completions( # type: ignore[override]
          self,
-         request: TextGenerationRequest,
-     ) -> AsyncGenerator[GenerativeResponse, None]:
+         content: Union[
+             str,
+             list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
+             Any,
+         ],
+         request_id: Optional[str] = None,
+         prompt_token_count: Optional[int] = None,
+         output_token_count: Optional[int] = None,
+         raw_content: bool = False,
+         **kwargs,
+     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
          """
-         Make a request to the OpenAI backend.
+         Generate chat completions for the given content using the OpenAI
+         chat completions endpoint: /v1/chat/completions.
+
+         :param content: The content (or list of content) to generate a completion for.
+             This supports any combination of text, images, and audio (model dependent).
+             Supported text only request examples:
+             content="Sample prompt", content=["Sample prompt", "Second prompt"],
+             content=[{"type": "text", "value": "Sample prompt"}.
+             Supported text and image request examples:
+             content=["Describe the image", PIL.Image.open("image.jpg")],
+             content=["Describe the image", Path("image.jpg")],
+             content=["Describe the image", {"type": "image_url",
+             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}].
+             Supported text and audio request examples:
+             content=["Transcribe the audio", Path("audio.wav")],
+             content=["Transcribe the audio", {"type": "input_audio",
+             "input_audio": {"data": f"{base64_bytes}", "format": "wav}].
+             Additionally, if raw_content=True then the content is passed directly to the
+             backend without any processing.
+         :param request_id: The unique identifier for the request, if any.
+             Added to logging statements and the response for tracking purposes.
+         :param prompt_token_count: The number of tokens measured in the prompt, if any.
+             Returned in the response stats for later analysis, if applicable.
+         :param output_token_count: If supplied, the number of tokens to enforce
+             generation of for the output for this request.
+         :param kwargs: Additional keyword arguments to pass with the request.
+         :return: An async generator that yields a StreamingTextResponse for start,
+             a StreamingTextResponse for each received iteration,
+             and a ResponseSummary for the final response.
+         """
+         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
+         headers = self._headers()
+         messages = (
+             content if raw_content else self._create_chat_messages(content=content)
+         )
+         payload = self._completions_payload(
+             orig_kwargs=kwargs,
+             max_output_tokens=output_token_count,
+             messages=messages,
+         )
+
+         try:
+             async for resp in self._iterative_completions_request(
+                 type_="chat_completions",
+                 request_id=request_id,
+                 request_prompt_tokens=prompt_token_count,
+                 request_output_tokens=output_token_count,
+                 headers=headers,
+                 payload=payload,
+             ):
+                 yield resp
+         except Exception as ex:
+             logger.error(
+                 "{} request with headers: {} and payload: {} failed: {}",
+                 self.__class__.__name__,
+                 headers,
+                 payload,
+                 ex,
+             )
+             raise ex

-         This method sends a prompt to the OpenAI backend and streams
-         the response tokens back.
+     def _get_async_client(self) -> httpx.AsyncClient:
+         """
+         Get the async HTTP client for making requests.
+         If the client has not been created yet, it will create one.

-         :param request: The text generation request to submit.
-         :type request: TextGenerationRequest
-         :yield: A stream of GenerativeResponse objects.
-         :rtype: AsyncGenerator[GenerativeResponse, None]
+         :return: The async HTTP client.
          """
+         if self._async_client is None:
+             client = httpx.AsyncClient(http2=self.http2, timeout=self.timeout)
+             self._async_client = client
+         else:
+             client = self._async_client
+
+         return client
+
+     def _headers(self) -> dict[str, str]:
+         headers = {
+             "Content-Type": "application/json",
+         }
+
+         if self.authorization:
+             headers["Authorization"] = self.authorization
+
+         if self.organization:
+             headers["OpenAI-Organization"] = self.organization
+
+         if self.project:
+             headers["OpenAI-Project"] = self.project

-         logger.debug("Making request to OpenAI backend with prompt: {}", request.prompt)
+         return headers

-         request_args: Dict = {
-             "n": 1, # Number of completions for each prompt
+     def _completions_payload(
+         self, orig_kwargs: Optional[dict], max_output_tokens: Optional[int], **kwargs
+     ) -> dict:
+         payload = orig_kwargs or {}
+         payload.update(kwargs)
+         payload["model"] = self.model
+         payload["stream"] = True
+         payload["stream_options"] = {
+             "include_usage": True,
          }

-         if request.output_token_count is not None:
-             request_args.update(
+         if max_output_tokens or self.max_output_tokens:
+             logger.debug(
+                 "{} adding payload args for setting output_token_count: {}",
+                 self.__class__.__name__,
+                 max_output_tokens or self.max_output_tokens,
+             )
+             payload["max_tokens"] = max_output_tokens or self.max_output_tokens
+             payload["max_completion_tokens"] = payload["max_tokens"]
+
+             if max_output_tokens:
+                 # only set stop and ignore_eos if max_output_tokens set at request level
+                 # otherwise the instance value is just the max to enforce we stay below
+                 payload["stop"] = None
+                 payload["ignore_eos"] = True
+
+         return payload
+
+     @staticmethod
+     def _create_chat_messages(
+         content: Union[
+             str,
+             list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
+             Any,
+         ],
+     ) -> list[dict]:
+         if isinstance(content, str):
+             return [
                  {
-                     "max_tokens": request.output_token_count,
-                     "stop": None,
+                     "role": "user",
+                     "content": content,
                  }
-             )
-         elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0:
-             request_args.update(
+             ]
+
+         if isinstance(content, list):
+             resolved_content = []
+
+             for item in content:
+                 if isinstance(item, dict):
+                     resolved_content.append(item)
+                 elif isinstance(item, str):
+                     resolved_content.append({"type": "text", "text": item})
+                 elif isinstance(item, Image.Image) or (
+                     isinstance(item, Path) and item.suffix.lower() in [".jpg", ".jpeg"]
+                 ):
+                     image = item if isinstance(item, Image.Image) else Image.open(item)
+                     encoded = base64.b64encode(image.tobytes()).decode("utf-8")
+                     resolved_content.append(
+                         {
+                             "type": "image",
+                             "image": {
+                                 "url": f"data:image/jpeg;base64,{encoded}",
+                             },
+                         }
+                     )
+                 elif isinstance(item, Path) and item.suffix.lower() in [".wav"]:
+                     encoded = base64.b64encode(item.read_bytes()).decode("utf-8")
+                     resolved_content.append(
+                         {
+                             "type": "input_audio",
+                             "input_audio": {
+                                 "data": f"{encoded}",
+                                 "format": "wav",
+                             },
+                         }
+                     )
+                 else:
+                     raise ValueError(
+                         f"Unsupported content item type: {item} in list: {content}"
+                     )
+
+             return [
                  {
-                     "max_tokens": settings.openai.max_gen_tokens,
+                     "role": "user",
+                     "content": resolved_content,
                  }
-             )
+             ]
+
+         raise ValueError(f"Unsupported content type: {content}")
+
+     async def _iterative_completions_request(
+         self,
+         type_: Literal["text_completions", "chat_completions"],
+         request_id: Optional[str],
+         request_prompt_tokens: Optional[int],
+         request_output_tokens: Optional[int],
+         headers: dict,
+         payload: dict,
+     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
+         if type_ == "text_completions":
+             target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
+         elif type_ == "chat_completions":
+             target = f"{self.target}{CHAT_COMPLETIONS_PATH}"
+         else:
+             raise ValueError(f"Unsupported type: {type_}")
+
+         logger.info(
+             "{} making request: {} to target: {} using http2: {} for "
+             "timeout: {} with headers: {} and payload: {}",
+             self.__class__.__name__,
+             request_id,
+             target,
+             self.http2,
+             self.timeout,
+             headers,
+             payload,
+         )

-         request_args.update(self._request_args)
+         response_value = ""
+         response_prompt_count: Optional[int] = None
+         response_output_count: Optional[int] = None
+         iter_count = 0
+         start_time = time.time()
+         iter_time = start_time
+         first_iter_time: Optional[float] = None
+         last_iter_time: Optional[float] = None

-         stream = await self._async_client.chat.completions.create(
-             model=self.model,
-             messages=[
-                 {"role": "system", "content": request.prompt},
-             ],
-             stream=True,
-             **request_args,
+         yield StreamingTextResponse(
+             type_="start",
+             value="",
+             start_time=start_time,
+             first_iter_time=None,
+             iter_count=iter_count,
+             delta="",
+             time=start_time,
+             request_id=request_id,
          )
-         token_count = 0
-         async for chunk in stream:
-             choice = chunk.choices[0]
-             token = choice.delta.content or ""
-
-             if choice.finish_reason is not None:
-                 yield GenerativeResponse(
-                     type_="final",
-                     prompt=request.prompt,
-                     prompt_token_count=request.prompt_token_count,
-                     output_token_count=token_count,
+
+         # reset start time after yielding start response to ensure accurate timing
+         start_time = time.time()
+
+         async with self._get_async_client().stream(
+             "POST", target, headers=headers, json=payload
+         ) as stream:
+             stream.raise_for_status()
+
+             async for line in stream.aiter_lines():
+                 iter_time = time.time()
+                 logger.debug(
+                     "{} request: {} recieved iter response line: {}",
+                     self.__class__.__name__,
+                     request_id,
+                     line,
                  )
-                 break
-
-             token_count += 1
-             yield GenerativeResponse(
-                 type_="token_iter",
-                 add_token=token,
-                 prompt=request.prompt,
-                 prompt_token_count=request.prompt_token_count,
-                 output_token_count=token_count,
-             )

-     def available_models(self) -> List[str]:
-         """
-         Get the available models for the backend.
+                 if not line or not line.strip().startswith("data:"):
+                     continue

-         This method queries the OpenAI API to retrieve a list of available models.
+                 if line.strip() == "data: [DONE]":
+                     break

-         :return: A list of available models.
-         :rtype: List[str]
-         :raises openai.OpenAIError: If an error occurs while retrieving models.
-         """
+                 data = json.loads(line.strip()[len("data: ") :])
+                 if delta := self._extract_completions_delta_content(type_, data):
+                     if first_iter_time is None:
+                         first_iter_time = iter_time
+                     last_iter_time = iter_time

-         try:
-             return [model.id for model in self._client.models.list().data]
-         except Exception as error:
-             logger.error("Failed to retrieve available models: {}", error)
-             raise error
+                     iter_count += 1
+                     response_value += delta

-     def validate_connection(self):
-         """
-         Validate the connection to the OpenAI backend.
+                     yield StreamingTextResponse(
+                         type_="iter",
+                         value=response_value,
+                         iter_count=iter_count,
+                         start_time=start_time,
+                         first_iter_time=first_iter_time,
+                         delta=delta,
+                         time=iter_time,
+                         request_id=request_id,
+                     )

-         This method checks that the OpenAI backend is reachable and
-         the API key is valid.
+                 if usage := self._extract_completions_usage(data):
+                     response_prompt_count = usage["prompt"]
+                     response_output_count = usage["output"]

-         :raises openai.OpenAIError: If the connection is invalid.
-         """
+         logger.info(
+             "{} request: {} with headers: {} and payload: {} completed with: {}",
+             self.__class__.__name__,
+             request_id,
+             headers,
+             payload,
+             response_value,
+         )

-         try:
-             self._client.models.list()
-         except Exception as error:
-             logger.error("Failed to validate OpenAI connection: {}", error)
-             raise error
+         yield ResponseSummary(
+             value=response_value,
+             request_args=RequestArgs(
+                 target=target,
+                 headers=headers,
+                 payload=payload,
+                 timeout=self.timeout,
+                 http2=self.http2,
+             ),
+             start_time=start_time,
+             end_time=iter_time,
+             first_iter_time=first_iter_time,
+             last_iter_time=last_iter_time,
+             iterations=iter_count,
+             request_prompt_tokens=request_prompt_tokens,
+             request_output_tokens=request_output_tokens,
+             response_prompt_tokens=response_prompt_count,
+             response_output_tokens=response_output_count,
+             request_id=request_id,
+         )
+
+     @staticmethod
+     def _extract_completions_delta_content(
+         type_: Literal["text_completions", "chat_completions"], data: dict
+     ) -> Optional[str]:
+         if "choices" not in data or not data["choices"]:
+             return None
+
+         if type_ == "text_completions":
+             return data["choices"][0]["text"]
+
+         if type_ == "chat_completions":
+             return data["choices"][0]["delta"]["content"]
+
+         raise ValueError(f"Unsupported type: {type_}")
+
+     @staticmethod
+     def _extract_completions_usage(
+         data: dict,
+     ) -> Optional[dict[Literal["prompt", "output"], int]]:
+         if "usage" not in data or not data["usage"]:
+             return None
+
+         return {
+             "prompt": data["usage"]["prompt_tokens"],
+             "output": data["usage"]["completion_tokens"],
+         }
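
For reference, a minimal usage sketch of the new OpenAIHTTPBackend introduced in this release, based only on the signatures visible in this diff; the target URL is a placeholder for an OpenAI-compatible server (e.g. vLLM) and the snippet has not been validated against the released wheel:

    import asyncio

    from guidellm.backend.openai import OpenAIHTTPBackend


    async def main() -> None:
        # Assumes an OpenAI-compatible server is listening at this address.
        backend = OpenAIHTTPBackend(target="http://localhost:8000")
        await backend.check_setup()  # resolves the default model if none was passed

        # text_completions yields a StreamingTextResponse for the start event and
        # each streamed delta, then a final ResponseSummary with token counts/timings.
        async for response in backend.text_completions(prompt="Hello, world"):
            print(type(response).__name__, getattr(response, "delta", ""))


    asyncio.run(main())

Per the chat_completions docstring above, the same streaming interface also accepts mixed text, image, and audio content (strings, PIL images, or Paths) via backend.chat_completions(content=...).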