guidellm 0.3.0rc20250429__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +8 -13
- guidellm/__main__.py +290 -69
- guidellm/backend/__init__.py +6 -6
- guidellm/backend/backend.py +25 -4
- guidellm/backend/openai.py +153 -30
- guidellm/backend/response.py +6 -2
- guidellm/benchmark/__init__.py +16 -22
- guidellm/benchmark/aggregator.py +3 -3
- guidellm/benchmark/benchmark.py +11 -12
- guidellm/benchmark/benchmarker.py +2 -2
- guidellm/benchmark/entrypoints.py +34 -10
- guidellm/benchmark/output.py +59 -8
- guidellm/benchmark/profile.py +4 -4
- guidellm/benchmark/progress.py +2 -2
- guidellm/benchmark/scenario.py +104 -0
- guidellm/benchmark/scenarios/__init__.py +0 -0
- guidellm/config.py +32 -7
- guidellm/dataset/__init__.py +4 -4
- guidellm/dataset/creator.py +1 -1
- guidellm/dataset/synthetic.py +36 -11
- guidellm/logger.py +8 -4
- guidellm/objects/__init__.py +2 -2
- guidellm/objects/pydantic.py +30 -1
- guidellm/objects/statistics.py +20 -14
- guidellm/preprocess/__init__.py +3 -0
- guidellm/preprocess/dataset.py +374 -0
- guidellm/presentation/__init__.py +28 -0
- guidellm/presentation/builder.py +27 -0
- guidellm/presentation/data_models.py +232 -0
- guidellm/presentation/injector.py +66 -0
- guidellm/request/__init__.py +6 -3
- guidellm/request/loader.py +5 -5
- guidellm/{scheduler → request}/types.py +4 -1
- guidellm/scheduler/__init__.py +10 -15
- guidellm/scheduler/queues.py +25 -0
- guidellm/scheduler/result.py +21 -3
- guidellm/scheduler/scheduler.py +68 -60
- guidellm/scheduler/strategy.py +26 -24
- guidellm/scheduler/worker.py +64 -103
- guidellm/utils/__init__.py +17 -5
- guidellm/utils/cli.py +62 -0
- guidellm/utils/default_group.py +105 -0
- guidellm/utils/dict.py +23 -0
- guidellm/utils/hf_datasets.py +36 -0
- guidellm/utils/random.py +1 -1
- guidellm/utils/text.py +14 -15
- guidellm/version.py +6 -0
- guidellm-0.3.1.dist-info/METADATA +329 -0
- guidellm-0.3.1.dist-info/RECORD +62 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.3.1.dist-info}/WHEEL +1 -1
- guidellm-0.3.0rc20250429.dist-info/METADATA +0 -453
- guidellm-0.3.0rc20250429.dist-info/RECORD +0 -48
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.3.1.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.3.1.dist-info}/top_level.txt +0 -0
guidellm/backend/openai.py
CHANGED
@@ -1,4 +1,5 @@
 import base64
+import copy
 import json
 import time
 from collections.abc import AsyncGenerator
@@ -17,12 +18,25 @@ from guidellm.backend.response import (
 )
 from guidellm.config import settings

-__all__ = [
+__all__ = [
+    "CHAT_COMPLETIONS",
+    "CHAT_COMPLETIONS_PATH",
+    "MODELS",
+    "TEXT_COMPLETIONS",
+    "TEXT_COMPLETIONS_PATH",
+    "OpenAIHTTPBackend",
+]


 TEXT_COMPLETIONS_PATH = "/v1/completions"
 CHAT_COMPLETIONS_PATH = "/v1/chat/completions"

+CompletionEndpointType = Literal["text_completions", "chat_completions"]
+EndpointType = Union[Literal["models"], CompletionEndpointType]
+CHAT_COMPLETIONS: CompletionEndpointType = "chat_completions"
+MODELS: EndpointType = "models"
+TEXT_COMPLETIONS: CompletionEndpointType = "text_completions"
+

 @Backend.register("openai_http")
 class OpenAIHTTPBackend(Backend):
@@ -49,8 +63,23 @@ class OpenAIHTTPBackend(Backend):
         If not provided, the default timeout provided from settings is used.
     :param http2: If True, uses HTTP/2 for requests to the OpenAI server.
         Defaults to True.
+    :param follow_redirects: If True, the HTTP client will follow redirect responses.
+        If not provided, the default value from settings is used.
     :param max_output_tokens: The maximum number of tokens to request for completions.
         If not provided, the default maximum tokens provided from settings is used.
+    :param extra_query: Query parameters to include in requests to the OpenAI server.
+        If "chat_completions", "models", or "text_completions" are included as keys,
+        the values of these keys will be used as the parameters for the respective
+        endpoint.
+        If not provided, no extra query parameters are added.
+    :param extra_body: Body parameters to include in requests to the OpenAI server.
+        If "chat_completions", "models", or "text_completions" are included as keys,
+        the values of these keys will be included in the body for the respective
+        endpoint.
+        If not provided, no extra body parameters are added.
+    :param remove_from_body: Parameters that should be removed from the body of each
+        request.
+        If not provided, no parameters are removed from the body.
     """

     def __init__(
@@ -62,7 +91,13 @@ class OpenAIHTTPBackend(Backend):
         project: Optional[str] = None,
         timeout: Optional[float] = None,
         http2: Optional[bool] = True,
+        follow_redirects: Optional[bool] = None,
         max_output_tokens: Optional[int] = None,
+        extra_query: Optional[dict] = None,
+        extra_body: Optional[dict] = None,
+        remove_from_body: Optional[list[str]] = None,
+        headers: Optional[dict] = None,
+        verify: Optional[bool] = None,
     ):
         super().__init__(type_="openai_http")
         self._target = target or settings.openai.base_url
@@ -79,20 +114,48 @@ class OpenAIHTTPBackend(Backend):

         self._model = model

+        # Start with default headers based on other params
+        default_headers: dict[str, str] = {}
         api_key = api_key or settings.openai.api_key
-
-
-
+        bearer_token = settings.openai.bearer_token
+        if api_key:
+            default_headers["Authorization"] = f"Bearer {api_key}"
+        elif bearer_token:
+            default_headers["Authorization"] = bearer_token

         self.organization = organization or settings.openai.organization
+        if self.organization:
+            default_headers["OpenAI-Organization"] = self.organization
+
         self.project = project or settings.openai.project
+        if self.project:
+            default_headers["OpenAI-Project"] = self.project
+
+        # User-provided headers from kwargs or settings override defaults
+        merged_headers = default_headers.copy()
+        merged_headers.update(settings.openai.headers or {})
+        if headers:
+            merged_headers.update(headers)
+
+        # Remove headers with None values for backward compatibility and convenience
+        self.headers = {k: v for k, v in merged_headers.items() if v is not None}
+
         self.timeout = timeout if timeout is not None else settings.request_timeout
         self.http2 = http2 if http2 is not None else settings.request_http2
+        self.follow_redirects = (
+            follow_redirects
+            if follow_redirects is not None
+            else settings.request_follow_redirects
+        )
+        self.verify = verify if verify is not None else settings.openai.verify
         self.max_output_tokens = (
             max_output_tokens
             if max_output_tokens is not None
             else settings.openai.max_output_tokens
         )
+        self.extra_query = extra_query
+        self.extra_body = extra_body
+        self.remove_from_body = remove_from_body
         self._async_client: Optional[httpx.AsyncClient] = None

     @property
@@ -120,13 +183,21 @@ class OpenAIHTTPBackend(Backend):
             "max_output_tokens": self.max_output_tokens,
             "timeout": self.timeout,
             "http2": self.http2,
-            "
-            "
-            "project": self.project,
+            "follow_redirects": self.follow_redirects,
+            "headers": self.headers,
             "text_completions_path": TEXT_COMPLETIONS_PATH,
             "chat_completions_path": CHAT_COMPLETIONS_PATH,
         }

+    async def reset(self) -> None:
+        """
+        Reset the connection object. This is useful for backends that
+        reuse connections or have state that needs to be cleared.
+        For this backend, it closes the async client if it exists.
+        """
+        if self._async_client is not None:
+            await self._async_client.aclose()
+
     async def check_setup(self):
         """
         Check if the backend is setup correctly and can be used for requests.
@@ -165,7 +236,10 @@ class OpenAIHTTPBackend(Backend):
         """
         target = f"{self.target}/v1/models"
         headers = self._headers()
-
+        params = self._params(MODELS)
+        response = await self._get_async_client().get(
+            target, headers=headers, params=params
+        )
         response.raise_for_status()

         models = []
@@ -210,7 +284,9 @@ class OpenAIHTTPBackend(Backend):
         )

         headers = self._headers()
+        params = self._params(TEXT_COMPLETIONS)
         payload = self._completions_payload(
+            endpoint_type=TEXT_COMPLETIONS,
             orig_kwargs=kwargs,
             max_output_tokens=output_token_count,
             prompt=prompt,
@@ -223,14 +299,16 @@ class OpenAIHTTPBackend(Backend):
                 request_prompt_tokens=prompt_token_count,
                 request_output_tokens=output_token_count,
                 headers=headers,
+                params=params,
                 payload=payload,
             ):
                 yield resp
         except Exception as ex:
             logger.error(
-                "{} request with headers: {} and payload: {} failed: {}",
+                "{} request with headers: {} and params: {} and payload: {} failed: {}",
                 self.__class__.__name__,
                 headers,
+                params,
                 payload,
                 ex,
             )
@@ -282,10 +360,12 @@ class OpenAIHTTPBackend(Backend):
         """
         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
         headers = self._headers()
+        params = self._params(CHAT_COMPLETIONS)
         messages = (
             content if raw_content else self._create_chat_messages(content=content)
         )
         payload = self._completions_payload(
+            endpoint_type=CHAT_COMPLETIONS,
             orig_kwargs=kwargs,
             max_output_tokens=output_token_count,
             messages=messages,
@@ -298,14 +378,16 @@ class OpenAIHTTPBackend(Backend):
                 request_prompt_tokens=prompt_token_count,
                 request_output_tokens=output_token_count,
                 headers=headers,
+                params=params,
                 payload=payload,
             ):
                 yield resp
         except Exception as ex:
             logger.error(
-                "{} request with headers: {} and payload: {} failed: {}",
+                "{} request with headers: {} and params: {} and payload: {} failed: {}",
                 self.__class__.__name__,
                 headers,
+                params,
                 payload,
                 ex,
             )
@@ -318,8 +400,13 @@ class OpenAIHTTPBackend(Backend):

         :return: The async HTTP client.
         """
-        if self._async_client is None:
-            client = httpx.AsyncClient(
+        if self._async_client is None or self._async_client.is_closed:
+            client = httpx.AsyncClient(
+                http2=self.http2,
+                timeout=self.timeout,
+                follow_redirects=self.follow_redirects,
+                verify=self.verify,
+            )
             self._async_client = client
         else:
             client = self._async_client
@@ -330,22 +417,44 @@ class OpenAIHTTPBackend(Backend):
         headers = {
             "Content-Type": "application/json",
         }
+        headers.update(self.headers)
+        return headers

-
-
+    def _params(self, endpoint_type: EndpointType) -> dict[str, str]:
+        if self.extra_query is None:
+            return {}

-        if
-
+        if (
+            CHAT_COMPLETIONS in self.extra_query
+            or MODELS in self.extra_query
+            or TEXT_COMPLETIONS in self.extra_query
+        ):
+            return self.extra_query.get(endpoint_type, {})

-
-            headers["OpenAI-Project"] = self.project
+        return self.extra_query

-
+    def _extra_body(self, endpoint_type: EndpointType) -> dict[str, Any]:
+        if self.extra_body is None:
+            return {}
+
+        if (
+            CHAT_COMPLETIONS in self.extra_body
+            or MODELS in self.extra_body
+            or TEXT_COMPLETIONS in self.extra_body
+        ):
+            return copy.deepcopy(self.extra_body.get(endpoint_type, {}))
+
+        return copy.deepcopy(self.extra_body)

     def _completions_payload(
-        self,
+        self,
+        endpoint_type: CompletionEndpointType,
+        orig_kwargs: Optional[dict],
+        max_output_tokens: Optional[int],
+        **kwargs,
     ) -> dict:
-        payload =
+        payload = self._extra_body(endpoint_type)
+        payload.update(orig_kwargs or {})
         payload.update(kwargs)
         payload["model"] = self.model
         payload["stream"] = True
@@ -359,8 +468,10 @@ class OpenAIHTTPBackend(Backend):
                 self.__class__.__name__,
                 max_output_tokens or self.max_output_tokens,
             )
-
-
+        max_output_key = settings.openai.max_output_key.get(
+            endpoint_type, "max_tokens"
+        )
+        payload[max_output_key] = max_output_tokens or self.max_output_tokens

         if max_output_tokens:
             # only set stop and ignore_eos if max_output_tokens set at request level
@@ -368,6 +479,10 @@ class OpenAIHTTPBackend(Backend):
             payload["stop"] = None
             payload["ignore_eos"] = True

+        if self.remove_from_body:
+            for key in self.remove_from_body:
+                payload.pop(key, None)
+
         return payload

     @staticmethod
@@ -438,8 +553,9 @@ class OpenAIHTTPBackend(Backend):
         request_id: Optional[str],
         request_prompt_tokens: Optional[int],
         request_output_tokens: Optional[int],
-        headers: dict,
-
+        headers: dict[str, str],
+        params: dict[str, str],
+        payload: dict[str, Any],
     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         if type_ == "text_completions":
             target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
@@ -449,14 +565,17 @@ class OpenAIHTTPBackend(Backend):
             raise ValueError(f"Unsupported type: {type_}")

         logger.info(
-            "{} making request: {} to target: {} using http2: {}
-            "timeout: {} with headers: {} and
+            "{} making request: {} to target: {} using http2: {} following "
+            "redirects: {} for timeout: {} with headers: {} and params: {} and ",
+            "payload: {}",
             self.__class__.__name__,
             request_id,
             target,
             self.http2,
+            self.follow_redirects,
             self.timeout,
             headers,
+            params,
             payload,
         )

@@ -484,7 +603,7 @@ class OpenAIHTTPBackend(Backend):
         start_time = time.time()

         async with self._get_async_client().stream(
-            "POST", target, headers=headers, json=payload
+            "POST", target, headers=headers, params=params, json=payload
         ) as stream:
             stream.raise_for_status()

@@ -528,10 +647,12 @@ class OpenAIHTTPBackend(Backend):
                     response_output_count = usage["output"]

                 logger.info(
-                    "{} request: {} with headers: {} and
+                    "{} request: {} with headers: {} and params: {} and payload: {} completed"
+                    "with: {}",
                     self.__class__.__name__,
                     request_id,
                     headers,
+                    params,
                     payload,
                     response_value,
                 )
@@ -541,9 +662,11 @@ class OpenAIHTTPBackend(Backend):
             request_args=RequestArgs(
                 target=target,
                 headers=headers,
+                params=params,
                 payload=payload,
                 timeout=self.timeout,
                 http2=self.http2,
+                follow_redirects=self.follow_redirects,
             ),
             start_time=start_time,
             end_time=iter_time,
@@ -568,7 +691,7 @@ class OpenAIHTTPBackend(Backend):
             return data["choices"][0]["text"]

         if type_ == "chat_completions":
-            return data
+            return data.get("choices", [{}])[0].get("delta", {}).get("content")

         raise ValueError(f"Unsupported type: {type_}")

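Taken together, these changes add several opt-in knobs to the backend constructor. A minimal usage sketch follows; only the parameter names visible in the diff above are assumed, and every value is illustrative.

from guidellm.backend.openai import OpenAIHTTPBackend

# Values are placeholders; parameter names come from the new __init__ signature.
backend = OpenAIHTTPBackend(
    target="http://localhost:8000",                        # inference server base URL
    model="example-model",                                 # placeholder model id
    headers={"X-Custom-Header": "guidellm"},               # merged over the default auth headers
    extra_query={"chat_completions": {"echo": "false"}},   # per-endpoint query parameters
    extra_body={"priority": 0},                            # merged into every request body
    remove_from_body=["ignore_eos"],                       # keys stripped from the payload before sending
    follow_redirects=True,
    verify=False,                                          # illustrative: skip TLS verification
)

Per-endpoint dictionaries (keyed by "text_completions", "chat_completions", or "models") are only applied to the matching endpoint; a flat dictionary is applied to all requests, mirroring the _params and _extra_body logic above.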
guidellm/backend/response.py
CHANGED
@@ -6,10 +6,10 @@ from guidellm.config import settings
 from guidellm.objects.pydantic import StandardBaseModel

 __all__ = [
-    "StreamingResponseType",
-    "StreamingTextResponse",
     "RequestArgs",
     "ResponseSummary",
+    "StreamingResponseType",
+    "StreamingTextResponse",
 ]


@@ -48,17 +48,21 @@ class RequestArgs(StandardBaseModel):

     :param target: The target URL or function for the request.
     :param headers: The headers, if any, included in the request such as authorization.
+    :param params: The query parameters, if any, included in the request.
     :param payload: The payload / arguments for the request including the prompt /
         content and other configurations.
     :param timeout: The timeout for the request in seconds, if any.
     :param http2: Whether HTTP/2 was used for the request, if applicable.
+    :param follow_redirects: Whether the request should follow redirect responses.
     """

     target: str
     headers: dict[str, str]
+    params: dict[str, str]
     payload: dict[str, Any]
     timeout: Optional[float] = None
     http2: Optional[bool] = None
+    follow_redirects: Optional[bool] = None


 class ResponseSummary(StandardBaseModel):
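For reference, a sketch of constructing the extended model; field names come from the diff above, while the values are placeholders.

from guidellm.backend.response import RequestArgs

args = RequestArgs(
    target="http://localhost:8000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    params={},                       # new field: query parameters sent with the request
    payload={"model": "example-model", "stream": True},
    timeout=300.0,
    http2=True,
    follow_redirects=True,           # new field: redirect behavior used for the request
)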
guidellm/benchmark/__init__.py
CHANGED
@@ -12,7 +12,7 @@ from .benchmark import (
     StatusBreakdown,
 )
 from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
-from .entrypoints import benchmark_generative_text
+from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
 from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport
 from .profile import (
     AsyncProfile,
@@ -32,42 +32,36 @@ from .progress import (
 )

 __all__ = [
-    # Aggregator
     "AggregatorT",
-    "
-    "GenerativeBenchmarkAggregator",
-    # Benchmark
+    "AsyncProfile",
     "Benchmark",
+    "BenchmarkAggregator",
     "BenchmarkArgs",
     "BenchmarkMetrics",
     "BenchmarkRunStats",
     "BenchmarkT",
-    "GenerativeBenchmark",
-    "GenerativeMetrics",
-    "GenerativeTextErrorStats",
-    "GenerativeTextResponseStats",
-    "StatusBreakdown",
-    # Benchmarker
     "Benchmarker",
+    "BenchmarkerProgressDisplay",
     "BenchmarkerResult",
+    "BenchmarkerTaskProgressState",
+    "ConcurrentProfile",
+    "GenerativeBenchmark",
+    "GenerativeBenchmarkAggregator",
     "GenerativeBenchmarker",
-    # Entry points
-    "benchmark_generative_text",
-    # Output
     "GenerativeBenchmarksConsole",
     "GenerativeBenchmarksReport",
-
-    "
-    "
+    "GenerativeMetrics",
+    "GenerativeTextBenchmarkerProgressDisplay",
+    "GenerativeTextBenchmarkerTaskProgressState",
+    "GenerativeTextErrorStats",
+    "GenerativeTextResponseStats",
     "Profile",
     "ProfileType",
+    "StatusBreakdown",
     "SweepProfile",
     "SynchronousProfile",
     "ThroughputProfile",
+    "benchmark_generative_text",
     "create_profile",
-
-    "BenchmarkerProgressDisplay",
-    "BenchmarkerTaskProgressState",
-    "GenerativeTextBenchmarkerProgressDisplay",
-    "GenerativeTextBenchmarkerTaskProgressState",
+    "reimport_benchmarks_report",
 ]
guidellm/benchmark/aggregator.py
CHANGED
@@ -32,11 +32,11 @@ from guidellm.request import (
     GenerationRequest,
     GenerativeRequestLoaderDescription,
     RequestLoaderDescription,
+    RequestT,
+    ResponseT,
 )
 from guidellm.scheduler import (
     GenerativeRequestsWorkerDescription,
-    RequestT,
-    ResponseT,
     SchedulerRequestResult,
     WorkerDescription,
 )
@@ -403,7 +403,7 @@ class BenchmarkAggregator(
         in_warmup_duration = (
             self.args.warmup_duration
             and result.request_info.worker_start
-            <= (global_start_time
+            <= (global_start_time + self.args.warmup_duration)
         )

         if in_warmup_number or in_warmup_duration:
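The second hunk completes a previously truncated comparison: a request now counts toward warmup only while its worker start time falls within global_start_time + warmup_duration. A standalone sketch of the corrected check, with made-up numbers in place of the attributes read from self.args and result.request_info:

# Stand-in values; the real code reads these from the aggregator's args and request info.
global_start_time = 1_000.0   # scheduler run start (seconds since epoch)
warmup_duration = 30.0        # args.warmup_duration
worker_start = 1_025.0        # result.request_info.worker_start

in_warmup_duration = bool(
    warmup_duration
    and worker_start <= (global_start_time + warmup_duration)  # 1025.0 <= 1030.0 -> True
)
print(in_warmup_duration)  # True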
guidellm/benchmark/benchmark.py
CHANGED
@@ -34,16 +34,16 @@ from guidellm.scheduler import (
 )

 __all__ = [
-    "BenchmarkT",
-    "StatusBreakdown",
-    "BenchmarkArgs",
-    "BenchmarkRunStats",
     "Benchmark",
+    "BenchmarkArgs",
     "BenchmarkMetrics",
-    "
-    "
-    "GenerativeMetrics",
+    "BenchmarkRunStats",
+    "BenchmarkT",
     "GenerativeBenchmark",
+    "GenerativeMetrics",
+    "GenerativeTextErrorStats",
+    "GenerativeTextResponseStats",
+    "StatusBreakdown",
 ]


@@ -815,12 +815,11 @@ class GenerativeBenchmark(Benchmark):
                     req.first_token_time or req.start_time
                     for req in total_with_output_first
                 ],
-                iter_counts=[
-                    req.prompt_tokens + req.output_tokens
-                    for req in total_with_output_first
-                ],
+                iter_counts=[req.output_tokens for req in total_with_output_first],
                 first_iter_counts=[
-
+                    # prompt tokens + first token
+                    req.prompt_tokens + 1
+                    for req in total_with_output_first
                 ],
             ),
         ),
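The second hunk changes how per-request token counts feed the timing distribution: later iterations are now credited with only the generated (output) tokens, while the first iteration is credited with the prompt plus the first generated token. A small self-contained sketch of the new accounting, using a stand-in class for the benchmark's per-request stats objects:

from dataclasses import dataclass

@dataclass
class ReqStats:                      # stand-in for the benchmark's request stats objects
    prompt_tokens: int
    output_tokens: int

total_with_output_first = [
    ReqStats(prompt_tokens=512, output_tokens=128),
    ReqStats(prompt_tokens=64, output_tokens=256),
]

iter_counts = [req.output_tokens for req in total_with_output_first]            # [128, 256]
first_iter_counts = [req.prompt_tokens + 1 for req in total_with_output_first]  # [513, 65]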
guidellm/benchmark/benchmarker.py
CHANGED
@@ -27,12 +27,12 @@ from guidellm.request import (
     GenerationRequest,
     GenerativeRequestLoaderDescription,
     RequestLoaderDescription,
+    RequestT,
+    ResponseT,
 )
 from guidellm.scheduler import (
     GenerativeRequestsWorker,
     RequestsWorker,
-    RequestT,
-    ResponseT,
     Scheduler,
     SchedulerRequestResult,
     SchedulingStrategy,
guidellm/benchmark/entrypoints.py
CHANGED
@@ -15,10 +15,22 @@ from guidellm.benchmark.output import (
 )
 from guidellm.benchmark.profile import ProfileType, create_profile
 from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay
+from guidellm.benchmark.scenario import GenerativeTextScenario, Scenario
 from guidellm.request import GenerativeRequestLoader
 from guidellm.scheduler import StrategyType


+async def benchmark_with_scenario(scenario: Scenario, **kwargs):
+    """
+    Run a benchmark using a scenario and specify any extra arguments
+    """
+
+    if isinstance(scenario, GenerativeTextScenario):
+        return await benchmark_generative_text(**vars(scenario), **kwargs)
+    else:
+        raise ValueError(f"Unsupported Scenario type {type(scenario)}")
+
+
 async def benchmark_generative_text(
     target: str,
     backend_type: BackendType,
@@ -38,18 +50,18 @@ async def benchmark_generative_text(
     data_args: Optional[dict[str, Any]],
     data_sampler: Optional[Literal["random"]],
     rate_type: Union[StrategyType, ProfileType],
-    rate: Optional[Union[
+    rate: Optional[Union[float, list[float]]],
     max_seconds: Optional[float],
     max_requests: Optional[int],
     warmup_percent: Optional[float],
     cooldown_percent: Optional[float],
-    show_progress: bool,
-    show_progress_scheduler_stats: bool,
-    output_console: bool,
     output_path: Optional[Union[str, Path]],
     output_extras: Optional[dict[str, Any]],
     output_sampling: Optional[int],
     random_seed: int,
+    show_progress: bool = True,
+    show_progress_scheduler_stats: bool = False,
+    output_console: bool = True,
 ) -> tuple[GenerativeBenchmarksReport, Optional[Path]]:
     console = GenerativeBenchmarksConsole(enabled=show_progress)
     console.print_line("Creating backend...")
@@ -121,13 +133,8 @@ async def benchmark_generative_text(
     )

     if output_console:
-        orig_enabled = console.enabled
-        console.enabled = True
         console.benchmarks = report.benchmarks
-        console.
-        console.print_benchmarks_info()
-        console.print_benchmarks_stats()
-        console.enabled = orig_enabled
+        console.print_full_report()

     if output_path:
         console.print_line("\nSaving benchmarks report...")
@@ -139,3 +146,20 @@ async def benchmark_generative_text(
     console.print_line("\nBenchmarking complete.")

     return report, saved_path
+
+
+def reimport_benchmarks_report(file: Path, output_path: Optional[Path]) -> None:
+    """
+    The command-line entry point for re-importing and displaying an
+    existing benchmarks report. Can also specify
+    Assumes the file provided exists.
+    """
+    console = GenerativeBenchmarksConsole(enabled=True)
+    report = GenerativeBenchmarksReport.load_file(file)
+    console.benchmarks = report.benchmarks
+    console.print_full_report()
+
+    if output_path:
+        console.print_line("\nSaving benchmarks report...")
+        saved_path = report.save_file(output_path)
+        console.print_line(f"Benchmarks report saved to {saved_path}")
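A hedged usage sketch of the two entry points added above. Only the signatures visible in this diff are assumed; constructing a GenerativeTextScenario is left out because its fields are defined in guidellm/benchmark/scenario.py (added in this release), and the file paths are placeholders.

from pathlib import Path

from guidellm.benchmark import reimport_benchmarks_report

# Re-render a previously saved benchmarks report on the console and save a copy.
reimport_benchmarks_report(Path("benchmarks.json"), output_path=Path("benchmarks-copy.json"))

# benchmark_with_scenario() unpacks a GenerativeTextScenario into
# benchmark_generative_text(); extra keyword arguments pass straight through, e.g.:
#   from guidellm.benchmark.entrypoints import benchmark_with_scenario
#   import asyncio
#   asyncio.run(benchmark_with_scenario(scenario, output_path=Path("report.json")))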