guidellm 0.3.0rc20250429__py3-none-any.whl → 0.4.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +8 -13
- guidellm/__main__.py +290 -69
- guidellm/backend/__init__.py +6 -6
- guidellm/backend/backend.py +25 -4
- guidellm/backend/openai.py +147 -27
- guidellm/backend/response.py +6 -2
- guidellm/benchmark/__init__.py +16 -22
- guidellm/benchmark/aggregator.py +3 -3
- guidellm/benchmark/benchmark.py +11 -12
- guidellm/benchmark/benchmarker.py +2 -2
- guidellm/benchmark/entrypoints.py +34 -10
- guidellm/benchmark/output.py +57 -5
- guidellm/benchmark/profile.py +4 -4
- guidellm/benchmark/progress.py +2 -2
- guidellm/benchmark/scenario.py +104 -0
- guidellm/benchmark/scenarios/__init__.py +0 -0
- guidellm/config.py +28 -7
- guidellm/dataset/__init__.py +4 -4
- guidellm/dataset/creator.py +1 -1
- guidellm/dataset/synthetic.py +36 -11
- guidellm/logger.py +8 -4
- guidellm/objects/__init__.py +2 -2
- guidellm/objects/pydantic.py +30 -1
- guidellm/objects/statistics.py +20 -14
- guidellm/preprocess/__init__.py +3 -0
- guidellm/preprocess/dataset.py +374 -0
- guidellm/presentation/__init__.py +28 -0
- guidellm/presentation/builder.py +27 -0
- guidellm/presentation/data_models.py +232 -0
- guidellm/presentation/injector.py +66 -0
- guidellm/request/__init__.py +6 -3
- guidellm/request/loader.py +5 -5
- guidellm/{scheduler → request}/types.py +4 -1
- guidellm/scheduler/__init__.py +10 -15
- guidellm/scheduler/queues.py +25 -0
- guidellm/scheduler/result.py +21 -3
- guidellm/scheduler/scheduler.py +68 -60
- guidellm/scheduler/strategy.py +26 -24
- guidellm/scheduler/worker.py +64 -103
- guidellm/utils/__init__.py +17 -5
- guidellm/utils/cli.py +62 -0
- guidellm/utils/default_group.py +105 -0
- guidellm/utils/dict.py +23 -0
- guidellm/utils/hf_datasets.py +36 -0
- guidellm/utils/random.py +1 -1
- guidellm/utils/text.py +14 -15
- guidellm/version.py +6 -0
- guidellm-0.4.0a0.dist-info/METADATA +317 -0
- guidellm-0.4.0a0.dist-info/RECORD +62 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.4.0a0.dist-info}/WHEEL +1 -1
- guidellm-0.3.0rc20250429.dist-info/METADATA +0 -453
- guidellm-0.3.0rc20250429.dist-info/RECORD +0 -48
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.4.0a0.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.4.0a0.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.0rc20250429.dist-info → guidellm-0.4.0a0.dist-info}/top_level.txt +0 -0
guidellm/backend/openai.py
CHANGED
@@ -1,4 +1,5 @@
 import base64
+import copy
 import json
 import time
 from collections.abc import AsyncGenerator
@@ -17,12 +18,24 @@ from guidellm.backend.response import (
 )
 from guidellm.config import settings
 
-__all__ = [
+__all__ = [
+    "CHAT_COMPLETIONS",
+    "CHAT_COMPLETIONS_PATH",
+    "MODELS",
+    "TEXT_COMPLETIONS",
+    "TEXT_COMPLETIONS_PATH",
+    "OpenAIHTTPBackend",
+]
 
 
 TEXT_COMPLETIONS_PATH = "/v1/completions"
 CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
 
+EndpointType = Literal["chat_completions", "models", "text_completions"]
+CHAT_COMPLETIONS: EndpointType = "chat_completions"
+MODELS: EndpointType = "models"
+TEXT_COMPLETIONS: EndpointType = "text_completions"
+
 
 @Backend.register("openai_http")
 class OpenAIHTTPBackend(Backend):
@@ -49,8 +62,23 @@ class OpenAIHTTPBackend(Backend):
         If not provided, the default timeout provided from settings is used.
     :param http2: If True, uses HTTP/2 for requests to the OpenAI server.
         Defaults to True.
+    :param follow_redirects: If True, the HTTP client will follow redirect responses.
+        If not provided, the default value from settings is used.
     :param max_output_tokens: The maximum number of tokens to request for completions.
         If not provided, the default maximum tokens provided from settings is used.
+    :param extra_query: Query parameters to include in requests to the OpenAI server.
+        If "chat_completions", "models", or "text_completions" are included as keys,
+        the values of these keys will be used as the parameters for the respective
+        endpoint.
+        If not provided, no extra query parameters are added.
+    :param extra_body: Body parameters to include in requests to the OpenAI server.
+        If "chat_completions", "models", or "text_completions" are included as keys,
+        the values of these keys will be included in the body for the respective
+        endpoint.
+        If not provided, no extra body parameters are added.
+    :param remove_from_body: Parameters that should be removed from the body of each
+        request.
+        If not provided, no parameters are removed from the body.
     """
 
     def __init__(
@@ -62,7 +90,13 @@ class OpenAIHTTPBackend(Backend):
         project: Optional[str] = None,
         timeout: Optional[float] = None,
         http2: Optional[bool] = True,
+        follow_redirects: Optional[bool] = None,
         max_output_tokens: Optional[int] = None,
+        extra_query: Optional[dict] = None,
+        extra_body: Optional[dict] = None,
+        remove_from_body: Optional[list[str]] = None,
+        headers: Optional[dict] = None,
+        verify: Optional[bool] = None,
     ):
         super().__init__(type_="openai_http")
         self._target = target or settings.openai.base_url
@@ -79,20 +113,48 @@ class OpenAIHTTPBackend(Backend):
 
         self._model = model
 
+        # Start with default headers based on other params
+        default_headers: dict[str, str] = {}
         api_key = api_key or settings.openai.api_key
-
-
-
+        bearer_token = settings.openai.bearer_token
+        if api_key:
+            default_headers["Authorization"] = f"Bearer {api_key}"
+        elif bearer_token:
+            default_headers["Authorization"] = bearer_token
 
         self.organization = organization or settings.openai.organization
+        if self.organization:
+            default_headers["OpenAI-Organization"] = self.organization
+
         self.project = project or settings.openai.project
+        if self.project:
+            default_headers["OpenAI-Project"] = self.project
+
+        # User-provided headers from kwargs or settings override defaults
+        merged_headers = default_headers.copy()
+        merged_headers.update(settings.openai.headers or {})
+        if headers:
+            merged_headers.update(headers)
+
+        # Remove headers with None values for backward compatibility and convenience
+        self.headers = {k: v for k, v in merged_headers.items() if v is not None}
+
         self.timeout = timeout if timeout is not None else settings.request_timeout
         self.http2 = http2 if http2 is not None else settings.request_http2
+        self.follow_redirects = (
+            follow_redirects
+            if follow_redirects is not None
+            else settings.request_follow_redirects
+        )
+        self.verify = verify if verify is not None else settings.openai.verify
         self.max_output_tokens = (
            max_output_tokens
            if max_output_tokens is not None
            else settings.openai.max_output_tokens
        )
+        self.extra_query = extra_query
+        self.extra_body = extra_body
+        self.remove_from_body = remove_from_body
         self._async_client: Optional[httpx.AsyncClient] = None
 
     @property
@@ -120,13 +182,21 @@ class OpenAIHTTPBackend(Backend):
             "max_output_tokens": self.max_output_tokens,
             "timeout": self.timeout,
             "http2": self.http2,
-            "
-            "
-            "project": self.project,
+            "follow_redirects": self.follow_redirects,
+            "headers": self.headers,
             "text_completions_path": TEXT_COMPLETIONS_PATH,
             "chat_completions_path": CHAT_COMPLETIONS_PATH,
         }
 
+    async def reset(self) -> None:
+        """
+        Reset the connection object. This is useful for backends that
+        reuse connections or have state that needs to be cleared.
+        For this backend, it closes the async client if it exists.
+        """
+        if self._async_client is not None:
+            await self._async_client.aclose()
+
     async def check_setup(self):
         """
         Check if the backend is setup correctly and can be used for requests.
@@ -165,7 +235,10 @@ class OpenAIHTTPBackend(Backend):
         """
         target = f"{self.target}/v1/models"
         headers = self._headers()
-
+        params = self._params(MODELS)
+        response = await self._get_async_client().get(
+            target, headers=headers, params=params
+        )
         response.raise_for_status()
 
         models = []
@@ -210,7 +283,9 @@ class OpenAIHTTPBackend(Backend):
         )
 
         headers = self._headers()
+        params = self._params(TEXT_COMPLETIONS)
         payload = self._completions_payload(
+            endpoint_type=TEXT_COMPLETIONS,
             orig_kwargs=kwargs,
             max_output_tokens=output_token_count,
             prompt=prompt,
@@ -223,14 +298,16 @@ class OpenAIHTTPBackend(Backend):
                 request_prompt_tokens=prompt_token_count,
                 request_output_tokens=output_token_count,
                 headers=headers,
+                params=params,
                 payload=payload,
             ):
                 yield resp
         except Exception as ex:
             logger.error(
-                "{} request with headers: {} and payload: {} failed: {}",
+                "{} request with headers: {} and params: {} and payload: {} failed: {}",
                 self.__class__.__name__,
                 headers,
+                params,
                 payload,
                 ex,
             )
@@ -282,10 +359,12 @@ class OpenAIHTTPBackend(Backend):
         """
         logger.debug("{} invocation with args: {}", self.__class__.__name__, locals())
         headers = self._headers()
+        params = self._params(CHAT_COMPLETIONS)
         messages = (
             content if raw_content else self._create_chat_messages(content=content)
         )
         payload = self._completions_payload(
+            endpoint_type=CHAT_COMPLETIONS,
             orig_kwargs=kwargs,
             max_output_tokens=output_token_count,
             messages=messages,
@@ -298,14 +377,16 @@ class OpenAIHTTPBackend(Backend):
                 request_prompt_tokens=prompt_token_count,
                 request_output_tokens=output_token_count,
                 headers=headers,
+                params=params,
                 payload=payload,
             ):
                 yield resp
         except Exception as ex:
             logger.error(
-                "{} request with headers: {} and payload: {} failed: {}",
+                "{} request with headers: {} and params: {} and payload: {} failed: {}",
                 self.__class__.__name__,
                 headers,
+                params,
                 payload,
                 ex,
             )
@@ -318,8 +399,13 @@ class OpenAIHTTPBackend(Backend):
 
         :return: The async HTTP client.
         """
-        if self._async_client is None:
-            client = httpx.AsyncClient(
+        if self._async_client is None or self._async_client.is_closed:
+            client = httpx.AsyncClient(
+                http2=self.http2,
+                timeout=self.timeout,
+                follow_redirects=self.follow_redirects,
+                verify=self.verify,
+            )
             self._async_client = client
         else:
             client = self._async_client
@@ -330,22 +416,44 @@ class OpenAIHTTPBackend(Backend):
         headers = {
             "Content-Type": "application/json",
         }
+        headers.update(self.headers)
+        return headers
 
-
-
+    def _params(self, endpoint_type: EndpointType) -> dict[str, str]:
+        if self.extra_query is None:
+            return {}
 
-        if
-
+        if (
+            CHAT_COMPLETIONS in self.extra_query
+            or MODELS in self.extra_query
+            or TEXT_COMPLETIONS in self.extra_query
+        ):
+            return self.extra_query.get(endpoint_type, {})
 
-
-        headers["OpenAI-Project"] = self.project
+        return self.extra_query
 
-
+    def _extra_body(self, endpoint_type: EndpointType) -> dict[str, Any]:
+        if self.extra_body is None:
+            return {}
+
+        if (
+            CHAT_COMPLETIONS in self.extra_body
+            or MODELS in self.extra_body
+            or TEXT_COMPLETIONS in self.extra_body
+        ):
+            return copy.deepcopy(self.extra_body.get(endpoint_type, {}))
+
+        return copy.deepcopy(self.extra_body)
 
     def _completions_payload(
-        self,
+        self,
+        endpoint_type: EndpointType,
+        orig_kwargs: Optional[dict],
+        max_output_tokens: Optional[int],
+        **kwargs,
     ) -> dict:
-        payload =
+        payload = self._extra_body(endpoint_type)
+        payload.update(orig_kwargs or {})
         payload.update(kwargs)
         payload["model"] = self.model
         payload["stream"] = True
@@ -368,6 +476,10 @@ class OpenAIHTTPBackend(Backend):
             payload["stop"] = None
             payload["ignore_eos"] = True
 
+        if self.remove_from_body:
+            for key in self.remove_from_body:
+                payload.pop(key, None)
+
         return payload
 
     @staticmethod
@@ -438,8 +550,9 @@ class OpenAIHTTPBackend(Backend):
         request_id: Optional[str],
         request_prompt_tokens: Optional[int],
         request_output_tokens: Optional[int],
-        headers: dict,
-
+        headers: dict[str, str],
+        params: dict[str, str],
+        payload: dict[str, Any],
     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         if type_ == "text_completions":
             target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
@@ -449,14 +562,17 @@
             raise ValueError(f"Unsupported type: {type_}")
 
         logger.info(
-            "{} making request: {} to target: {} using http2: {}
-            "timeout: {} with headers: {} and
+            "{} making request: {} to target: {} using http2: {} following "
+            "redirects: {} for timeout: {} with headers: {} and params: {} and ",
+            "payload: {}",
             self.__class__.__name__,
             request_id,
            target,
            self.http2,
+            self.follow_redirects,
            self.timeout,
            headers,
+            params,
            payload,
        )
@@ -484,7 +600,7 @@
         start_time = time.time()
 
         async with self._get_async_client().stream(
-            "POST", target, headers=headers, json=payload
+            "POST", target, headers=headers, params=params, json=payload
         ) as stream:
             stream.raise_for_status()
@@ -528,10 +644,12 @@
                     response_output_count = usage["output"]
 
             logger.info(
-                "{} request: {} with headers: {} and
+                "{} request: {} with headers: {} and params: {} and payload: {} completed"
+                "with: {}",
                 self.__class__.__name__,
                 request_id,
                 headers,
+                params,
                 payload,
                 response_value,
             )
@@ -541,9 +659,11 @@
             request_args=RequestArgs(
                 target=target,
                 headers=headers,
+                params=params,
                 payload=payload,
                 timeout=self.timeout,
                 http2=self.http2,
+                follow_redirects=self.follow_redirects,
             ),
             start_time=start_time,
             end_time=iter_time,
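For orientation, a minimal usage sketch of the constructor parameters this diff adds to OpenAIHTTPBackend (headers, extra_query, extra_body, remove_from_body, follow_redirects, verify). The target URL, model name, and the concrete header/query/body values below are placeholder assumptions, not guidellm defaults; the per-endpoint keys follow the _params/_extra_body dispatch shown above.

# Hypothetical values; only the parameter names and their dispatch come from the diff above.
from guidellm.backend.openai import OpenAIHTTPBackend

backend = OpenAIHTTPBackend(
    target="http://localhost:8000",            # assumed OpenAI-compatible server
    model="example-model",                     # placeholder model id
    headers={"X-Request-Source": "guidellm"},  # merged over the generated Authorization headers
    # Keyed by endpoint type: only /v1/chat/completions requests receive this query parameter.
    extra_query={"chat_completions": {"api-version": "2024-06-01"}},
    # A flat dict (no endpoint keys) seeds the payload of every completions request.
    extra_body={"temperature": 0.0},
    # Keys stripped from the payload just before the request is sent.
    remove_from_body=["ignore_eos"],
    follow_redirects=True,
    verify=False,
)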
guidellm/backend/response.py
CHANGED
@@ -6,10 +6,10 @@ from guidellm.config import settings
 from guidellm.objects.pydantic import StandardBaseModel
 
 __all__ = [
-    "StreamingResponseType",
-    "StreamingTextResponse",
     "RequestArgs",
     "ResponseSummary",
+    "StreamingResponseType",
+    "StreamingTextResponse",
 ]
 
 
@@ -48,17 +48,21 @@ class RequestArgs(StandardBaseModel):
 
     :param target: The target URL or function for the request.
     :param headers: The headers, if any, included in the request such as authorization.
+    :param params: The query parameters, if any, included in the request.
     :param payload: The payload / arguments for the request including the prompt /
         content and other configurations.
     :param timeout: The timeout for the request in seconds, if any.
     :param http2: Whether HTTP/2 was used for the request, if applicable.
+    :param follow_redirects: Whether the request should follow redirect responses.
     """
 
     target: str
     headers: dict[str, str]
+    params: dict[str, str]
     payload: dict[str, Any]
     timeout: Optional[float] = None
     http2: Optional[bool] = None
+    follow_redirects: Optional[bool] = None
 
 
 class ResponseSummary(StandardBaseModel):
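As a small illustration of the two new fields, a RequestArgs instance might now be built as below; every value shown is a placeholder, only the field names come from the diff.

from guidellm.backend.response import RequestArgs

args = RequestArgs(
    target="http://localhost:8000/v1/chat/completions",  # placeholder URL
    headers={"Content-Type": "application/json"},
    params={"api-version": "2024-06-01"},                 # new field in this diff
    payload={"model": "example-model", "stream": True},
    timeout=300.0,
    http2=True,
    follow_redirects=True,                                # new field in this diff
)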
guidellm/benchmark/__init__.py
CHANGED
@@ -12,7 +12,7 @@ from .benchmark import (
     StatusBreakdown,
 )
 from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
-from .entrypoints import benchmark_generative_text
+from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
 from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport
 from .profile import (
     AsyncProfile,
@@ -32,42 +32,36 @@ from .progress import (
 )
 
 __all__ = [
-    # Aggregator
     "AggregatorT",
-    "
-    "GenerativeBenchmarkAggregator",
-    # Benchmark
+    "AsyncProfile",
     "Benchmark",
+    "BenchmarkAggregator",
     "BenchmarkArgs",
     "BenchmarkMetrics",
     "BenchmarkRunStats",
     "BenchmarkT",
-    "GenerativeBenchmark",
-    "GenerativeMetrics",
-    "GenerativeTextErrorStats",
-    "GenerativeTextResponseStats",
-    "StatusBreakdown",
-    # Benchmarker
     "Benchmarker",
+    "BenchmarkerProgressDisplay",
     "BenchmarkerResult",
+    "BenchmarkerTaskProgressState",
+    "ConcurrentProfile",
+    "GenerativeBenchmark",
+    "GenerativeBenchmarkAggregator",
     "GenerativeBenchmarker",
-    # Entry points
-    "benchmark_generative_text",
-    # Output
     "GenerativeBenchmarksConsole",
     "GenerativeBenchmarksReport",
-
-    "
-    "
+    "GenerativeMetrics",
+    "GenerativeTextBenchmarkerProgressDisplay",
+    "GenerativeTextBenchmarkerTaskProgressState",
+    "GenerativeTextErrorStats",
+    "GenerativeTextResponseStats",
     "Profile",
     "ProfileType",
+    "StatusBreakdown",
     "SweepProfile",
     "SynchronousProfile",
     "ThroughputProfile",
+    "benchmark_generative_text",
     "create_profile",
-
-    "BenchmarkerProgressDisplay",
-    "BenchmarkerTaskProgressState",
-    "GenerativeTextBenchmarkerProgressDisplay",
-    "GenerativeTextBenchmarkerTaskProgressState",
+    "reimport_benchmarks_report",
 ]
guidellm/benchmark/aggregator.py
CHANGED
@@ -32,11 +32,11 @@ from guidellm.request import (
     GenerationRequest,
     GenerativeRequestLoaderDescription,
     RequestLoaderDescription,
+    RequestT,
+    ResponseT,
 )
 from guidellm.scheduler import (
     GenerativeRequestsWorkerDescription,
-    RequestT,
-    ResponseT,
     SchedulerRequestResult,
     WorkerDescription,
 )
@@ -403,7 +403,7 @@ class BenchmarkAggregator(
         in_warmup_duration = (
             self.args.warmup_duration
             and result.request_info.worker_start
-            <= (global_start_time
+            <= (global_start_time + self.args.warmup_duration)
         )
 
         if in_warmup_number or in_warmup_duration:
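The corrected condition above marks a request as warmup when its worker started no later than global_start_time + warmup_duration. A standalone sketch of that check, with names mirroring the hunk; the helper itself is illustrative and not part of guidellm:

from typing import Optional

def in_warmup_duration(
    worker_start: float,
    global_start_time: float,
    warmup_duration: Optional[float],
) -> bool:
    # Matches the fixed comparison: worker_start <= global_start_time + warmup_duration
    return bool(
        warmup_duration
        and worker_start <= (global_start_time + warmup_duration)
    )

assert in_warmup_duration(10.5, 10.0, 1.0)       # started inside the warmup window
assert not in_warmup_duration(12.0, 10.0, 1.0)   # started after the window closed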
guidellm/benchmark/benchmark.py
CHANGED
@@ -34,16 +34,16 @@ from guidellm.scheduler import (
 )
 
 __all__ = [
-    "BenchmarkT",
-    "StatusBreakdown",
-    "BenchmarkArgs",
-    "BenchmarkRunStats",
     "Benchmark",
+    "BenchmarkArgs",
     "BenchmarkMetrics",
-    "
-    "
-    "GenerativeMetrics",
+    "BenchmarkRunStats",
+    "BenchmarkT",
     "GenerativeBenchmark",
+    "GenerativeMetrics",
+    "GenerativeTextErrorStats",
+    "GenerativeTextResponseStats",
+    "StatusBreakdown",
 ]
 
 
@@ -815,12 +815,11 @@ class GenerativeBenchmark(Benchmark):
                     req.first_token_time or req.start_time
                     for req in total_with_output_first
                 ],
-                iter_counts=[
-                    req.prompt_tokens + req.output_tokens
-                    for req in total_with_output_first
-                ],
+                iter_counts=[req.output_tokens for req in total_with_output_first],
                 first_iter_counts=[
-
+                    # prompt tokens + first token
+                    req.prompt_tokens + 1
+                    for req in total_with_output_first
                 ],
             ),
         ),
guidellm/benchmark/benchmarker.py
CHANGED
@@ -27,12 +27,12 @@ from guidellm.request import (
     GenerationRequest,
     GenerativeRequestLoaderDescription,
     RequestLoaderDescription,
+    RequestT,
+    ResponseT,
 )
 from guidellm.scheduler import (
     GenerativeRequestsWorker,
     RequestsWorker,
-    RequestT,
-    ResponseT,
     Scheduler,
     SchedulerRequestResult,
     SchedulingStrategy,
guidellm/benchmark/entrypoints.py
CHANGED
@@ -15,10 +15,22 @@ from guidellm.benchmark.output import (
 )
 from guidellm.benchmark.profile import ProfileType, create_profile
 from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay
+from guidellm.benchmark.scenario import GenerativeTextScenario, Scenario
 from guidellm.request import GenerativeRequestLoader
 from guidellm.scheduler import StrategyType
 
 
+async def benchmark_with_scenario(scenario: Scenario, **kwargs):
+    """
+    Run a benchmark using a scenario and specify any extra arguments
+    """
+
+    if isinstance(scenario, GenerativeTextScenario):
+        return await benchmark_generative_text(**vars(scenario), **kwargs)
+    else:
+        raise ValueError(f"Unsupported Scenario type {type(scenario)}")
+
+
 async def benchmark_generative_text(
     target: str,
     backend_type: BackendType,
@@ -38,18 +50,18 @@ async def benchmark_generative_text(
     data_args: Optional[dict[str, Any]],
     data_sampler: Optional[Literal["random"]],
     rate_type: Union[StrategyType, ProfileType],
-    rate: Optional[Union[
+    rate: Optional[Union[float, list[float]]],
     max_seconds: Optional[float],
     max_requests: Optional[int],
     warmup_percent: Optional[float],
     cooldown_percent: Optional[float],
-    show_progress: bool,
-    show_progress_scheduler_stats: bool,
-    output_console: bool,
     output_path: Optional[Union[str, Path]],
     output_extras: Optional[dict[str, Any]],
     output_sampling: Optional[int],
     random_seed: int,
+    show_progress: bool = True,
+    show_progress_scheduler_stats: bool = False,
+    output_console: bool = True,
 ) -> tuple[GenerativeBenchmarksReport, Optional[Path]]:
     console = GenerativeBenchmarksConsole(enabled=show_progress)
     console.print_line("Creating backend...")
@@ -121,13 +133,8 @@ async def benchmark_generative_text(
     )
 
     if output_console:
-        orig_enabled = console.enabled
-        console.enabled = True
         console.benchmarks = report.benchmarks
-        console.
-        console.print_benchmarks_info()
-        console.print_benchmarks_stats()
-        console.enabled = orig_enabled
+        console.print_full_report()
 
     if output_path:
         console.print_line("\nSaving benchmarks report...")
@@ -139,3 +146,20 @@
     console.print_line("\nBenchmarking complete.")
 
     return report, saved_path
+
+
+def reimport_benchmarks_report(file: Path, output_path: Optional[Path]) -> None:
+    """
+    The command-line entry point for re-importing and displaying an
+    existing benchmarks report. Can also specify
+    Assumes the file provided exists.
+    """
+    console = GenerativeBenchmarksConsole(enabled=True)
+    report = GenerativeBenchmarksReport.load_file(file)
+    console.benchmarks = report.benchmarks
+    console.print_full_report()
+
+    if output_path:
+        console.print_line("\nSaving benchmarks report...")
+        saved_path = report.save_file(output_path)
+        console.print_line(f"Benchmarks report saved to {saved_path}")