guidellm 0.4.0a18__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release.
This version of guidellm might be problematic.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a18.dist-info/RECORD +0 -62
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
guidellm/benchmark/benchmark.py
DELETED
@@ -1,837 +0,0 @@

```python
import random
import uuid
from typing import Any, Literal, Optional, TypeVar, Union

from pydantic import Field, computed_field

from guidellm.benchmark.profile import (
    AsyncProfile,
    ConcurrentProfile,
    Profile,
    SweepProfile,
    SynchronousProfile,
    ThroughputProfile,
)
from guidellm.objects import (
    StandardBaseModel,
    StatusBreakdown,
    StatusDistributionSummary,
)
from guidellm.request import (
    GenerativeRequestLoaderDescription,
    RequestLoaderDescription,
)
from guidellm.scheduler import (
    AsyncConstantStrategy,
    AsyncPoissonStrategy,
    ConcurrentStrategy,
    GenerativeRequestsWorkerDescription,
    SchedulerRequestInfo,
    SchedulingStrategy,
    SynchronousStrategy,
    ThroughputStrategy,
    WorkerDescription,
)

__all__ = [
    "Benchmark",
    "BenchmarkArgs",
    "BenchmarkMetrics",
    "BenchmarkRunStats",
    "BenchmarkT",
    "GenerativeBenchmark",
    "GenerativeMetrics",
    "GenerativeTextErrorStats",
    "GenerativeTextResponseStats",
    "StatusBreakdown",
]


class BenchmarkArgs(StandardBaseModel):
    """
    A serializable model representing the arguments used to specify a benchmark run
    and how data was collected for it.
    """

    profile: Union[
        AsyncProfile,
        SweepProfile,
        ConcurrentProfile,
        ThroughputProfile,
        SynchronousProfile,
        Profile,
    ] = Field(
        description=(
            "The profile used for the entire benchmark run that the strategy for "
            "this benchmark was pulled from."
        ),
        discriminator="type_",
    )
    strategy_index: int = Field(
        description=(
            "The index of the strategy in the profile that was used for this benchmark."
        )
    )
    strategy: Union[
        ConcurrentStrategy,
        SchedulingStrategy,
        ThroughputStrategy,
        SynchronousStrategy,
        AsyncPoissonStrategy,
        AsyncConstantStrategy,
        SchedulingStrategy,
    ] = Field(
        description="The scheduling strategy used to run this benchmark. ",
        discriminator="type_",
    )
    max_number: Optional[int] = Field(
        description="The maximum number of requests to run for this benchmark, if any."
    )
    max_duration: Optional[float] = Field(
        description="The maximum duration in seconds to run this benchmark, if any."
    )
    warmup_number: Optional[int] = Field(
        description=(
            "The number of requests to run for the warmup phase of this benchmark, "
            "if any. These are requests that were not included in the final results."
        )
    )
    warmup_duration: Optional[float] = Field(
        description=(
            "The duration in seconds to run for the warmup phase of this benchmark, "
            "if any. These are requests that were not included in the final results."
        )
    )
    cooldown_number: Optional[int] = Field(
        description=(
            "The number of requests to run for the cooldown phase of this benchmark, "
            "if any. These are requests that were not included in the final results."
        )
    )
    cooldown_duration: Optional[float] = Field(
        description=(
            "The duration in seconds to run for the cooldown phase of this benchmark, "
            "if any. These are requests that were not included in the final results."
        )
    )


class BenchmarkRunStats(StandardBaseModel):
    """
    A serializable model representing the run process statistics for the
    entire benchmark run across all requests including warmup and cooldown.
    """

    start_time: float = Field(
        description="The start time of the benchmark run.",
    )
    end_time: float = Field(
        description="The end time of the benchmark run.",
    )
    requests_made: StatusBreakdown[int, int, int, int] = Field(
        description=(
            "The number of requests made for the benchmark run broken down by "
            "status including successful, incomplete, errored, and the sum of all three"
        )
    )
    queued_time_avg: float = Field(
        description=(
            "The average time spent in the queue for each request in the benchmark "
            "run until it was dequeued by a worker."
        )
    )
    scheduled_time_delay_avg: float = Field(
        description=(
            "The average time delay between when a request was dequeued and when it "
            "was scheduled to be processed by a worker in the benchmark run. "
            "This should be as close to 0 as possible, any additional time is "
            "overheads from the system or the worker."
        )
    )
    scheduled_time_sleep_avg: float = Field(
        description=(
            "The average time spent sleeping til the desired start time was reached "
            "after being scheduled by the worker in the benchmark run."
        )
    )
    worker_start_delay_avg: float = Field(
        description=(
            "The average time delay between when a request was scheduled and when "
            "the worker started processing it in the benchmark run. "
            "This should be as close to 0 as possible, any additional time is "
            "overheads from the system or the worker."
        )
    )
    worker_time_avg: float = Field(
        description=(
            "The average time taken by the worker to process each request in the "
            "benchmark run. This includes the time to generate the response and "
            "any additional processing time."
        )
    )
    worker_start_time_targeted_delay_avg: float = Field(
        description=(
            "The average time delay between when a request was targeted to start "
            "and when the worker actually started processing it in the benchmark "
            "run. For async strategies, this represents delays from the ideal "
            "system. For sync strategies, since those are doubled in queue, "
            "this should be as close to the time for a request to be processed "
            "as possible. Any additional time is overhead from the system or "
            "the worker."
        )
    )
    request_start_time_delay_avg: float = Field(
        description=(
            "The average time delay between the actual request being made "
            "and the time the worker started on the request for all requests "
            "that completed within the benchmark run. This time should be as close "
            "to 0 as possible, any additional time is overhead from the system or "
            "the worker."
        )
    )
    request_start_time_targeted_delay_avg: float = Field(
        description=(
            "The average time delay between when the targeted start time and "
            "the actual start time for each request in the benchmark run. "
            "For async strategies, this represents delays from the ideal "
            "system. For sync strategies, this should be as close to the "
            "time for a request to be processed as possible. Any additional "
            "time is overhead from the system or the worker."
        )
    )
    request_time_delay_avg: float = Field(
        description=(
            "The average time delay between the total request time and the "
            "worker time. This should be as close to 0 as possible, any additional "
            "time is overhead from the system or the worker. "
        )
    )
    request_time_avg: float = Field(
        description=(
            "The average time spent processing all requests in the benchmark run. "
            "This is the time from when the actual request was started to when "
            "it was completed."
        )
    )


class BenchmarkMetrics(StandardBaseModel):
    """
    A serializable model representing the metrics for a benchmark run.
    """

    requests_per_second: StatusDistributionSummary = Field(
        description="The distribution of requests per second for the benchmark.",
    )
    request_concurrency: StatusDistributionSummary = Field(
        description="The distribution of requests concurrency for the benchmark.",
    )


class Benchmark(StandardBaseModel):
    """
    The base serializable model representing a benchmark run and its results.
    Specific benchmarker implementations should extend this model to include
    additional information or metadata as needed.

    Note, requests_per_second and request_concurrency are kept at this level
    and are expected to be populated by the subclass implementation to ensure
    the logic for Profiles can include more complicated logic for determining
    what rates and concurrency values to use for subsequent strategies.
    """

    type_: Literal["benchmark"] = "benchmark"
    id_: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="The unique identifier for the benchmark.",
    )
    run_id: str = Field(
        description=(
            "The unique identifier for the encompasing benchmark run that this "
            "benchmark was a part of."
        )
    )
    args: BenchmarkArgs = Field(
        description=(
            "The arguments used to specify how to run the benchmark and collect data."
        )
    )
    run_stats: BenchmarkRunStats = Field(
        description=(
            "The process statistics for the entire benchmark run across all requests."
        )
    )
    worker: Union[WorkerDescription] = Field(
        description=(
            "The description and specifics for the worker used to resolve requests "
            "for this benchmark."
        ),
    )
    request_loader: Union[RequestLoaderDescription] = Field(
        description=(
            "The description and specifics for the request loader used to create "
            "requests for this benchmark."
        ),
    )
    extras: dict[str, Any] = Field(
        description=(
            "Any additional information or metadata that was passed for this benchmark."
        )
    )
    metrics: BenchmarkMetrics = Field(
        description=(
            "The metrics for the benchmark run represented as a distribution of "
            "various per-request statistics."
        ),
    )


BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)


class GenerativeTextResponseStats(StandardBaseModel):
    """
    A serializable model representing the request values, response values, and
    statistics for a generative text response.
    """

    type_: Literal["generative_text_response"] = "generative_text_response"
    request_id: Optional[str] = Field(
        description="The unique identifier for the request.",
    )
    request_type: Literal["text_completions", "chat_completions"] = Field(
        description="The type of request made to the generative backend."
    )
    scheduler_info: SchedulerRequestInfo = Field(
        description=(
            "The info about the request from the scheduler about how it was run."
        ),
    )
    prompt: str = Field(
        description="The text prompt used for the generative request.",
    )
    output: str = Field(
        description="The generated text output from the generative request.",
    )
    prompt_tokens: int = Field(
        description="The number of tokens in the prompt text.",
    )
    output_tokens: int = Field(
        description="The number of tokens in the generated output text.",
    )
    start_time: float = Field(
        description="The time the request started.",
    )
    end_time: float = Field(
        description="The time the request ended.",
    )
    first_token_time: float = Field(
        description="The time the first token was received.",
    )
    last_token_time: float = Field(
        description="The time the last token was received.",
    )

    @computed_field  # type: ignore[misc]
    @property
    def request_latency(self) -> float:
        """
        :return: The duration of the request in seconds from the start to the end.
        """
        return self.end_time - self.start_time

    @computed_field  # type: ignore[misc]
    @property
    def time_to_first_token_ms(self) -> float:
        """
        :return: The time in milliseconds from the start of the request to the first
            token received.
        """
        return 1000 * (self.first_token_time - self.start_time)

    @computed_field  # type: ignore[misc]
    @property
    def time_per_output_token_ms(self) -> float:
        """
        :return: The average time in milliseconds per output token generated.
            This includes the time to generate the first token and all other tokens.
        """
        if self.output_tokens == 0:
            return 0.0

        return (
            1000 * (self.last_token_time - self.first_token_time) / self.output_tokens
        )

    @computed_field  # type: ignore[misc]
    @property
    def inter_token_latency_ms(self) -> float:
        """
        :return: The average time in milliseconds between generating tokens in the
            output text. Note, does not include the time to generate the first token.
        """
        if self.output_tokens <= 1:
            return 0.0

        return (
            1000
            * (self.last_token_time - self.first_token_time)
            / (self.output_tokens - 1)
        )

    @computed_field  # type: ignore[misc]
    @property
    def tokens_per_second(self) -> float:
        """
        :return: The average number of tokens generated per second in the prompt and
            output text.
        """
        if (latency := self.request_latency) == 0.0:
            return 0.0

        return (self.prompt_tokens + self.output_tokens) / latency

    @computed_field  # type: ignore[misc]
    @property
    def output_tokens_per_second(self) -> float:
        """
        :return: The average number of output tokens generated per second.
        """
        if (latency := self.request_latency) == 0.0:
            return 0.0

        return self.output_tokens / latency


class GenerativeTextErrorStats(GenerativeTextResponseStats):
    """
    A serializable model representing the request values, response values, and
    statistics for a generative text response that errored.
    Extends and overrides the GenerativeTextResponseStats model to include the
    error message and optional properties given the error occurred.
    """

    type_: Literal["generative_text_error"] = "generative_text_error"  # type: ignore[assignment]
    error: str = Field(
        description=(
            "The error message for the error that occurred while making the request."
        )
    )
    output: Optional[str] = Field(  # type: ignore[assignment]
        default=None,
        description=(
            "The generated text output from the generative request, if any, "
            "before the error occurred."
        ),
    )
    first_token_time: Optional[float] = Field(  # type: ignore[assignment]
        default=None,
        description=(
            "The time the first token was received, if any, before the error occurred."
        ),
    )
    last_token_time: Optional[float] = Field(  # type: ignore[assignment]
        default=None,
        description=(
            "The time the last token was received, if any, before the error occurred."
        ),
    )

    @computed_field  # type: ignore[misc]
    @property
    def time_to_first_token_ms(self) -> Optional[float]:  # type: ignore[override]
        """
        :return: The time in milliseconds from the start of the request to the first
            token received. None if the first token was not received.
        """
        if self.first_token_time is None:
            return None

        return super().time_to_first_token_ms

    @computed_field  # type: ignore[misc]
    @property
    def time_per_output_token_ms(self) -> Optional[float]:  # type: ignore[override]
        """
        :return: The average time in milliseconds per output token generated.
            This includes the time to generate the first token and all other tokens.
            None if the output_tokens is None or 0.
        """
        if (
            self.output_tokens is None
            or self.output_tokens == 0
            or self.first_token_time is None
            or self.last_token_time is None
        ):
            return None

        return super().time_per_output_token_ms

    @computed_field  # type: ignore[misc]
    @property
    def inter_token_latency_ms(self) -> Optional[float]:  # type: ignore[override]
        """
        :return: The average time in milliseconds between generating tokens in the
            output text. Note, does not include the time to generate the first token.
            None if there were no output_tokens or the first token was not received.
        """
        if (
            self.output_tokens is None
            or self.first_token_time is None
            or self.last_token_time is None
        ):
            return None

        return super().inter_token_latency_ms

    @computed_field  # type: ignore[misc]
    @property
    def output_tokens_per_second(self) -> Optional[float]:  # type: ignore[override]
        """
        :return: The average number of tokens generated per second in the output text.
            Note, does not include the time to generate the first token. None if there
            were no output_tokens or the first token was not received.
        """
        if self.inter_token_latency_ms is None:
            return None

        return super().output_tokens_per_second
```
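The computed properties above are the core per-request metric definitions in the removed module. As a quick sanity check, here is a minimal, self-contained sketch of the same formulas; all timestamps and token counts are invented for illustration:

```python
# Invented timings (seconds) and token counts for one streamed request.
start_time, first_token_time, last_token_time, end_time = 10.00, 10.25, 12.25, 12.30
prompt_tokens, output_tokens = 32, 101

request_latency = end_time - start_time                                  # 2.30 s
time_to_first_token_ms = 1000 * (first_token_time - start_time)          # 250.0 ms
# Span from first to last token divided by all output tokens
# (cf. time_per_output_token_ms above):
time_per_output_token_ms = (
    1000 * (last_token_time - first_token_time) / output_tokens          # ~19.8 ms
)
# Average gap between consecutive tokens, excluding the first token:
inter_token_latency_ms = (
    1000 * (last_token_time - first_token_time) / (output_tokens - 1)    # 20.0 ms
)
tokens_per_second = (prompt_tokens + output_tokens) / request_latency    # ~57.8
output_tokens_per_second = output_tokens / request_latency               # ~43.9
```

The remainder of the removed module defined the distribution-valued metrics and the GenerativeBenchmark model built on top of these per-request stats: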
```python
class GenerativeMetrics(BenchmarkMetrics):
    """
    A serializable model representing the metrics for a generative benchmark run.
    """

    request_latency: StatusDistributionSummary = Field(
        description="The distribution of latencies for the completed requests.",
    )
    prompt_token_count: StatusDistributionSummary = Field(
        description=(
            "The distribution of token counts in the prompts for completed, "
            "errored, and all requests."
        )
    )
    output_token_count: StatusDistributionSummary = Field(
        description=(
            "The distribution of token counts in the outputs for completed, "
            "errored, and all requests."
        )
    )
    time_to_first_token_ms: StatusDistributionSummary = Field(
        description=(
            "The distribution of latencies to receiving the first token in "
            "milliseconds for completed, errored, and all requests."
        ),
    )
    time_per_output_token_ms: StatusDistributionSummary = Field(
        description=(
            "The distribution of latencies per output token in milliseconds for "
            "completed, errored, and all requests. "
            "This includes the time to generate the first token and all other tokens."
        ),
    )
    inter_token_latency_ms: StatusDistributionSummary = Field(
        description=(
            "The distribution of latencies between tokens in milliseconds for "
            "completed, errored, and all requests."
        ),
    )
    output_tokens_per_second: StatusDistributionSummary = Field(
        description=(
            "The distribution of output tokens per second for completed, "
            "errored, and all requests."
        ),
    )
    tokens_per_second: StatusDistributionSummary = Field(
        description=(
            "The distribution of tokens per second, including prompt and output tokens "
            "for completed, errored, and all requests."
        ),
    )


class GenerativeBenchmark(Benchmark):
    """
    A serializable model representing a benchmark run and its results for generative
    requests and responses. Includes the completed and errored requests, the start
    and end times for the benchmark, and the statistics for the requests and responses.
    """

    type_: Literal["generative_benchmark"] = "generative_benchmark"  # type: ignore[assignment]
    start_time: float = Field(
        description="The start time of the first request for the benchmark.",
    )
    end_time: float = Field(
        description="The end time of the last request for the benchmark.",
    )

    @computed_field  # type: ignore[misc]
    @property
    def duration(self) -> float:
        """
        :return: The duration of the benchmark in seconds from the start of the
            first request to the end of the last request.
        """
        return self.end_time - self.start_time

    worker: GenerativeRequestsWorkerDescription = Field(
        description=(
            "The description and specifics for the worker used to resolve requests "
            "for this benchmark."
        ),
    )
    request_loader: GenerativeRequestLoaderDescription = Field(
        description=(
            "The description and specifics for the request loader used to create "
            "requests for this benchmark."
        ),
    )
    metrics: GenerativeMetrics = Field(
        description=(
            "The metrics for the benchmark run represented as a distribution of "
            "various per-request statistics."
        ),
    )
    # Output is ordered so keep the requests at the end for better readability in files
    request_totals: StatusBreakdown[int, int, int, int] = Field(
        description=(
            "The number of requests made for the benchmark broken down by status "
            "including successful, incomplete, errored, and the sum of all three"
        )
    )
    request_samples: Optional[StatusBreakdown[int, int, int, None]] = Field(
        description=(
            "The number of requests that were randomly sampled for "
            "the benchmark. None if no sampling was applied."
        ),
        default=None,
    )
    requests: StatusBreakdown[
        list[GenerativeTextResponseStats],
        list[GenerativeTextErrorStats],
        list[GenerativeTextErrorStats],
        None,
    ] = Field(
        description=(
            "The breakdown of requests for the benchmark run including successful, "
            "incomplete, and errored requests."
        ),
    )

    def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
        """
        Set the sample size for the benchmark. This will randomly sample the
        requests for each status type to the given sample size or the maximum
        number of requests for that status type, whichever is smaller.
        This is applied to requests.successful, requests.errored, and
        requests.incomplete.
        If None, no sampling is applied and the state is kept.

        :param sample_size: The number of requests to sample for each status type.
        :return: The benchmark with the sampled requests.
        :raises ValueError: If the sample size is invalid.
        """

        if sample_size is not None:
            if sample_size < 0 or not isinstance(sample_size, int):
                raise ValueError(
                    f"Sample size must be non-negative integer, given {sample_size}"
                )

            sample_size = min(sample_size, len(self.requests.successful))
            error_sample_size = min(sample_size, len(self.requests.errored))
            incomplete_sample_size = min(sample_size, len(self.requests.incomplete))

            self.requests.successful = random.sample(
                self.requests.successful, sample_size
            )
            self.requests.errored = random.sample(
                self.requests.errored, error_sample_size
            )
            self.requests.incomplete = random.sample(
                self.requests.incomplete, incomplete_sample_size
            )
            self.request_samples = StatusBreakdown(
                successful=len(self.requests.successful),
                incomplete=len(self.requests.incomplete),
                errored=len(self.requests.errored),
            )

        return self

    @staticmethod
    def from_stats(
        run_id: str,
        successful: list[GenerativeTextResponseStats],
        incomplete: list[GenerativeTextErrorStats],
        errored: list[GenerativeTextErrorStats],
        args: BenchmarkArgs,
        run_stats: BenchmarkRunStats,
        worker: GenerativeRequestsWorkerDescription,
        requests_loader: GenerativeRequestLoaderDescription,
        extras: Optional[dict[str, Any]],
    ) -> "GenerativeBenchmark":
        """
        Create a GenerativeBenchmark instance from the given statistics and metadata.
        Given the completed and errored requests, the benchmark will fill in the
        remaining statistics for the various metrics required for a benchmark.
        This is the preferred method for creating a GenerativeBenchmark instance
        to ensure all statistics are properly calculated and populated.

        :param run_id: The unique identifier for the benchmark run.
        :param completed: The list of completed requests.
        :param errored: The list of errored requests.
        :param args: The arguments used to specify how to run the benchmark
            and collect data.
        :param run_stats: The process statistics for the entire benchmark run across
            all requests.
        :param worker: The description and specifics for the worker used to resolve
            requests.
        :param requests_loader: The description and specifics for the request loader
            used to create requests.
        :param extras: Any additional information or metadata that was passed for
            this benchmark.
        :return: A GenerativeBenchmark instance with the given statistics and metadata
            populated and calculated
        """
        total = successful + incomplete + errored
        total_types: list[Literal["successful", "incomplete", "error"]] = [
            *["successful"] * len(successful),  # type: ignore[list-item]
            *["incomplete"] * len(incomplete),  # type: ignore[list-item]
            *["error"] * len(errored),  # type: ignore[list-item]
        ]
        start_time = min(req.start_time for req in total)
        end_time = max(req.end_time for req in total)

        total_with_prompt, total_types_with_prompt = (
            zip(*filtered)
            if (
                filtered := list(
                    filter(lambda val: bool(val[0].prompt), zip(total, total_types))
                )
            )
            else ([], [])
        )
        total_with_output_first, total_types_with_output_first = (
            zip(*filtered)
            if (
                filtered := list(
                    filter(
                        lambda val: bool(val[0].output_tokens > 0),
                        zip(total, total_types),
                    )
                )
            )
            else ([], [])
        )
        total_with_output_multi, total_types_with_output_multi = (
            zip(*filtered)
            if (
                filtered := list(
                    filter(
                        lambda val: bool(val[0].output_tokens > 1),
                        zip(total, total_types),
                    )
                )
            )
            else ([], [])
        )

        return GenerativeBenchmark(
            run_id=run_id,
            args=args,
            run_stats=run_stats,
            extras=extras or {},
            start_time=start_time,
            end_time=end_time,
            worker=worker,
            request_loader=requests_loader,
            metrics=GenerativeMetrics(
                requests_per_second=StatusDistributionSummary.from_request_times(
                    request_types=total_types,
                    requests=[(req.start_time, req.end_time) for req in total],
                    distribution_type="rate",
                ),
                request_concurrency=StatusDistributionSummary.from_request_times(
                    request_types=total_types,
                    requests=[(req.start_time, req.end_time) for req in total],
                    distribution_type="concurrency",
                ),
                request_latency=StatusDistributionSummary.from_values(
                    value_types=total_types,
                    values=[req.request_latency for req in total],
                ),
                prompt_token_count=StatusDistributionSummary.from_values(
                    value_types=list(total_types_with_prompt),
                    values=[req.prompt_tokens for req in total_with_prompt],
                ),
                output_token_count=StatusDistributionSummary.from_values(
                    value_types=list(total_types_with_output_first),
                    values=[req.output_tokens for req in total_with_output_first],
                ),
                time_to_first_token_ms=StatusDistributionSummary.from_values(
                    value_types=list(total_types_with_output_first),
                    values=[
                        req.time_to_first_token_ms or 0
                        for req in total_with_output_first
                    ],
                ),
                time_per_output_token_ms=StatusDistributionSummary.from_values(
                    value_types=list(total_types_with_output_first),
                    values=[
                        req.time_per_output_token_ms or 0
                        for req in total_with_output_first
                    ],
                    weights=[req.output_tokens for req in total_with_output_first],
                ),
                inter_token_latency_ms=StatusDistributionSummary.from_values(
                    value_types=list(total_types_with_output_multi),
                    values=[
                        req.inter_token_latency_ms or 0
                        for req in total_with_output_multi
                    ],
                    weights=[req.output_tokens - 1 for req in total_with_output_multi],
                ),
                output_tokens_per_second=StatusDistributionSummary.from_iterable_request_times(
                    request_types=list(total_types_with_output_first),
                    requests=[
                        (req.start_time, req.end_time)
                        for req in total_with_output_first
                    ],
                    first_iter_times=[
                        req.first_token_time or req.start_time
                        for req in total_with_output_first
                    ],
                    iter_counts=[req.output_tokens for req in total_with_output_first],
                ),
                tokens_per_second=StatusDistributionSummary.from_iterable_request_times(
                    request_types=list(total_types_with_output_first),
                    requests=[
                        (req.start_time, req.end_time)
                        for req in total_with_output_first
                    ],
                    first_iter_times=[
                        req.first_token_time or req.start_time
                        for req in total_with_output_first
                    ],
                    iter_counts=[req.output_tokens for req in total_with_output_first],
                    first_iter_counts=[
                        # prompt tokens + first token
                        req.prompt_tokens + 1
                        for req in total_with_output_first
                    ],
                ),
            ),
            request_totals=StatusBreakdown(
                successful=len(successful),
                incomplete=len(incomplete),
                errored=len(errored),
                total=len(total),
            ),
            requests=StatusBreakdown(
                successful=successful,
                incomplete=incomplete,
                errored=errored,
            ),
        )
```
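Two details of the removed GenerativeBenchmark are worth calling out. First, from_stats builds each cohort (requests with a prompt, with at least one output token, with more than one output token) through a zip/filter/walrus pattern that unzips back into parallel value and status tuples. A standalone sketch of just that pattern, with made-up dicts standing in for the request stat models:

```python
# Made-up stand-ins for the per-request stat models.
total = [{"output_tokens": 0}, {"output_tokens": 5}, {"output_tokens": 2}]
total_types = ["successful", "successful", "error"]

# Pair each request with its status, keep pairs passing the predicate,
# then unzip into parallel tuples; fall back to empties if nothing passes.
with_output, types_with_output = (
    zip(*filtered)
    if (
        filtered := list(
            filter(lambda val: val[0]["output_tokens"] > 0, zip(total, total_types))
        )
    )
    else ([], [])
)
print(with_output)        # ({'output_tokens': 5}, {'output_tokens': 2})
print(types_with_output)  # ('successful', 'error')
```

Second, set_sample_size down-samples each status bucket independently and records the kept counts in request_samples, so serialized results stay small while request_totals still reflects the full run. Note that the successful-bucket clamp reassigns sample_size itself, so the errored and incomplete buckets are clamped against that already-reduced value.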