guidellm 0.4.0a18__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of guidellm has been flagged in the registry.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a18.dist-info/RECORD +0 -62
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
guidellm/benchmark/aggregator.py
DELETED
@@ -1,760 +0,0 @@
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import (
    Any,
    Generic,
    Literal,
    Optional,
    TypeVar,
    Union,
)

from pydantic import Field

from guidellm.backend import ResponseSummary
from guidellm.benchmark.benchmark import (
    BenchmarkArgs,
    BenchmarkRunStats,
    BenchmarkT,
    GenerativeBenchmark,
    GenerativeTextErrorStats,
    GenerativeTextResponseStats,
)
from guidellm.config import settings
from guidellm.objects import (
    RunningStats,
    StandardBaseModel,
    StatusBreakdown,
    TimeRunningStats,
)
from guidellm.request import (
    GenerationRequest,
    GenerativeRequestLoaderDescription,
    RequestLoaderDescription,
    RequestT,
    ResponseT,
)
from guidellm.scheduler import (
    GenerativeRequestsWorkerDescription,
    SchedulerRequestResult,
    WorkerDescription,
)
from guidellm.utils import check_load_processor

__all__ = [
    "AggregatorT",
    "BenchmarkAggregator",
    "GenerativeBenchmarkAggregator",
]


class SchedulerRunningStats(StandardBaseModel):
    """
    The metrics for the scheduler stored as running statistics for easy calculations
    of rates, averages, totals, etc.
    """

    created_requests: RunningStats = Field(
        description=(
            "The running statistics for the number of requests created for this "
            "benchmark run. This includes all requests created, regardless of "
            "their status."
        ),
        default_factory=RunningStats,
    )
    queued_requests: RunningStats = Field(
        description=(
            "The running statistics for the number of requests pending in queue "
            "for this benchmark run. This includes requests that are waiting to "
            "be scheduled."
        ),
        default_factory=RunningStats,
    )
    scheduled_requests: RunningStats = Field(
        description=(
            "The running statistics for the number of requests scheduled (actively "
            "running but waiting for the desired start time) for this benchmark run."
        ),
        default_factory=RunningStats,
    )
    processing_requests: RunningStats = Field(
        description=(
            "The running statistics for the number of requests actively being "
            "processed by the worker for this benchmark run."
        ),
        default_factory=RunningStats,
    )
    completed_requests: RunningStats = Field(
        description=(
            "The running statistics for the number of requests completed for this "
            "benchmark run. This includes requests within the warmup and cooldown "
            "period, if any, along with the final results."
        ),
        default_factory=RunningStats,
    )


class RequestsRunningStats(StandardBaseModel):
    """
    The metrics for requests that have succeeded, been canceled, or errored, stored
    as running statistics for easy calculations of rates, averages, totals, etc.
    """

    totals: StatusBreakdown[RunningStats, RunningStats, RunningStats, RunningStats] = (
        Field(
            description=(
                "The running statistics for the total number of requests that "
                "completed within the benchmark run."
            ),
            default_factory=lambda: StatusBreakdown(
                successful=RunningStats(),
                errored=RunningStats(),
                incomplete=RunningStats(),
                total=RunningStats(),
            ),
        )
    )
    queued_time: TimeRunningStats = Field(
        description=(
            "The running statistics for the time spent in queue for all requests that "
            "completed within the benchmark run. This is the time from when the "
            "request was created to when it was dequeued by the worker."
        ),
        default_factory=TimeRunningStats,
    )
    scheduled_time_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the time spent from when a request was "
            "dequeued by the worker to when it was actually scheduled by the worker "
            "for all requests that completed within the benchmark run. "
            "This should be as close to 0 as possible; any additional time is "
            "overhead from the system or the worker."
        ),
        default_factory=TimeRunningStats,
    )
    scheduled_time_sleep: TimeRunningStats = Field(
        description=(
            "The running statistics for the time each request spent sleeping until "
            "the desired start time was reached for all requests that completed within "
            "the benchmark run. This is the time from when the request was scheduled "
            "to when the desired start time was reached."
        ),
        default_factory=TimeRunningStats,
    )
    worker_start_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the time delay between when the request was "
            "scheduled and when the worker actually started processing, subtracting any "
            "sleep time, for all requests that completed within the benchmark run. "
            "This should be as close to 0 as possible; any additional time is "
            "overhead from the system or the worker."
        ),
        default_factory=TimeRunningStats,
    )
    worker_time: TimeRunningStats = Field(
        description=(
            "The running statistics for the time spent processing all requests that "
            "completed within the benchmark run. This is the time from when the "
            "request was started to when it was completed."
        ),
        default_factory=TimeRunningStats,
    )
    worker_start_time_targeted_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the delay between the targeted start time and "
            "the actual start time for requests that completed within the benchmark "
            "run. This represents delays from the best case desired start time. "
            "For async strategies, this represents delays from the ideal system. "
            "For sync strategies, since those are doubled in queue, this should be "
            "as close as possible to the time for a request to be processed."
        ),
        default_factory=TimeRunningStats,
    )
    request_start_time_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the delay between the actual request being "
            "made and the time the worker started on the request for all requests "
            "that completed within the benchmark run. This time should be as close to "
            "0 as possible; any additional time is overhead from the system or "
            "the worker."
        ),
        default_factory=TimeRunningStats,
    )
    request_start_time_targeted_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the delay between the targeted start time and "
            "the actual start time for all requests that completed within the "
            "benchmark run. This represents delays from the best case desired start "
            "time. For async strategies, this represents delays from the ideal system. "
            "For sync strategies, since those are duplicated in queue, this should be "
            "as close as possible to the time for a request to be processed."
        ),
        default_factory=TimeRunningStats,
    )
    request_time_delay: TimeRunningStats = Field(
        description=(
            "The running statistics for the delay in time between the total request "
            "time and the worker time. This should be as close to 0 as possible; any "
            "additional time is overhead from the system or the worker."
        ),
        default_factory=TimeRunningStats,
    )
    request_time: TimeRunningStats = Field(
        description=(
            "The running statistics for the time spent processing all requests that "
            "completed within the benchmark run. This is the time from when the "
            "request was created to when it was completed."
        ),
        default_factory=TimeRunningStats,
    )


class BenchmarkAggregator(
    ABC, StandardBaseModel, Generic[BenchmarkT, RequestT, ResponseT]
):
    """
    The abstract pydantic base class for aggregating benchmark results.
    The purpose is to receive and process results from a Benchmarker as it iterates
    through a Scheduler for an individual benchmark run.
    As results are added, lightweight statistics are updated and stored for immediate
    progress and informational updates to the caller.
    Once the benchmark run is complete, the `compile` method is called to finalize
    the benchmark and return a Benchmark object with all the results and statistics
    fully calculated.
    """

    type_: Literal["benchmark_aggregator"] = "benchmark_aggregator"
    run_id: str = Field(
        description=(
            "The unique identifier for the encompassing benchmark run that this "
            "benchmark was a part of."
        )
    )
    args: BenchmarkArgs = Field(
        description=(
            "The arguments used to create the benchmark run that this benchmark was "
            "a part of."
        )
    )
    worker_description: Union[
        GenerativeRequestsWorkerDescription, WorkerDescription
    ] = Field(
        description=(
            "The description and specifics for the worker used to resolve requests "
            "for this benchmark."
        ),
        discriminator="type_",
    )
    request_loader_description: Union[
        GenerativeRequestLoaderDescription, RequestLoaderDescription
    ] = Field(
        description=(
            "The description and specifics for the request loader used to create "
            "requests for this benchmark."
        ),
        discriminator="type_",
    )
    extras: dict[str, Any] = Field(
        description=(
            "Any additional information or metadata that was passed for this benchmark."
        )
    )
    in_warmup: bool = Field(
        description=(
            "A flag to indicate if the benchmark is currently in the warmup phase."
        ),
        default=False,
        exclude=True,
    )
    in_cooldown: bool = Field(
        description=(
            "A flag to indicate if the benchmark is currently in the cooldown phase."
        ),
        default=False,
        exclude=True,
    )
    scheduler_stats: SchedulerRunningStats = Field(
        description=(
            "The running statistics for the scheduler for this benchmark run. "
            "This includes all requests created, regardless of their status."
        ),
        default_factory=SchedulerRunningStats,
    )
    requests_stats: RequestsRunningStats = Field(
        description=(
            "The running statistics for the requests for this benchmark run. "
            "This includes all requests created, regardless of their status."
        ),
        default_factory=RequestsRunningStats,
    )
    results: StatusBreakdown[
        list[SchedulerRequestResult[RequestT, ResponseT]],
        list[SchedulerRequestResult[RequestT, ResponseT]],
        list[SchedulerRequestResult[RequestT, ResponseT]],
        None,
    ] = Field(
        description=(
            "The completed requests for this benchmark run broken down by status "
            "and excluding warmup and cooldown requests."
        ),
        default_factory=lambda: StatusBreakdown(  # type: ignore[arg-type]
            successful=[],
            errored=[],
            incomplete=[],
            total=None,
        ),
    )

    def add_result(
        self,
        result: SchedulerRequestResult[RequestT, ResponseT],
    ) -> bool:
        """
        Add a result to the aggregator. This will update the internal statistics
        and add the result to the list of results if it is not within the warmup or
        cooldown period.

        :param result: The result to add to the aggregator.
        :return: True if the result was added, False if it was not because it
            did not fit within the warmup or cooldown period, was not requested,
            or is not finished.
        """
        # Add scheduler statistics
        self.scheduler_stats.created_requests += max(
            0, result.run_info.created_requests
        )
        self.scheduler_stats.queued_requests += max(0, result.run_info.queued_requests)
        self.scheduler_stats.scheduled_requests += max(
            0, result.run_info.scheduled_requests
        )
        self.scheduler_stats.processing_requests += max(
            0, result.run_info.processing_requests
        )
        self.scheduler_stats.completed_requests += max(
            0, result.run_info.completed_requests
        )

        if result.type_ != "request_complete" or (
            result.request_info.canceled and not result.request_info.requested
        ):
            # If the result is not completed yet, don't add to the results
            # If the result was canceled and not started, ignore it
            return False

        # Add request statistics
        self.requests_stats.totals.total += 1
        if result.request_info.canceled:
            self.requests_stats.totals.incomplete += 1
        elif result.request_info.errored:
            self.requests_stats.totals.errored += 1
        elif result.request_info.completed:
            self.requests_stats.totals.successful += 1
        else:
            raise ValueError(
                "Unexpected state: request_info must be either "
                "completed, canceled, or errored. "
                f"Got {result.request_info}"
            )

        self.requests_stats.queued_time.update(
            result.request_info.dequeued_time - result.request_info.queued_time
        )
        self.requests_stats.scheduled_time_delay.update(
            result.request_info.scheduled_time - result.request_info.dequeued_time
        )
        sleep_time = max(
            0.0,
            result.request_info.targeted_start_time
            - result.request_info.scheduled_time,
        )
        self.requests_stats.scheduled_time_sleep.update(sleep_time)
        time_to_worker_start = (
            result.request_info.worker_start - result.request_info.scheduled_time
        )
        self.requests_stats.worker_start_delay.update(time_to_worker_start - sleep_time)
        self.requests_stats.worker_time.update(
            result.request_info.worker_end - result.request_info.worker_start
        )
        self.requests_stats.worker_start_time_targeted_delay.update(
            result.request_info.worker_start - result.request_info.targeted_start_time
        )
        self.requests_stats.request_start_time_delay.update(
            result.request_info.worker_start - result.request_info.targeted_start_time
        )
        self.requests_stats.request_start_time_targeted_delay.update(
            result.request_info.worker_start - result.request_info.targeted_start_time
        )
        # NOTE: the two operands below are identical, so the base class records a
        # delay of 0 here; GenerativeBenchmarkAggregator recomputes this value from
        # the response timestamps.
        self.requests_stats.request_time_delay.update(
            (result.request_info.worker_end - result.request_info.worker_start)
            - (result.request_info.worker_end - result.request_info.worker_start)
        )
        self.requests_stats.request_time.update(
            result.request_info.worker_end - result.request_info.worker_start
        )

        # Add result to the list of results provided we are not in warmup or cooldown
        total_completed = self.requests_stats.totals.total.total
        global_start_time = self.requests_stats.totals.total.start_time

        in_warmup_number = (
            self.args.warmup_number and total_completed <= self.args.warmup_number
        )
        in_warmup_duration = (
            self.args.warmup_duration
            and result.request_info.worker_start
            <= (global_start_time + self.args.warmup_duration)
        )

        if in_warmup_number or in_warmup_duration:
            self.in_warmup = True
            return True

        self.in_warmup = False
        in_cooldown_number = (
            self.args.cooldown_number
            and self.args.max_number
            and total_completed > self.args.max_number - self.args.cooldown_number
        )
        in_cooldown_duration = (
            self.args.cooldown_duration
            and self.args.max_duration
            and result.request_info.worker_start
            > global_start_time + self.args.max_duration - self.args.cooldown_duration
        )

        if in_cooldown_number or in_cooldown_duration:
            self.in_cooldown = True
            return True

        self.in_cooldown = False

        if result.request_info.canceled:
            self.results.incomplete.append(result)
        elif result.request_info.errored:
            self.results.errored.append(result)
        elif result.request_info.completed:
            self.results.successful.append(result)
        else:
            raise ValueError(
                "Unexpected state: request_info must be either "
                "completed, canceled, or errored. "
                f"Got {result.request_info}"
            )

        return True

    @abstractmethod
    def compile(self) -> BenchmarkT:
        """
        Compile the benchmark results and statistics into a Benchmark object.
        This is required to be implemented by subclasses to finalize the benchmark
        and return the compiled object.
        """
        ...


AggregatorT = TypeVar("AggregatorT", bound=BenchmarkAggregator)


class GenerativeRequestsRunningStats(RequestsRunningStats):
    """
    The metrics for generative requests that have succeeded, been canceled, or errored,
    stored as running statistics for easy calculations of rates, averages, totals, etc.
    """

    time_to_first_token: TimeRunningStats = Field(
        description=(
            "The running statistics for the time from the start of the request to the "
            "first token being generated for all requests that completed within the "
            "benchmark run."
        ),
        default_factory=TimeRunningStats,
    )
    inter_token_latency: TimeRunningStats = Field(
        description=(
            "The running statistics for the time between each token being generated "
            "for all requests that completed within the benchmark run."
        ),
        default_factory=TimeRunningStats,
    )
    prompt_tokens: RunningStats = Field(
        description=(
            "The running statistics for the token count for the prompt for all "
            "requests that completed, if available in the response."
        ),
        default_factory=RunningStats,
    )
    output_tokens: RunningStats = Field(
        description=(
            "The running statistics for the token count for the output for all "
            "requests that completed, if available in the response."
        ),
        default_factory=RunningStats,
    )
    total_tokens: RunningStats = Field(
        description=(
            "The running statistics for the total token count for all requests that "
            "completed, if available in the response."
        ),
        default_factory=RunningStats,
    )


class GenerativeBenchmarkAggregator(
    BenchmarkAggregator[GenerativeBenchmark, GenerationRequest, ResponseSummary]
):
    type_: Literal["generative_benchmark_aggregator"] = (
        "generative_benchmark_aggregator"  # type: ignore[assignment]
    )
    processor: Optional[Union[str, Path, Any]] = Field(
        description=(
            "The tokenizer to use for calculating token counts when none are "
            "available that match the preferred source."
        )
    )
    processor_args: Optional[dict[str, Any]] = Field(
        description=(
            "Additional arguments to pass to the tokenizer if it requires "
            "any specific configuration for loading or processing."
        ),
    )
    worker_description: GenerativeRequestsWorkerDescription = Field(
        description=(
            "The description and specifics for the worker used to resolve requests "
            "for this benchmark."
        ),
        discriminator="type_",
    )
    request_loader_description: GenerativeRequestLoaderDescription = Field(
        description=(
            "The description and specifics for the request loader used to create "
            "requests for this benchmark."
        ),
        discriminator="type_",
    )
    requests_stats: GenerativeRequestsRunningStats = Field(
        description=(
            "The running statistics for the requests for this benchmark run. "
            "This includes all requests created, regardless of their status."
        ),
        default_factory=GenerativeRequestsRunningStats,
    )

    def add_result(
        self, result: SchedulerRequestResult[GenerationRequest, ResponseSummary]
    ) -> bool:
        """
        Add a result to the aggregator. This will update the internal statistics
        and add the result to the list of results if it is not within the warmup or
        cooldown period.

        :param result: The result to add to the aggregator.
        """
        if not super().add_result(result):
            return False

        if result.request is None:
            raise ValueError("Request is None, cannot add result.")

        if result.response is None:
            raise ValueError("Response is None, cannot add result.")

        self.requests_stats.request_start_time_delay.update(
            result.response.start_time - result.request_info.worker_start
        )
        self.requests_stats.request_start_time_targeted_delay.update(
            result.response.start_time - result.request_info.targeted_start_time
        )
        self.requests_stats.request_time_delay.update(
            (result.response.start_time - result.request_info.worker_start)
            + result.request_info.worker_end
            - result.response.end_time
        )
        self.requests_stats.request_time.update(
            result.response.end_time - result.response.start_time
        )
        if result.response.first_iter_time:
            self.requests_stats.time_to_first_token.update(
                result.response.first_iter_time - result.response.start_time
            )
        if result.response.last_iter_time and result.response.first_iter_time:
            self.requests_stats.inter_token_latency.update(
                result.response.last_iter_time - result.response.first_iter_time,
                count=(result.response.output_tokens or 1) - 1,
            )
        self.requests_stats.prompt_tokens += result.response.request_prompt_tokens or 0
        self.requests_stats.output_tokens += result.response.request_output_tokens or 0
        total_tokens = (result.response.request_prompt_tokens or 0) + (
            result.response.request_output_tokens or 0
        )
        self.requests_stats.total_tokens += total_tokens

        return True

    def compile(self) -> GenerativeBenchmark:
        """
        Compile the benchmark results and statistics into a GenerativeBenchmark object.
        This finalizes the benchmark and returns the compiled object.
        """
        successful, incomplete, errored = self._compile_results()

        return GenerativeBenchmark.from_stats(
            run_id=self.run_id,
            successful=successful,
            incomplete=incomplete,
            errored=errored,
            args=self.args,
            run_stats=BenchmarkRunStats(
                start_time=self.requests_stats.totals.total.start_time,
                end_time=time.time(),
                requests_made=StatusBreakdown(
                    successful=int(self.requests_stats.totals.successful.total),
                    errored=int(self.requests_stats.totals.errored.total),
                    incomplete=int(self.requests_stats.totals.incomplete.total),
                    total=int(self.requests_stats.totals.total.total),
                ),
                queued_time_avg=self.requests_stats.queued_time.mean,
                scheduled_time_delay_avg=self.requests_stats.scheduled_time_delay.mean,
                scheduled_time_sleep_avg=self.requests_stats.scheduled_time_sleep.mean,
                worker_start_delay_avg=self.requests_stats.worker_start_delay.mean,
                worker_time_avg=self.requests_stats.worker_time.mean,
                worker_start_time_targeted_delay_avg=self.requests_stats.worker_start_time_targeted_delay.mean,
                request_start_time_delay_avg=self.requests_stats.request_start_time_delay.mean,
                request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
                request_time_delay_avg=self.requests_stats.request_time_delay.mean,
                request_time_avg=self.requests_stats.request_time.mean,
            ),
            worker=self.worker_description,
            requests_loader=self.request_loader_description,
            extras=self.extras,
        )

    def _compile_results(
        self,
    ) -> tuple[
        list[GenerativeTextResponseStats],
        list[GenerativeTextErrorStats],
        list[GenerativeTextErrorStats],
    ]:
        successful: list[GenerativeTextResponseStats] = [
            GenerativeTextResponseStats(
                request_id=result.request.request_id,
                request_type=result.request.request_type,
                scheduler_info=result.request_info,
                prompt=str(result.request.content),
                prompt_tokens=self._compile_tokens_count(
                    value=str(result.request.content),
                    requests_tokens=result.response.request_prompt_tokens,
                    response_tokens=result.response.response_prompt_tokens,
                    preferred_tokens_source=settings.preferred_prompt_tokens_source,
                    errored=False,
                ),
                output=result.response.value,
                output_tokens=self._compile_tokens_count(
                    value=result.response.value,
                    requests_tokens=result.response.request_output_tokens,
                    response_tokens=result.response.response_output_tokens,
                    preferred_tokens_source=settings.preferred_output_tokens_source,
                    errored=False,
                ),
                start_time=result.response.start_time,
                end_time=result.response.end_time,
                first_token_time=result.response.first_iter_time or -1.0,
                last_token_time=result.response.last_iter_time or -1.0,
            )
            for result in self.results.successful
            if result.request and result.response
        ]
        incomplete: list[GenerativeTextErrorStats] = [
            GenerativeTextErrorStats(
                error=result.response.error or "",
                request_id=result.request.request_id,
                request_type=result.request.request_type,
                scheduler_info=result.request_info,
                prompt=str(result.request.content),
                prompt_tokens=self._compile_tokens_count(
                    value=str(result.request.content),
                    requests_tokens=result.response.request_prompt_tokens,
                    response_tokens=result.response.response_prompt_tokens,
                    preferred_tokens_source=settings.preferred_prompt_tokens_source,
                    errored=True,
                ),
                output=result.response.value,
                output_tokens=self._compile_tokens_count(
                    value=result.response.value,
                    requests_tokens=result.response.request_output_tokens,
                    response_tokens=result.response.response_output_tokens,
                    preferred_tokens_source=settings.preferred_output_tokens_source,
                    errored=True,
                ),
                start_time=result.response.start_time,
                end_time=result.response.end_time,
                first_token_time=result.response.first_iter_time,
                last_token_time=result.response.last_iter_time,
            )
            for result in self.results.incomplete
            if result.request and result.response
        ]
        error: list[GenerativeTextErrorStats] = [
            GenerativeTextErrorStats(
                error=result.response.error or "",
                request_id=result.request.request_id,
                request_type=result.request.request_type,
                scheduler_info=result.request_info,
                prompt=str(result.request.content),
                prompt_tokens=self._compile_tokens_count(
                    value=str(result.request.content),
                    requests_tokens=result.response.request_prompt_tokens,
                    response_tokens=result.response.response_prompt_tokens,
                    preferred_tokens_source=settings.preferred_prompt_tokens_source,
                    errored=True,
                ),
                output=result.response.value,
                output_tokens=self._compile_tokens_count(
                    value=result.response.value,
                    requests_tokens=result.response.request_output_tokens,
                    response_tokens=result.response.response_output_tokens,
                    preferred_tokens_source=settings.preferred_output_tokens_source,
                    errored=True,
                ),
                start_time=result.response.start_time,
                end_time=result.response.end_time,
                first_token_time=result.response.first_iter_time,
                last_token_time=result.response.last_iter_time,
            )
            for result in self.results.errored
            if result.request and result.response
        ]

        return successful, incomplete, error

    def _compile_tokens_count(
        self,
        value: str,
        requests_tokens: Optional[int],
        response_tokens: Optional[int],
        preferred_tokens_source: Optional[Literal["request", "response", "local"]],
        errored: bool,
    ) -> int:
        if not errored and preferred_tokens_source == "response" and response_tokens:
            return response_tokens or 0

        if not errored and preferred_tokens_source == "request" and requests_tokens:
            return requests_tokens or 0

        if preferred_tokens_source in {"response", "request"} and (
            self.processor is None or errored or response_tokens or requests_tokens
        ):
            # we had a preferred tokens source that isn't local and we either
            # have the data to return something or we don't have the ability
            # to calculate locally
            return response_tokens or requests_tokens or 0

        self.processor = check_load_processor(
            self.processor,
            processor_args=self.processor_args,
            error_msg="Processor/Tokenizer is required for calculating token counts.",
        )
        return len(self.processor.tokenize(value))