guidellm 0.4.0a18__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a18.dist-info/RECORD +0 -62
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,2085 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Benchmark data models and metrics for generative AI performance measurement.
|
|
3
|
+
|
|
4
|
+
Provides comprehensive data structures for capturing, storing, and analyzing
|
|
5
|
+
benchmark results from scheduler-driven generative AI workload executions.
|
|
6
|
+
Core abstractions include base benchmark interfaces, generative-specific
|
|
7
|
+
metrics with token/latency distributions, request-level statistics tracking,
|
|
8
|
+
and multi-benchmark reporting capabilities. These models enable detailed
|
|
9
|
+
performance analysis including throughput, latency, concurrency patterns, and
|
|
10
|
+
domain-specific metrics for text, image, video, and audio generation tasks.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import inspect
|
|
16
|
+
import json
|
|
17
|
+
import random
|
|
18
|
+
import time
|
|
19
|
+
import uuid
|
|
20
|
+
from abc import ABC, abstractmethod
|
|
21
|
+
from collections.abc import Callable, Iterable
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, ClassVar, Literal, TypeVar, cast
|
|
24
|
+
|
|
25
|
+
import yaml
|
|
26
|
+
from pydantic import ConfigDict, Field, computed_field, model_serializer
|
|
27
|
+
from torch.utils.data import Sampler
|
|
28
|
+
from transformers import PreTrainedTokenizerBase
|
|
29
|
+
|
|
30
|
+
from guidellm.backends import Backend, BackendType
|
|
31
|
+
from guidellm.benchmark.profile import Profile, ProfileType
|
|
32
|
+
from guidellm.benchmark.scenarios import get_builtin_scenarios
|
|
33
|
+
from guidellm.data import DatasetPreprocessor
|
|
34
|
+
from guidellm.scheduler import (
|
|
35
|
+
BackendInterface,
|
|
36
|
+
Environment,
|
|
37
|
+
SchedulerState,
|
|
38
|
+
SchedulingStrategy,
|
|
39
|
+
StrategyType,
|
|
40
|
+
)
|
|
41
|
+
from guidellm.schemas import (
|
|
42
|
+
GenerationRequest,
|
|
43
|
+
GenerationResponse,
|
|
44
|
+
GenerativeRequestStats,
|
|
45
|
+
RequestInfo,
|
|
46
|
+
UsageMetrics,
|
|
47
|
+
)
|
|
48
|
+
from guidellm.utils import (
|
|
49
|
+
InfoMixin,
|
|
50
|
+
StandardBaseDict,
|
|
51
|
+
StandardBaseModel,
|
|
52
|
+
StatusBreakdown,
|
|
53
|
+
StatusDistributionSummary,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
__all__ = [
|
|
57
|
+
"Benchmark",
|
|
58
|
+
"BenchmarkGenerativeTextArgs",
|
|
59
|
+
"BenchmarkSchedulerStats",
|
|
60
|
+
"BenchmarkT",
|
|
61
|
+
"BenchmarkerArgs",
|
|
62
|
+
"BenchmarkerDict",
|
|
63
|
+
"EstimatedBenchmarkState",
|
|
64
|
+
"GenerativeAudioMetricsSummary",
|
|
65
|
+
"GenerativeBenchmark",
|
|
66
|
+
"GenerativeBenchmarksReport",
|
|
67
|
+
"GenerativeImageMetricsSummary",
|
|
68
|
+
"GenerativeMetrics",
|
|
69
|
+
"GenerativeMetricsSummary",
|
|
70
|
+
"GenerativeTextMetricsSummary",
|
|
71
|
+
"GenerativeVideoMetricsSummary",
|
|
72
|
+
"SchedulerDict",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EstimatedBenchmarkState(dict[str, Any]):
|
|
77
|
+
"""
|
|
78
|
+
Accumulator for real-time benchmark metrics during scheduler execution.
|
|
79
|
+
|
|
80
|
+
Tracks incremental metrics, running averages, and time-based statistics as
|
|
81
|
+
requests are processed. Maintains grouped metrics for benchmark state,
|
|
82
|
+
benchmark-level metrics, and scheduler-level metrics with support for
|
|
83
|
+
average, rate, and time-averaged metric calculations.
|
|
84
|
+
|
|
85
|
+
:cvar benchmark_state_group: Metric group key for benchmark state tracking
|
|
86
|
+
:cvar benchmark_metrics_group: Metric group key for benchmark-level metrics
|
|
87
|
+
:cvar scheduler_state_group: Metric group key for scheduler-level metrics
|
|
88
|
+
"""
|
|
89
|
+
|
|
90
|
+
benchmark_state_group: ClassVar[Literal["benchmark_state"]] = "benchmark_state"
|
|
91
|
+
benchmark_metrics_group: ClassVar[Literal["benchmark_metrics"]] = (
|
|
92
|
+
"benchmark_metrics"
|
|
93
|
+
)
|
|
94
|
+
scheduler_state_group: ClassVar[Literal["scheduler_state"]] = "scheduler_state"
|
|
95
|
+
|
|
96
|
+
def get_metric(
|
|
97
|
+
self,
|
|
98
|
+
group: str,
|
|
99
|
+
key: str,
|
|
100
|
+
default: int | float | None = None,
|
|
101
|
+
) -> int | float | None:
|
|
102
|
+
"""
|
|
103
|
+
Retrieve a grouped metric value by group and key.
|
|
104
|
+
|
|
105
|
+
:param group: Metric group identifier
|
|
106
|
+
:param key: Metric key within the group
|
|
107
|
+
:param default: Value returned if metric doesn't exist
|
|
108
|
+
:return: The metric value or default if not found
|
|
109
|
+
"""
|
|
110
|
+
return self.get(f"{group}_{key}", default)
|
|
111
|
+
|
|
112
|
+
def set_metric(
|
|
113
|
+
self,
|
|
114
|
+
group: str,
|
|
115
|
+
key: str,
|
|
116
|
+
value: bool | int | float | None,
|
|
117
|
+
start_val: bool | int | float | None = None,
|
|
118
|
+
) -> bool | int | float | None:
|
|
119
|
+
"""
|
|
120
|
+
Set a grouped metric value, optionally adjusting by a starting value.
|
|
121
|
+
|
|
122
|
+
:param group: Metric group identifier
|
|
123
|
+
:param key: Metric key within the group
|
|
124
|
+
:param value: Metric value to set
|
|
125
|
+
:param start_val: Optional starting value to subtract from the metric value
|
|
126
|
+
:return: The adjusted metric value or None if value is None
|
|
127
|
+
"""
|
|
128
|
+
if value is None:
|
|
129
|
+
return None
|
|
130
|
+
|
|
131
|
+
if start_val is not None:
|
|
132
|
+
value -= start_val
|
|
133
|
+
self[f"{group}_{key}"] = value
|
|
134
|
+
|
|
135
|
+
return value
|
|
136
|
+
|
|
137
|
+
def add_avg_metric(
|
|
138
|
+
self,
|
|
139
|
+
group: str,
|
|
140
|
+
key: str,
|
|
141
|
+
value: bool | int | float | None,
|
|
142
|
+
start_val: bool | int | float | None = 0.0,
|
|
143
|
+
count: int | None = 1,
|
|
144
|
+
):
|
|
145
|
+
"""
|
|
146
|
+
Add a value to a running average metric calculation.
|
|
147
|
+
|
|
148
|
+
:param group: Metric group identifier
|
|
149
|
+
:param key: Metric key within the group
|
|
150
|
+
:param value: Value to add to the average
|
|
151
|
+
:param start_val: Optional starting value to subtract before adding
|
|
152
|
+
:param count: Number of observations this value represents
|
|
153
|
+
"""
|
|
154
|
+
if value is None or count is None:
|
|
155
|
+
return
|
|
156
|
+
|
|
157
|
+
if start_val is not None:
|
|
158
|
+
value -= start_val
|
|
159
|
+
|
|
160
|
+
total_key = f"{group}_{key}_total"
|
|
161
|
+
count_key = f"{group}_{key}_count"
|
|
162
|
+
self[total_key] = self.get(total_key, 0) + value
|
|
163
|
+
self[count_key] = self.get(count_key, 0) + count
|
|
164
|
+
|
|
165
|
+
average = self[total_key] / self[count_key] if self[count_key] > 0 else 0.0
|
|
166
|
+
self.set_metric(
|
|
167
|
+
group=group,
|
|
168
|
+
key=key,
|
|
169
|
+
value=average,
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
def add_avg_rate_metric(
|
|
173
|
+
self,
|
|
174
|
+
group: str,
|
|
175
|
+
key: str,
|
|
176
|
+
value: bool | int | float | None,
|
|
177
|
+
start_val: bool | int | float | None = 0.0,
|
|
178
|
+
start_time: float | None = None,
|
|
179
|
+
end_time: float | None = None,
|
|
180
|
+
numerator_type: Literal["avg", "total", "count"] = "total",
|
|
181
|
+
):
|
|
182
|
+
"""
|
|
183
|
+
Add a value to a rate-based average metric calculation.
|
|
184
|
+
|
|
185
|
+
:param group: Metric group identifier
|
|
186
|
+
:param key: Metric key within the group
|
|
187
|
+
:param value: Value to add to the average
|
|
188
|
+
:param start_val: Optional starting value to subtract before adding
|
|
189
|
+
:param start_time: Start time for rate calculation, defaults to current time
|
|
190
|
+
:param end_time: End time for rate calculation, defaults to current time
|
|
191
|
+
:param numerator_type: Type of numerator for rate calculation
|
|
192
|
+
"""
|
|
193
|
+
if value is None:
|
|
194
|
+
return
|
|
195
|
+
|
|
196
|
+
self.add_avg_metric(
|
|
197
|
+
group=group,
|
|
198
|
+
key=key,
|
|
199
|
+
value=value,
|
|
200
|
+
start_val=start_val,
|
|
201
|
+
)
|
|
202
|
+
start_time_key = f"{group}_{key}_start_time"
|
|
203
|
+
if self.get(start_time_key) is None:
|
|
204
|
+
if start_time is None:
|
|
205
|
+
start_time = time.time()
|
|
206
|
+
self[start_time_key] = start_time
|
|
207
|
+
else:
|
|
208
|
+
self[start_time_key] = start_time or self[start_time_key]
|
|
209
|
+
|
|
210
|
+
end_time = end_time or time.time()
|
|
211
|
+
elapsed_time = end_time - self[start_time_key]
|
|
212
|
+
|
|
213
|
+
if elapsed_time > 0:
|
|
214
|
+
numerator_key = (
|
|
215
|
+
f"{group}_{key}_{numerator_type}"
|
|
216
|
+
if numerator_type != "avg"
|
|
217
|
+
else f"{group}_{key}"
|
|
218
|
+
)
|
|
219
|
+
rate = self[numerator_key] / elapsed_time
|
|
220
|
+
self.set_metric(
|
|
221
|
+
group=group,
|
|
222
|
+
key=f"{key}_per_second",
|
|
223
|
+
value=rate,
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
def add_time_averaged_metric(
|
|
227
|
+
self,
|
|
228
|
+
group: str,
|
|
229
|
+
key: str,
|
|
230
|
+
value: bool | int | float | None,
|
|
231
|
+
recorded_time: float | None = None,
|
|
232
|
+
):
|
|
233
|
+
"""
|
|
234
|
+
Add a value to a time-weighted average metric calculation.
|
|
235
|
+
|
|
236
|
+
:param group: Metric group identifier
|
|
237
|
+
:param key: Metric key within the group
|
|
238
|
+
:param value: Value to add to the time-weighted average
|
|
239
|
+
:param recorded_time: Time of the observation, defaults to current time
|
|
240
|
+
"""
|
|
241
|
+
if value is None:
|
|
242
|
+
return
|
|
243
|
+
|
|
244
|
+
if recorded_time is None:
|
|
245
|
+
recorded_time = time.time()
|
|
246
|
+
|
|
247
|
+
time_avg_numerator_key = f"{group}_{key}_time_avg_numerator"
|
|
248
|
+
time_avg_denominator_key = f"{group}_{key}_time_avg_denominator"
|
|
249
|
+
last_recorded_time_key = f"{group}_{key}_last_recorded_time"
|
|
250
|
+
last_recorded_value_key = f"{group}_{key}_last_recorded_value"
|
|
251
|
+
|
|
252
|
+
if last_recorded_time_key not in self:
|
|
253
|
+
self[last_recorded_time_key] = recorded_time
|
|
254
|
+
self[last_recorded_value_key] = value
|
|
255
|
+
self[time_avg_numerator_key] = value
|
|
256
|
+
self[time_avg_denominator_key] = 0.0
|
|
257
|
+
else:
|
|
258
|
+
time_delta = recorded_time - self[last_recorded_time_key]
|
|
259
|
+
self[time_avg_numerator_key] += self[last_recorded_value_key] * time_delta
|
|
260
|
+
self[time_avg_denominator_key] += time_delta
|
|
261
|
+
self[last_recorded_time_key] = recorded_time
|
|
262
|
+
self[last_recorded_value_key] = value
|
|
263
|
+
|
|
264
|
+
if self[time_avg_denominator_key] > 0:
|
|
265
|
+
average = self[time_avg_numerator_key] / self[time_avg_denominator_key]
|
|
266
|
+
else:
|
|
267
|
+
average = value
|
|
268
|
+
|
|
269
|
+
self.set_metric(
|
|
270
|
+
group=group,
|
|
271
|
+
key=key,
|
|
272
|
+
value=average,
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
class BenchmarkerArgs(StandardBaseDict):
|
|
277
|
+
"""
|
|
278
|
+
Configuration parameters for benchmark execution and request sampling.
|
|
279
|
+
|
|
280
|
+
Defines run identification, request sampling strategy, warmup/cooldown phases,
|
|
281
|
+
and metric preferences for benchmark executions. Provides methods to determine
|
|
282
|
+
whether a request falls within warmup or cooldown periods based on time,
|
|
283
|
+
request count, or percentage-based thresholds.
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
run_id: str = Field(
|
|
287
|
+
default_factory=lambda: str(uuid.uuid4()),
|
|
288
|
+
description="Unique identifier for the benchmark run",
|
|
289
|
+
)
|
|
290
|
+
run_index: int = Field(default=0, description="Index of the benchmark run")
|
|
291
|
+
sample_requests: int | None = Field(
|
|
292
|
+
default=20,
|
|
293
|
+
description=(
|
|
294
|
+
"Number of requests to sample and keep in the final benchmark for metrics"
|
|
295
|
+
),
|
|
296
|
+
)
|
|
297
|
+
warmup: int | float | None = Field(
|
|
298
|
+
default=None, description="Warmup time before benchmarking starts"
|
|
299
|
+
)
|
|
300
|
+
cooldown: int | float | None = Field(
|
|
301
|
+
default=None, description="Cooldown time after benchmarking ends"
|
|
302
|
+
)
|
|
303
|
+
prefer_response_metrics: bool = Field(
|
|
304
|
+
default=True,
|
|
305
|
+
description="Whether to prefer response metrics over request metrics",
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
def is_in_warmup(
|
|
309
|
+
self, request_info: RequestInfo, scheduler_state: SchedulerState
|
|
310
|
+
) -> bool:
|
|
311
|
+
"""
|
|
312
|
+
Check if a request is in the warmup phase.
|
|
313
|
+
|
|
314
|
+
:param request_info: Information about the current request
|
|
315
|
+
:param scheduler_state: Current state of the scheduler
|
|
316
|
+
:return: True if the request is in warmup phase, False otherwise
|
|
317
|
+
"""
|
|
318
|
+
if self.warmup is not None and 0 < self.warmup < 1:
|
|
319
|
+
# Percentage-based warmup
|
|
320
|
+
return (
|
|
321
|
+
scheduler_state.remaining_fraction is not None
|
|
322
|
+
and scheduler_state.remaining_fraction > (1 - self.warmup)
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
if self.warmup is not None and self.warmup > 1:
|
|
326
|
+
# Count/time-based warmup
|
|
327
|
+
if scheduler_state.processed_requests < self.warmup:
|
|
328
|
+
return True
|
|
329
|
+
|
|
330
|
+
current_time = request_info.timings.targeted_start
|
|
331
|
+
return (
|
|
332
|
+
current_time is not None
|
|
333
|
+
and (current_time - scheduler_state.start_time) < self.warmup
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
return False
|
|
337
|
+
|
|
338
|
+
def is_in_cooldown(
|
|
339
|
+
self, request_info: RequestInfo, scheduler_state: SchedulerState
|
|
340
|
+
) -> bool:
|
|
341
|
+
"""
|
|
342
|
+
Check if a request is in the cooldown phase.
|
|
343
|
+
|
|
344
|
+
:param request_info: Information about the current request
|
|
345
|
+
:param scheduler_state: Current state of the scheduler
|
|
346
|
+
:return: True if the request is in cooldown phase, False otherwise
|
|
347
|
+
"""
|
|
348
|
+
if self.cooldown is not None and 0 < self.cooldown < 1:
|
|
349
|
+
# Percentage-based cooldown
|
|
350
|
+
return (
|
|
351
|
+
scheduler_state.remaining_fraction is not None
|
|
352
|
+
and scheduler_state.remaining_fraction < self.cooldown
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
if self.cooldown is not None and self.cooldown > 1:
|
|
356
|
+
# Count/time-based cooldown
|
|
357
|
+
if (
|
|
358
|
+
scheduler_state.remaining_requests is not None
|
|
359
|
+
and scheduler_state.remaining_requests <= self.cooldown
|
|
360
|
+
):
|
|
361
|
+
return True
|
|
362
|
+
|
|
363
|
+
current_time = (
|
|
364
|
+
request_info.timings.resolve_end or request_info.timings.targeted_start
|
|
365
|
+
)
|
|
366
|
+
return (
|
|
367
|
+
current_time is not None
|
|
368
|
+
and scheduler_state.remaining_duration is not None
|
|
369
|
+
and scheduler_state.remaining_duration < self.cooldown
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
return False
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
class Benchmark(ABC):
|
|
376
|
+
"""
|
|
377
|
+
Abstract base interface for benchmark result implementations.
|
|
378
|
+
|
|
379
|
+
Defines the contract for benchmark classes to provide run metrics sampling,
|
|
380
|
+
request metrics sampling, real-time estimate updates, and final compilation
|
|
381
|
+
of benchmark results from scheduler execution data.
|
|
382
|
+
"""
|
|
383
|
+
|
|
384
|
+
@abstractmethod
|
|
385
|
+
def get_run_metrics_sample(
|
|
386
|
+
self,
|
|
387
|
+
) -> dict[Literal["start_time", "end_time", "duration"], float]:
|
|
388
|
+
"""
|
|
389
|
+
Get a sample of run-level timing metrics.
|
|
390
|
+
|
|
391
|
+
:return: Dictionary containing start_time, end_time, and duration metrics
|
|
392
|
+
"""
|
|
393
|
+
...
|
|
394
|
+
|
|
395
|
+
@abstractmethod
|
|
396
|
+
def get_request_metrics_sample(
|
|
397
|
+
self,
|
|
398
|
+
) -> dict[
|
|
399
|
+
Literal[
|
|
400
|
+
"request_count",
|
|
401
|
+
"request_latency",
|
|
402
|
+
"request_throughput",
|
|
403
|
+
"request_concurrency",
|
|
404
|
+
],
|
|
405
|
+
float,
|
|
406
|
+
]:
|
|
407
|
+
"""
|
|
408
|
+
Get a sample of request-level performance metrics.
|
|
409
|
+
|
|
410
|
+
:return: Dictionary containing request count, latency, throughput, and
|
|
411
|
+
concurrency metrics
|
|
412
|
+
"""
|
|
413
|
+
...
|
|
414
|
+
|
|
415
|
+
@classmethod
|
|
416
|
+
@abstractmethod
|
|
417
|
+
def update_estimate(
|
|
418
|
+
cls,
|
|
419
|
+
args: BenchmarkerArgs,
|
|
420
|
+
state: EstimatedBenchmarkState,
|
|
421
|
+
response: Any,
|
|
422
|
+
request: Any,
|
|
423
|
+
request_info: RequestInfo,
|
|
424
|
+
scheduler_state: SchedulerState,
|
|
425
|
+
):
|
|
426
|
+
"""
|
|
427
|
+
Update real-time benchmark estimates with new request data.
|
|
428
|
+
|
|
429
|
+
:param args: Benchmark configuration arguments
|
|
430
|
+
:param state: Current estimated benchmark state to update
|
|
431
|
+
:param response: Response received from the backend
|
|
432
|
+
:param request: Original request sent to the backend
|
|
433
|
+
:param request_info: Metadata about the request execution
|
|
434
|
+
:param scheduler_state: Current state of the scheduler
|
|
435
|
+
"""
|
|
436
|
+
...
|
|
437
|
+
|
|
438
|
+
@classmethod
|
|
439
|
+
@abstractmethod
|
|
440
|
+
def compile(
|
|
441
|
+
cls,
|
|
442
|
+
args: BenchmarkerArgs,
|
|
443
|
+
estimated_state: EstimatedBenchmarkState,
|
|
444
|
+
scheduler_state: SchedulerState,
|
|
445
|
+
profile: Profile,
|
|
446
|
+
requests: Iterable,
|
|
447
|
+
backend: BackendInterface,
|
|
448
|
+
environment: Environment,
|
|
449
|
+
strategy: SchedulingStrategy,
|
|
450
|
+
constraints: dict[str, dict[str, Any]],
|
|
451
|
+
) -> Any:
|
|
452
|
+
"""
|
|
453
|
+
Compile final benchmark results from accumulated state.
|
|
454
|
+
|
|
455
|
+
:param args: Benchmark configuration arguments
|
|
456
|
+
:param estimated_state: Accumulated benchmark state from execution
|
|
457
|
+
:param scheduler_state: Final state of the scheduler
|
|
458
|
+
:param profile: Benchmark profile configuration
|
|
459
|
+
:param requests: Collection of requests executed
|
|
460
|
+
:param backend: Backend interface used for execution
|
|
461
|
+
:param environment: Execution environment configuration
|
|
462
|
+
:param strategy: Scheduling strategy used
|
|
463
|
+
:param constraints: Execution constraints applied
|
|
464
|
+
:return: Compiled benchmark results instance
|
|
465
|
+
"""
|
|
466
|
+
...
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
class BenchmarkSchedulerStats(StandardBaseDict):
|
|
473
|
+
"""Scheduler timing and performance statistics."""
|
|
474
|
+
|
|
475
|
+
group_name: ClassVar[Literal["scheduler_stats"]] = "scheduler_stats"
|
|
476
|
+
|
|
477
|
+
start_time: float = Field(
|
|
478
|
+
description="Unix timestamp when the benchmark run started"
|
|
479
|
+
)
|
|
480
|
+
end_time: float = Field(description="Unix timestamp when the benchmark run ended")
|
|
481
|
+
requests_made: StatusBreakdown[int, int, int, int] = Field(
|
|
482
|
+
description="Request counts by status: successful, incomplete, errored, total"
|
|
483
|
+
)
|
|
484
|
+
queued_time_avg: float = Field(
|
|
485
|
+
description="Avg time requests spent in the queue (seconds)"
|
|
486
|
+
)
|
|
487
|
+
worker_resolve_start_delay_avg: float = Field(
|
|
488
|
+
description="Avg delay before worker begins resolving req after dequeue (sec)"
|
|
489
|
+
)
|
|
490
|
+
worker_resolve_time_avg: float = Field(
|
|
491
|
+
description="Avg time for worker to resolve requests (seconds)"
|
|
492
|
+
)
|
|
493
|
+
worker_resolve_end_delay_avg: float = Field(
|
|
494
|
+
description="Avg delay after request end till worker resolves (seconds)"
|
|
495
|
+
)
|
|
496
|
+
finalized_delay_avg: float = Field(
|
|
497
|
+
description="Avg delay after resolve til finalized with in scheduler (sec)"
|
|
498
|
+
)
|
|
499
|
+
worker_targeted_start_delay_avg: float = Field(
|
|
500
|
+
description="Avg delay from targeted start to actual worker start (seconds)"
|
|
501
|
+
)
|
|
502
|
+
request_start_delay_avg: float = Field(
|
|
503
|
+
description="Avg delay after resolve til request start (seconds)"
|
|
504
|
+
)
|
|
505
|
+
request_time_avg: float = Field(description="Avg request processing time (seconds)")
|
|
506
|
+
request_targeted_start_delay_avg: float = Field(
|
|
507
|
+
description="Avg delay from targeted start to actual request start"
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
@classmethod
|
|
511
|
+
def update_estimate(cls, state: EstimatedBenchmarkState, request_info: RequestInfo):
|
|
512
|
+
"""
|
|
513
|
+
Update estimated scheduler statistics with request timing information.
|
|
514
|
+
|
|
515
|
+
:param state: Current estimated benchmark state to update
|
|
516
|
+
:param request_info: Metadata about the request execution with timing data
|
|
517
|
+
"""
|
|
518
|
+
state.set_metric(group=cls.group_name, key="updated", value=True)
|
|
519
|
+
state.add_avg_metric(
|
|
520
|
+
group=cls.group_name,
|
|
521
|
+
key="queued_time",
|
|
522
|
+
value=request_info.timings.dequeued,
|
|
523
|
+
start_val=request_info.timings.queued,
|
|
524
|
+
)
|
|
525
|
+
state.add_avg_metric(
|
|
526
|
+
group=cls.group_name,
|
|
527
|
+
key="worker_resolve_start_delay",
|
|
528
|
+
value=request_info.timings.resolve_start,
|
|
529
|
+
start_val=request_info.timings.scheduled_at,
|
|
530
|
+
)
|
|
531
|
+
state.add_avg_metric(
|
|
532
|
+
group=cls.group_name,
|
|
533
|
+
key="worker_resolve_time",
|
|
534
|
+
value=request_info.timings.resolve_end,
|
|
535
|
+
start_val=request_info.timings.resolve_start,
|
|
536
|
+
)
|
|
537
|
+
state.add_avg_metric(
|
|
538
|
+
group=cls.group_name,
|
|
539
|
+
key="worker_resolve_end_delay",
|
|
540
|
+
value=request_info.timings.request_end,
|
|
541
|
+
start_val=request_info.timings.resolve_end,
|
|
542
|
+
)
|
|
543
|
+
state.add_avg_metric(
|
|
544
|
+
group=cls.group_name,
|
|
545
|
+
key="finalized_delay",
|
|
546
|
+
value=request_info.timings.finalized,
|
|
547
|
+
start_val=request_info.timings.resolve_end,
|
|
548
|
+
)
|
|
549
|
+
state.add_avg_metric(
|
|
550
|
+
group=cls.group_name,
|
|
551
|
+
key="worker_targeted_start_delay",
|
|
552
|
+
value=request_info.timings.resolve_start,
|
|
553
|
+
start_val=request_info.timings.targeted_start,
|
|
554
|
+
)
|
|
555
|
+
state.add_avg_metric(
|
|
556
|
+
group=cls.group_name,
|
|
557
|
+
key="request_start_delay",
|
|
558
|
+
value=request_info.timings.request_start,
|
|
559
|
+
start_val=request_info.timings.resolve_start,
|
|
560
|
+
)
|
|
561
|
+
state.add_avg_metric(
|
|
562
|
+
group=cls.group_name,
|
|
563
|
+
key="request_time",
|
|
564
|
+
value=request_info.timings.request_end,
|
|
565
|
+
start_val=request_info.timings.request_start,
|
|
566
|
+
)
|
|
567
|
+
state.add_avg_metric(
|
|
568
|
+
group=cls.group_name,
|
|
569
|
+
key="request_targeted_start_delay",
|
|
570
|
+
value=request_info.timings.request_start,
|
|
571
|
+
start_val=request_info.timings.targeted_start,
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
@classmethod
|
|
575
|
+
def compile(
|
|
576
|
+
cls, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState
|
|
577
|
+
) -> BenchmarkSchedulerStats:
|
|
578
|
+
"""
|
|
579
|
+
Compile final scheduler statistics from accumulated state.
|
|
580
|
+
|
|
581
|
+
:param estimated_state: Accumulated benchmark state with scheduler metrics
|
|
582
|
+
:param scheduler_state: Final state of the scheduler
|
|
583
|
+
:return: Compiled scheduler statistics instance
|
|
584
|
+
"""
|
|
585
|
+
return BenchmarkSchedulerStats(
|
|
586
|
+
start_time=scheduler_state.start_time,
|
|
587
|
+
end_time=scheduler_state.end_time or scheduler_state.start_time,
|
|
588
|
+
requests_made=StatusBreakdown[int, int, int, int](
|
|
589
|
+
successful=scheduler_state.successful_requests,
|
|
590
|
+
incomplete=scheduler_state.cancelled_requests,
|
|
591
|
+
errored=scheduler_state.errored_requests,
|
|
592
|
+
total=(
|
|
593
|
+
scheduler_state.successful_requests
|
|
594
|
+
+ scheduler_state.cancelled_requests
|
|
595
|
+
+ scheduler_state.errored_requests
|
|
596
|
+
),
|
|
597
|
+
),
|
|
598
|
+
queued_time_avg=cast(
|
|
599
|
+
"float",
|
|
600
|
+
estimated_state.get_metric(
|
|
601
|
+
group=cls.group_name, key="queued_time", default=-1.0
|
|
602
|
+
),
|
|
603
|
+
),
|
|
604
|
+
worker_resolve_start_delay_avg=cast(
|
|
605
|
+
"float",
|
|
606
|
+
estimated_state.get_metric(
|
|
607
|
+
group=cls.group_name, key="worker_resolve_start_delay", default=-1.0
|
|
608
|
+
),
|
|
609
|
+
),
|
|
610
|
+
worker_resolve_time_avg=cast(
|
|
611
|
+
"float",
|
|
612
|
+
estimated_state.get_metric(
|
|
613
|
+
group=cls.group_name, key="worker_resolve_time", default=-1.0
|
|
614
|
+
),
|
|
615
|
+
),
|
|
616
|
+
worker_resolve_end_delay_avg=cast(
|
|
617
|
+
"float",
|
|
618
|
+
estimated_state.get_metric(
|
|
619
|
+
group=cls.group_name, key="worker_resolve_end_delay", default=-1.0
|
|
620
|
+
),
|
|
621
|
+
),
|
|
622
|
+
finalized_delay_avg=cast(
|
|
623
|
+
"float",
|
|
624
|
+
estimated_state.get_metric(
|
|
625
|
+
group=cls.group_name, key="finalized_delay", default=-1.0
|
|
626
|
+
),
|
|
627
|
+
),
|
|
628
|
+
worker_targeted_start_delay_avg=cast(
|
|
629
|
+
"float",
|
|
630
|
+
estimated_state.get_metric(
|
|
631
|
+
group=cls.group_name,
|
|
632
|
+
key="worker_targeted_start_delay",
|
|
633
|
+
default=-1.0,
|
|
634
|
+
),
|
|
635
|
+
),
|
|
636
|
+
request_start_delay_avg=cast(
|
|
637
|
+
"float",
|
|
638
|
+
estimated_state.get_metric(
|
|
639
|
+
group=cls.group_name, key="request_start_delay", default=-1.0
|
|
640
|
+
),
|
|
641
|
+
),
|
|
642
|
+
request_time_avg=cast(
|
|
643
|
+
"float",
|
|
644
|
+
estimated_state.get_metric(
|
|
645
|
+
group=cls.group_name, key="request_time", default=-1.0
|
|
646
|
+
),
|
|
647
|
+
),
|
|
648
|
+
request_targeted_start_delay_avg=cast(
|
|
649
|
+
"float",
|
|
650
|
+
estimated_state.get_metric(
|
|
651
|
+
group=cls.group_name,
|
|
652
|
+
key="request_targeted_start_delay",
|
|
653
|
+
default=-1.0,
|
|
654
|
+
),
|
|
655
|
+
),
|
|
656
|
+
)
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
class GenerativeMetricsSummary(StandardBaseDict):
|
|
660
|
+
"""
|
|
661
|
+
Statistical summaries for input, output, and total metrics.
|
|
662
|
+
|
|
663
|
+
Provides distribution summaries across successful, incomplete, and errored
|
|
664
|
+
requests for absolute values, per-second rates, and concurrency levels.
|
|
665
|
+
"""
|
|
666
|
+
|
|
667
|
+
input: StatusDistributionSummary = Field(
|
|
668
|
+
description="Distribution of input metric values"
|
|
669
|
+
)
|
|
670
|
+
input_per_second: StatusDistributionSummary = Field(
|
|
671
|
+
description="Distribution of input metric rates per second"
|
|
672
|
+
)
|
|
673
|
+
input_concurrency: StatusDistributionSummary = Field(
|
|
674
|
+
description="Distribution of concurrent input metric values"
|
|
675
|
+
)
|
|
676
|
+
|
|
677
|
+
output: StatusDistributionSummary = Field(
|
|
678
|
+
description="Distribution of output metric values"
|
|
679
|
+
)
|
|
680
|
+
output_per_second: StatusDistributionSummary = Field(
|
|
681
|
+
description="Distribution of output metric rates per second"
|
|
682
|
+
)
|
|
683
|
+
output_concurrency: StatusDistributionSummary = Field(
|
|
684
|
+
description="Distribution of concurrent output metric values"
|
|
685
|
+
)
|
|
686
|
+
|
|
687
|
+
total: StatusDistributionSummary = Field(
|
|
688
|
+
description="Distribution of total metric values (input + output)"
|
|
689
|
+
)
|
|
690
|
+
total_per_second: StatusDistributionSummary = Field(
|
|
691
|
+
description="Distribution of total metric rates per second"
|
|
692
|
+
)
|
|
693
|
+
total_concurrency: StatusDistributionSummary = Field(
|
|
694
|
+
description="Distribution of concurrent total metric values"
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
@classmethod
|
|
698
|
+
def compile(
|
|
699
|
+
cls,
|
|
700
|
+
request_types: list[Literal["successful", "incomplete", "error"]],
|
|
701
|
+
request_times: list[tuple[float, float]],
|
|
702
|
+
input_values: list[int | float],
|
|
703
|
+
output_values: list[int | float],
|
|
704
|
+
) -> GenerativeMetricsSummary:
|
|
705
|
+
"""
|
|
706
|
+
Compile generative metrics summary from request data.
|
|
707
|
+
|
|
708
|
+
:param request_types: Status types for each request
|
|
709
|
+
:param request_times: Start and end times for each request
|
|
710
|
+
:param input_values: Input metric values for each request
|
|
711
|
+
:param output_values: Output metric values for each request
|
|
712
|
+
:return: Compiled generative metrics summary
|
|
713
|
+
"""
|
|
714
|
+
total_values = [
|
|
715
|
+
input_val + output_val
|
|
716
|
+
for input_val, output_val in zip(input_values, output_values, strict=False)
|
|
717
|
+
]
|
|
718
|
+
|
|
719
|
+
return GenerativeMetricsSummary(
|
|
720
|
+
input=StatusDistributionSummary.from_values(
|
|
721
|
+
value_types=request_types,
|
|
722
|
+
values=input_values,
|
|
723
|
+
),
|
|
724
|
+
input_per_second=StatusDistributionSummary.from_request_times(
|
|
725
|
+
request_types=request_types,
|
|
726
|
+
requests=request_times,
|
|
727
|
+
distribution_type="rate",
|
|
728
|
+
weights=input_values,
|
|
729
|
+
),
|
|
730
|
+
input_concurrency=StatusDistributionSummary.from_request_times(
|
|
731
|
+
request_types=request_types,
|
|
732
|
+
requests=request_times,
|
|
733
|
+
distribution_type="concurrency",
|
|
734
|
+
weights=input_values,
|
|
735
|
+
),
|
|
736
|
+
output=StatusDistributionSummary.from_values(
|
|
737
|
+
value_types=request_types,
|
|
738
|
+
values=output_values,
|
|
739
|
+
),
|
|
740
|
+
output_per_second=StatusDistributionSummary.from_request_times(
|
|
741
|
+
request_types=request_types,
|
|
742
|
+
requests=request_times,
|
|
743
|
+
distribution_type="rate",
|
|
744
|
+
weights=output_values,
|
|
745
|
+
),
|
|
746
|
+
output_concurrency=StatusDistributionSummary.from_request_times(
|
|
747
|
+
request_types=request_types,
|
|
748
|
+
requests=request_times,
|
|
749
|
+
distribution_type="concurrency",
|
|
750
|
+
weights=output_values,
|
|
751
|
+
),
|
|
752
|
+
total=StatusDistributionSummary.from_values(
|
|
753
|
+
value_types=request_types,
|
|
754
|
+
values=total_values,
|
|
755
|
+
),
|
|
756
|
+
total_per_second=StatusDistributionSummary.from_request_times(
|
|
757
|
+
request_types=request_types,
|
|
758
|
+
requests=request_times,
|
|
759
|
+
distribution_type="rate",
|
|
760
|
+
weights=total_values,
|
|
761
|
+
),
|
|
762
|
+
total_concurrency=StatusDistributionSummary.from_request_times(
|
|
763
|
+
request_types=request_types,
|
|
764
|
+
requests=request_times,
|
|
765
|
+
distribution_type="concurrency",
|
|
766
|
+
weights=total_values,
|
|
767
|
+
),
|
|
768
|
+
)
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
class GenerativeTextMetricsSummary(StandardBaseDict):
|
|
772
|
+
"""
|
|
773
|
+
Text-specific metric summaries for generative benchmarks.
|
|
774
|
+
|
|
775
|
+
Tracks token, word, and character-level metrics across input, output, and
|
|
776
|
+
total usage for text generation workloads.
|
|
777
|
+
"""
|
|
778
|
+
|
|
779
|
+
tokens: GenerativeMetricsSummary = Field(
|
|
780
|
+
description="Token count metrics and distributions"
|
|
781
|
+
)
|
|
782
|
+
words: GenerativeMetricsSummary = Field(
|
|
783
|
+
description="Word count metrics and distributions"
|
|
784
|
+
)
|
|
785
|
+
characters: GenerativeMetricsSummary = Field(
|
|
786
|
+
description="Character count metrics and distributions"
|
|
787
|
+
)
|
|
788
|
+
|
|
789
|
+
@classmethod
|
|
790
|
+
def compile(
|
|
791
|
+
cls,
|
|
792
|
+
request_types: list[Literal["successful", "incomplete", "error"]],
|
|
793
|
+
request_times: list[tuple[float, float]],
|
|
794
|
+
input_metrics: list[UsageMetrics],
|
|
795
|
+
output_metrics: list[UsageMetrics],
|
|
796
|
+
) -> GenerativeTextMetricsSummary:
|
|
797
|
+
"""
|
|
798
|
+
Compile text metrics summary from request usage data.
|
|
799
|
+
|
|
800
|
+
:param request_types: Status types for each request
|
|
801
|
+
:param request_times: Start and end times for each request
|
|
802
|
+
:param input_metrics: Input usage metrics for each request
|
|
803
|
+
:param output_metrics: Output usage metrics for each request
|
|
804
|
+
:return: Compiled text metrics summary
|
|
805
|
+
"""
|
|
806
|
+
return GenerativeTextMetricsSummary(
|
|
807
|
+
tokens=GenerativeMetricsSummary.compile(
|
|
808
|
+
request_types=request_types,
|
|
809
|
+
request_times=request_times,
|
|
810
|
+
input_values=[metrics.text_tokens or 0 for metrics in input_metrics],
|
|
811
|
+
output_values=[metrics.text_tokens or 0 for metrics in output_metrics],
|
|
812
|
+
),
|
|
813
|
+
words=GenerativeMetricsSummary.compile(
|
|
814
|
+
request_types=request_types,
|
|
815
|
+
request_times=request_times,
|
|
816
|
+
input_values=[metrics.text_words or 0 for metrics in input_metrics],
|
|
817
|
+
output_values=[metrics.text_words or 0 for metrics in output_metrics],
|
|
818
|
+
),
|
|
819
|
+
characters=GenerativeMetricsSummary.compile(
|
|
820
|
+
request_types=request_types,
|
|
821
|
+
request_times=request_times,
|
|
822
|
+
input_values=[
|
|
823
|
+
metrics.text_characters or 0 for metrics in input_metrics
|
|
824
|
+
],
|
|
825
|
+
output_values=[
|
|
826
|
+
metrics.text_characters or 0 for metrics in output_metrics
|
|
827
|
+
],
|
|
828
|
+
),
|
|
829
|
+
)
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
class GenerativeImageMetricsSummary(StandardBaseDict):
|
|
833
|
+
"""
|
|
834
|
+
Image-specific metric summaries for generative benchmarks.
|
|
835
|
+
|
|
836
|
+
Tracks token, image count, pixel, and byte-level metrics across input, output,
|
|
837
|
+
and total usage for image generation workloads.
|
|
838
|
+
"""
|
|
839
|
+
|
|
840
|
+
tokens: GenerativeMetricsSummary = Field(
|
|
841
|
+
description="Image token count metrics and distributions"
|
|
842
|
+
)
|
|
843
|
+
images: GenerativeMetricsSummary = Field(
|
|
844
|
+
description="Image count metrics and distributions"
|
|
845
|
+
)
|
|
846
|
+
pixels: GenerativeMetricsSummary = Field(
|
|
847
|
+
description="Pixel count metrics and distributions"
|
|
848
|
+
)
|
|
849
|
+
bytes: GenerativeMetricsSummary = Field(
|
|
850
|
+
description="Byte size metrics and distributions"
|
|
851
|
+
)
|
|
852
|
+
|
|
853
|
+
@classmethod
|
|
854
|
+
def compile(
|
|
855
|
+
cls,
|
|
856
|
+
request_types: list[Literal["successful", "incomplete", "error"]],
|
|
857
|
+
request_times: list[tuple[float, float]],
|
|
858
|
+
input_metrics: list[UsageMetrics],
|
|
859
|
+
output_metrics: list[UsageMetrics],
|
|
860
|
+
) -> GenerativeImageMetricsSummary:
|
|
861
|
+
"""
|
|
862
|
+
Compile image metrics summary from request usage data.
|
|
863
|
+
|
|
864
|
+
:param request_types: Status types for each request
|
|
865
|
+
:param request_times: Start and end times for each request
|
|
866
|
+
:param input_metrics: Input usage metrics for each request
|
|
867
|
+
:param output_metrics: Output usage metrics for each request
|
|
868
|
+
:return: Compiled image metrics summary
|
|
869
|
+
"""
|
|
870
|
+
return GenerativeImageMetricsSummary(
|
|
871
|
+
tokens=GenerativeMetricsSummary.compile(
|
|
872
|
+
request_types=request_types,
|
|
873
|
+
request_times=request_times,
|
|
874
|
+
input_values=[metrics.image_tokens or 0 for metrics in input_metrics],
|
|
875
|
+
output_values=[metrics.image_tokens or 0 for metrics in output_metrics],
|
|
876
|
+
),
|
|
877
|
+
images=GenerativeMetricsSummary.compile(
|
|
878
|
+
request_types=request_types,
|
|
879
|
+
request_times=request_times,
|
|
880
|
+
input_values=[metrics.image_count or 0 for metrics in input_metrics],
|
|
881
|
+
output_values=[metrics.image_count or 0 for metrics in output_metrics],
|
|
882
|
+
),
|
|
883
|
+
pixels=GenerativeMetricsSummary.compile(
|
|
884
|
+
request_types=request_types,
|
|
885
|
+
request_times=request_times,
|
|
886
|
+
input_values=[metrics.image_pixels or 0 for metrics in input_metrics],
|
|
887
|
+
output_values=[metrics.image_pixels or 0 for metrics in output_metrics],
|
|
888
|
+
),
|
|
889
|
+
bytes=GenerativeMetricsSummary.compile(
|
|
890
|
+
request_types=request_types,
|
|
891
|
+
request_times=request_times,
|
|
892
|
+
input_values=[metrics.image_bytes or 0 for metrics in input_metrics],
|
|
893
|
+
output_values=[metrics.image_bytes or 0 for metrics in output_metrics],
|
|
894
|
+
),
|
|
895
|
+
)
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
class GenerativeVideoMetricsSummary(StandardBaseDict):
|
|
899
|
+
"""
|
|
900
|
+
Video-specific metric summaries for generative benchmarks.
|
|
901
|
+
|
|
902
|
+
Tracks token, frame count, duration, and byte-level metrics across input,
|
|
903
|
+
output, and total usage for video generation workloads.
|
|
904
|
+
"""
|
|
905
|
+
|
|
906
|
+
tokens: GenerativeMetricsSummary = Field(
|
|
907
|
+
description="Video token count metrics and distributions"
|
|
908
|
+
)
|
|
909
|
+
frames: GenerativeMetricsSummary = Field(
|
|
910
|
+
description="Frame count metrics and distributions"
|
|
911
|
+
)
|
|
912
|
+
seconds: GenerativeMetricsSummary = Field(
|
|
913
|
+
description="Duration metrics in seconds and distributions"
|
|
914
|
+
)
|
|
915
|
+
bytes: GenerativeMetricsSummary = Field(
|
|
916
|
+
description="Byte size metrics and distributions"
|
|
917
|
+
)
|
|
918
|
+
|
|
919
|
+
@classmethod
|
|
920
|
+
def compile(
|
|
921
|
+
cls,
|
|
922
|
+
request_types: list[Literal["successful", "incomplete", "error"]],
|
|
923
|
+
request_times: list[tuple[float, float]],
|
|
924
|
+
input_metrics: list[UsageMetrics],
|
|
925
|
+
output_metrics: list[UsageMetrics],
|
|
926
|
+
) -> GenerativeVideoMetricsSummary:
|
|
927
|
+
"""
|
|
928
|
+
Compile video metrics summary from request usage data.
|
|
929
|
+
|
|
930
|
+
:param request_types: Status types for each request
|
|
931
|
+
:param request_times: Start and end times for each request
|
|
932
|
+
:param input_metrics: Input usage metrics for each request
|
|
933
|
+
:param output_metrics: Output usage metrics for each request
|
|
934
|
+
:return: Compiled video metrics summary
|
|
935
|
+
"""
|
|
936
|
+
return GenerativeVideoMetricsSummary(
|
|
937
|
+
tokens=GenerativeMetricsSummary.compile(
|
|
938
|
+
request_types=request_types,
|
|
939
|
+
request_times=request_times,
|
|
940
|
+
input_values=[metrics.video_tokens or 0 for metrics in input_metrics],
|
|
941
|
+
output_values=[metrics.video_tokens or 0 for metrics in output_metrics],
|
|
942
|
+
),
|
|
943
|
+
frames=GenerativeMetricsSummary.compile(
|
|
944
|
+
request_types=request_types,
|
|
945
|
+
request_times=request_times,
|
|
946
|
+
input_values=[metrics.video_frames or 0 for metrics in input_metrics],
|
|
947
|
+
output_values=[metrics.video_frames or 0 for metrics in output_metrics],
|
|
948
|
+
),
|
|
949
|
+
seconds=GenerativeMetricsSummary.compile(
|
|
950
|
+
request_types=request_types,
|
|
951
|
+
request_times=request_times,
|
|
952
|
+
input_values=[metrics.video_seconds or 0 for metrics in input_metrics],
|
|
953
|
+
output_values=[
|
|
954
|
+
metrics.video_seconds or 0 for metrics in output_metrics
|
|
955
|
+
],
|
|
956
|
+
),
|
|
957
|
+
bytes=GenerativeMetricsSummary.compile(
|
|
958
|
+
request_types=request_types,
|
|
959
|
+
request_times=request_times,
|
|
960
|
+
input_values=[metrics.video_bytes or 0 for metrics in input_metrics],
|
|
961
|
+
output_values=[metrics.video_bytes or 0 for metrics in output_metrics],
|
|
962
|
+
),
|
|
963
|
+
)
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
class GenerativeAudioMetricsSummary(StandardBaseDict):
|
|
967
|
+
"""
|
|
968
|
+
Audio-specific metric summaries for generative benchmarks.
|
|
969
|
+
|
|
970
|
+
Tracks token, sample count, duration, and byte-level metrics across input,
|
|
971
|
+
output, and total usage for audio generation workloads.
|
|
972
|
+
"""
|
|
973
|
+
|
|
974
|
+
tokens: GenerativeMetricsSummary = Field(
|
|
975
|
+
description="Audio token count metrics and distributions"
|
|
976
|
+
)
|
|
977
|
+
samples: GenerativeMetricsSummary = Field(
|
|
978
|
+
description="Sample count metrics and distributions"
|
|
979
|
+
)
|
|
980
|
+
seconds: GenerativeMetricsSummary = Field(
|
|
981
|
+
description="Duration metrics in seconds and distributions"
|
|
982
|
+
)
|
|
983
|
+
bytes: GenerativeMetricsSummary = Field(
|
|
984
|
+
description="Byte size metrics and distributions"
|
|
985
|
+
)
|
|
986
|
+
|
|
987
|
+
@classmethod
|
|
988
|
+
def compile(
|
|
989
|
+
cls,
|
|
990
|
+
request_types: list[Literal["successful", "incomplete", "error"]],
|
|
991
|
+
request_times: list[tuple[float, float]],
|
|
992
|
+
input_metrics: list[UsageMetrics],
|
|
993
|
+
output_metrics: list[UsageMetrics],
|
|
994
|
+
) -> GenerativeAudioMetricsSummary:
|
|
995
|
+
"""
|
|
996
|
+
Compile audio metrics summary from request usage data.
|
|
997
|
+
|
|
998
|
+
:param request_types: Status types for each request
|
|
999
|
+
:param request_times: Start and end times for each request
|
|
1000
|
+
:param input_metrics: Input usage metrics for each request
|
|
1001
|
+
:param output_metrics: Output usage metrics for each request
|
|
1002
|
+
:return: Compiled audio metrics summary
|
|
1003
|
+
"""
|
|
1004
|
+
return GenerativeAudioMetricsSummary(
|
|
1005
|
+
tokens=GenerativeMetricsSummary.compile(
|
|
1006
|
+
request_types=request_types,
|
|
1007
|
+
request_times=request_times,
|
|
1008
|
+
input_values=[metrics.audio_tokens or 0 for metrics in input_metrics],
|
|
1009
|
+
output_values=[metrics.audio_tokens or 0 for metrics in output_metrics],
|
|
1010
|
+
),
|
|
1011
|
+
samples=GenerativeMetricsSummary.compile(
|
|
1012
|
+
request_types=request_types,
|
|
1013
|
+
request_times=request_times,
|
|
1014
|
+
input_values=[metrics.audio_samples or 0 for metrics in input_metrics],
|
|
1015
|
+
output_values=[
|
|
1016
|
+
metrics.audio_samples or 0 for metrics in output_metrics
|
|
1017
|
+
],
|
|
1018
|
+
),
|
|
1019
|
+
seconds=GenerativeMetricsSummary.compile(
|
|
1020
|
+
request_types=request_types,
|
|
1021
|
+
request_times=request_times,
|
|
1022
|
+
input_values=[metrics.audio_seconds or 0 for metrics in input_metrics],
|
|
1023
|
+
output_values=[
|
|
1024
|
+
metrics.audio_seconds or 0 for metrics in output_metrics
|
|
1025
|
+
],
|
|
1026
|
+
),
|
|
1027
|
+
bytes=GenerativeMetricsSummary.compile(
|
|
1028
|
+
request_types=request_types,
|
|
1029
|
+
request_times=request_times,
|
|
1030
|
+
input_values=[metrics.audio_bytes or 0 for metrics in input_metrics],
|
|
1031
|
+
output_values=[metrics.audio_bytes or 0 for metrics in output_metrics],
|
|
1032
|
+
),
|
|
1033
|
+
)
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
class GenerativeMetrics(StandardBaseDict):
|
|
1037
|
+
"""Comprehensive metrics for generative AI benchmarks."""
|
|
1038
|
+
|
|
1039
|
+
# Request stats
|
|
1040
|
+
requests_per_second: StatusDistributionSummary = Field(
|
|
1041
|
+
description="Distribution of requests per second across benchmark execution"
|
|
1042
|
+
)
|
|
1043
|
+
request_concurrency: StatusDistributionSummary = Field(
|
|
1044
|
+
description="Distribution of concurrent request counts during execution"
|
|
1045
|
+
)
|
|
1046
|
+
request_latency: StatusDistributionSummary = Field(
|
|
1047
|
+
description="Distribution of request latencies for completed requests"
|
|
1048
|
+
)
|
|
1049
|
+
request_streaming_iterations_count: StatusDistributionSummary = Field(
|
|
1050
|
+
description="Distribution of stream iterations for completed requests"
|
|
1051
|
+
)
|
|
1052
|
+
|
|
1053
|
+
# General token stats
|
|
1054
|
+
prompt_token_count: StatusDistributionSummary = Field(
|
|
1055
|
+
description="Distribution of prompt token counts by request status"
|
|
1056
|
+
)
|
|
1057
|
+
output_token_count: StatusDistributionSummary = Field(
|
|
1058
|
+
description="Distribution of output token counts by request status"
|
|
1059
|
+
)
|
|
1060
|
+
total_token_count: StatusDistributionSummary = Field(
|
|
1061
|
+
description="Distribution of total token counts by request status"
|
|
1062
|
+
)
|
|
1063
|
+
time_to_first_token_ms: StatusDistributionSummary = Field(
|
|
1064
|
+
description="Distribution of first token latencies in milliseconds"
|
|
1065
|
+
)
|
|
1066
|
+
time_per_output_token_ms: StatusDistributionSummary = Field(
|
|
1067
|
+
description="Distribution of average time per output token in milliseconds"
|
|
1068
|
+
)
|
|
1069
|
+
inter_token_latency_ms: StatusDistributionSummary = Field(
|
|
1070
|
+
description="Distribution of inter-token latencies in milliseconds"
|
|
1071
|
+
)
|
|
1072
|
+
output_tokens_wo_first_per_iteration: StatusDistributionSummary = Field(
|
|
1073
|
+
description=(
|
|
1074
|
+
"Distribution of output tokens (without first) generated per "
|
|
1075
|
+
"streaming iteration"
|
|
1076
|
+
)
|
|
1077
|
+
)
|
|
1078
|
+
output_tokens_per_second: StatusDistributionSummary = Field(
|
|
1079
|
+
description="Distribution of output token generation rates"
|
|
1080
|
+
)
|
|
1081
|
+
output_tokens_per_iteration: StatusDistributionSummary = Field(
|
|
1082
|
+
description="Distribution of output tokens generated per streaming iteration"
|
|
1083
|
+
)
|
|
1084
|
+
tokens_per_second: StatusDistributionSummary = Field(
|
|
1085
|
+
description="Distribution of total token throughput including prompt and output"
|
|
1086
|
+
)
|
|
1087
|
+
|
|
1088
|
+
# Domain specific stats
|
|
1089
|
+
text: GenerativeTextMetricsSummary = Field(
|
|
1090
|
+
description="Text-specific metrics for tokens, words, and characters"
|
|
1091
|
+
)
|
|
1092
|
+
image: GenerativeImageMetricsSummary = Field(
|
|
1093
|
+
description="Image-specific metrics for tokens, images, pixels, and bytes"
|
|
1094
|
+
)
|
|
1095
|
+
video: GenerativeVideoMetricsSummary = Field(
|
|
1096
|
+
description="Video-specific metrics for tokens, frames, duration, and bytes"
|
|
1097
|
+
)
|
|
1098
|
+
audio: GenerativeAudioMetricsSummary = Field(
|
|
1099
|
+
description="Audio-specific metrics for tokens, samples, duration, and bytes"
|
|
1100
|
+
)
|
|
1101
|
+
|
|
1102
|
+
@classmethod
|
|
1103
|
+
def update_estimate(
|
|
1104
|
+
cls,
|
|
1105
|
+
state: EstimatedBenchmarkState,
|
|
1106
|
+
response: GenerationResponse | None,
|
|
1107
|
+
request: GenerationRequest,
|
|
1108
|
+
request_info: RequestInfo,
|
|
1109
|
+
scheduler_state: SchedulerState,
|
|
1110
|
+
):
|
|
1111
|
+
"""
|
|
1112
|
+
Update real-time generative metrics estimates with new request data.
|
|
1113
|
+
|
|
1114
|
+
:param state: Current estimated benchmark state to update
|
|
1115
|
+
:param response: Response received from the backend
|
|
1116
|
+
:param request: Original request sent to the backend
|
|
1117
|
+
:param request_info: Metadata about the request execution
|
|
1118
|
+
:param scheduler_state: Current state of the scheduler
|
|
1119
|
+
"""
|
|
1120
|
+
benchmark_start_time = scheduler_state.start_time
|
|
1121
|
+
request_start_time = (
|
|
1122
|
+
request_info.timings.request_start or request_info.timings.resolve_start
|
|
1123
|
+
)
|
|
1124
|
+
request_end_time = (
|
|
1125
|
+
request_info.timings.request_end or request_info.timings.resolve_end
|
|
1126
|
+
)
|
|
1127
|
+
event_occurence_time = (
|
|
1128
|
+
request_info.timings.queued
|
|
1129
|
+
if request_info.status == "queued"
|
|
1130
|
+
else (
|
|
1131
|
+
request_info.timings.dequeued
|
|
1132
|
+
if request_info.status == "pending"
|
|
1133
|
+
else request_start_time
|
|
1134
|
+
if request_info.status == "in_progress"
|
|
1135
|
+
else request_end_time
|
|
1136
|
+
)
|
|
1137
|
+
)
|
|
1138
|
+
benchmark_duration = (
|
|
1139
|
+
event_occurence_time - benchmark_start_time
|
|
1140
|
+
if event_occurence_time
|
|
1141
|
+
else None
|
|
1142
|
+
)
|
|
1143
|
+
request_duration = (
|
|
1144
|
+
(request_end_time - request_start_time)
|
|
1145
|
+
if request_end_time and request_start_time else None
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
# Always track concurrency
|
|
1149
|
+
if event_occurence_time is not None:
|
|
1150
|
+
state.add_time_averaged_metric(
|
|
1151
|
+
group=EstimatedBenchmarkState.benchmark_metrics_group,
|
|
1152
|
+
key="concurrency_requests",
|
|
1153
|
+
value=scheduler_state.processing_requests,
|
|
1154
|
+
recorded_time=event_occurence_time,
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
if request_info.status not in {"completed", "errored", "cancelled"}:
|
|
1158
|
+
return
|
|
1159
|
+
|
|
1160
|
+
state.set_metric(
|
|
1161
|
+
group=EstimatedBenchmarkState.benchmark_metrics_group,
|
|
1162
|
+
key="updated",
|
|
1163
|
+
value=True,
|
|
1164
|
+
)

        for prefix in (request_info.status, "total"):
            requests_count = (
                scheduler_state.successful_requests
                if prefix == "completed"
                else scheduler_state.errored_requests
                if prefix == "errored"
                else scheduler_state.cancelled_requests
                if prefix == "cancelled"
                else scheduler_state.processed_requests
            )
            input_tokens = (
                (response.input_metrics.total_tokens if response else None)
                or request.input_metrics.total_tokens
                or 0
            )
            output_tokens = (
                (response.output_metrics.total_tokens if response else None)
                or request.output_metrics.total_tokens
                or 0
            )

            # Request distribution stats
            state.set_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_requests",
                value=requests_count,
            )
            state.set_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_requests_per_second",
                value=(
                    requests_count / benchmark_duration if benchmark_duration else None
                ),
            )
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_request_latency",
                value=request_duration,
            )
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_request_streaming_iterations",
                value=request_info.timings.iterations or 0,
            )

            # Token iteration stats
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key="output_tokens_iterations",
                value=output_tokens,
                count=request_info.timings.iterations or 1,
            )
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key="output_tokens_wo_first_iterations",
                value=output_tokens - 1 if output_tokens > 1 else 0,
                count=request_info.timings.iterations or 1,
            )

            # Token metrics stats
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_time_to_first_token",
                value=request_info.timings.first_iteration,
                start_val=request_start_time,
            )
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_inter_token_latency",
                value=request_info.timings.last_iteration,
                start_val=request_info.timings.first_iteration,
                count=(output_tokens or 1) - 1,
            )
            state.add_avg_metric(
                group=EstimatedBenchmarkState.benchmark_metrics_group,
                key=f"{prefix}_time_per_output_token",
                value=request_duration,
                count=output_tokens or 0,
            )
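            # Note the distinction the arguments above imply: inter-token
            # latency averages the first-to-last streamed iteration span over
            # (output_tokens - 1) gaps, while time-per-output-token divides the
            # full request duration by all output tokens, so the two can differ
            # for single-token and prefill-heavy responses.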

            # Input/output throughput stats
            if event_occurence_time is not None:
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="input_tokens",
                    value=input_tokens,
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="output_tokens",
                    value=output_tokens,
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="total_tokens",
                    value=input_tokens + output_tokens,
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="input_text_tokens",
                    value=(
                        (response.input_metrics.text_tokens if response else None)
                        or request.input_metrics.text_tokens
                        or 0
                    ),
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="input_images",
                    value=(
                        (response.input_metrics.image_count if response else None)
                        or request.input_metrics.image_count
                        or 0
                    ),
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="input_video_frames",
                    value=(
                        (response.input_metrics.video_frames if response else None)
                        or request.input_metrics.video_frames
                        or 0
                    ),
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
                state.add_avg_rate_metric(
                    group=EstimatedBenchmarkState.benchmark_metrics_group,
                    key="input_audio_seconds",
                    value=request.input_metrics.audio_seconds or 0,
                    start_time=benchmark_start_time,
                    end_time=event_occurence_time,
                )
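                # These rates span benchmark start to the current event time,
                # so they are cumulative averages over the benchmark window
                # rather than instantaneous throughput.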

    @classmethod
    def compile(
        cls,
        completed: list[GenerativeRequestStats],
        errored: list[GenerativeRequestStats],
        incomplete: list[GenerativeRequestStats],
    ) -> GenerativeMetrics:
        """
        Compile final generative metrics from request statistics.

        :param completed: Successfully completed request statistics
        :param errored: Failed request statistics
        :param incomplete: Incomplete/cancelled request statistics
        :return: Compiled generative metrics with full distributions
        """
        requests = completed + errored + incomplete
        request_types = cast(
            "list[Literal['successful', 'error', 'incomplete']]",
            ["successful"] * len(completed)
            + ["error"] * len(errored)
            + ["incomplete"] * len(incomplete),
        )
        request_times = [
            (
                req.info.timings.request_start or req.info.timings.resolve_start or 0,
                req.info.timings.request_end or req.info.timings.resolve_end or 0,
            )
            for req in requests
        ]
        input_metrics = [req.input_metrics for req in requests]
        output_metrics = [req.output_metrics for req in requests]

        return GenerativeMetrics(
            # Request stats
            requests_per_second=StatusDistributionSummary.from_request_times(
                request_types=request_types,
                requests=request_times,
                distribution_type="rate",
            ),
            request_concurrency=StatusDistributionSummary.from_request_times(
                request_types=request_types,
                requests=request_times,
                distribution_type="concurrency",
            ),
            request_latency=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.request_latency or 0.0 for req in requests],
            ),
            request_streaming_iterations_count=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[float(req.info.timings.iterations or 0) for req in requests],
            ),
            # General token stats
            prompt_token_count=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[float(req.prompt_tokens or 0) for req in requests],
            ),
            output_token_count=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[float(req.output_tokens or 0) for req in requests],
            ),
            total_token_count=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[float(req.total_tokens or 0) for req in requests],
            ),
            time_to_first_token_ms=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.time_to_first_token_ms or 0.0 for req in requests],
            ),
            time_per_output_token_ms=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.time_per_output_token_ms or 0.0 for req in requests],
            ),
            inter_token_latency_ms=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.inter_token_latency_ms or 0.0 for req in requests],
            ),
            output_tokens_wo_first_per_iteration=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[
                    max(0.0, (req.output_metrics.total_tokens or 1.0) - 1.0)
                    for req in requests
                ],
                weights=[req.info.timings.iterations or 1 for req in requests],
            ),
            output_tokens_per_second=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.output_tokens_per_second or 0.0 for req in requests],
            ),
            output_tokens_per_iteration=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.output_tokens_per_iteration or 0.0 for req in requests],
                weights=[req.info.timings.iterations or 1 for req in requests],
            ),
            tokens_per_second=StatusDistributionSummary.from_values(
                value_types=request_types,
                values=[req.tokens_per_second or 0.0 for req in requests],
            ),
            # Domain-specific stats
            text=GenerativeTextMetricsSummary.compile(
                request_types=request_types,
                request_times=request_times,
                input_metrics=input_metrics,
                output_metrics=output_metrics,
            ),
            image=GenerativeImageMetricsSummary.compile(
                request_types=request_types,
                request_times=request_times,
                input_metrics=input_metrics,
                output_metrics=output_metrics,
            ),
            video=GenerativeVideoMetricsSummary.compile(
                request_types=request_types,
                request_times=request_times,
                input_metrics=input_metrics,
                output_metrics=output_metrics,
            ),
            audio=GenerativeAudioMetricsSummary.compile(
                request_types=request_types,
                request_times=request_times,
                input_metrics=input_metrics,
                output_metrics=output_metrics,
            ),
        )
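
# A minimal, self-contained sketch (not part of this package) of how a
# concurrency distribution can be derived from the (start, end) request_times
# pairs built above -- the same inputs StatusDistributionSummary.from_request_times
# consumes with distribution_type="concurrency". The helper name and the
# sweep-line approach are illustrative assumptions, not the library's algorithm.
def _sketch_concurrency_timeline(
    request_times: list[tuple[float, float]],
) -> list[tuple[float, int]]:
    """Return (timestamp, in-flight request count) points via a sweep line."""
    events: list[tuple[float, int]] = []
    for start, end in request_times:
        events.append((start, 1))  # a request enters flight
        events.append((end, -1))  # a request leaves flight
    events.sort()  # at equal timestamps, exits sort before entries (-1 < 1)
    timeline: list[tuple[float, int]] = []
    active = 0
    for timestamp, delta in events:
        active += delta
        timeline.append((timestamp, active))
    return timeline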


class SchedulerDict(StandardBaseDict):
    """Scheduler configuration and execution state dictionary."""

    strategy: SchedulingStrategy = Field(
        description="Scheduling strategy used for request distribution"
    )
    constraints: dict[str, dict[str, Any]] = Field(
        description="Execution constraints applied during benchmarking"
    )
    state: SchedulerState = Field(
        description="Final state of the scheduler after execution"
    )


class BenchmarkerDict(StandardBaseDict):
    """Benchmarker configuration and component settings dictionary."""

    profile: Profile = Field(description="Benchmark profile configuration")
    requests: dict[str, Any] = Field(
        description="Request configuration and dataset information"
    )
    backend: dict[str, Any] = Field(
        description="Backend configuration and connection details"
    )
    environment: dict[str, Any] = Field(
        description="Execution environment configuration"
    )


class GenerativeBenchmark(Benchmark, StandardBaseDict):
    """Complete generative AI benchmark results with specialized metrics."""

    group_name: ClassVar[Literal["generative_benchmark"]] = "generative_benchmark"

    type_: Literal["generative_benchmark"] = "generative_benchmark"  # type: ignore[assignment]
    id_: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier for this benchmark execution",
    )
    run_id: str = Field(
        description="Identifier for the benchmarker run containing this benchmark"
    )
    run_index: int = Field(
        description="Sequential index of this benchmark within the benchmarker run"
    )
    scheduler: SchedulerDict = Field(
        description="Scheduler configuration and execution state"
    )
    benchmarker: BenchmarkerDict = Field(
        description="Benchmarker configuration and component settings"
    )
    run_stats: BenchmarkSchedulerStats = Field(
        description="Scheduler timing and performance statistics"
    )
    start_time: float = Field(
        default=-1.0, description="Unix timestamp when the first request was initiated"
    )
    end_time: float = Field(
        default=-1.0, description="Unix timestamp when the last request completed"
    )

    def get_run_metrics_sample(
        self,
    ) -> dict[Literal["start_time", "end_time", "duration"], float]:
        return {
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration": self.duration,
        }

    def get_request_metrics_sample(
        self,
    ) -> dict[
        Literal[
            "request_count",
            "request_latency",
            "request_throughput",
            "request_concurrency",
        ],
        float,
    ]:
        return {
            "request_count": self.request_totals.successful,
            "request_latency": self.metrics.request_latency.successful.mean,
            "request_throughput": self.metrics.requests_per_second.successful.mean,
            "request_concurrency": self.metrics.request_concurrency.successful.mean,
        }

    @computed_field  # type: ignore[misc]
    @property
    def duration(self) -> float:
        """
        Benchmark execution duration in seconds.

        :return: Time elapsed from first request start to last request completion.
        """
        return self.end_time - self.start_time
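
    # As a computed_field, duration is included whenever the model is
    # serialized, even though it is derived from start_time and end_time.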

    metrics: GenerativeMetrics = Field(
        description="Performance metrics and statistical distributions"
    )
    request_totals: StatusBreakdown[int, int, int, int] = Field(
        description="Request counts by status: successful, incomplete, errored, total"
    )
    requests: StatusBreakdown[
        list[GenerativeRequestStats],
        list[GenerativeRequestStats],
        list[GenerativeRequestStats],
        None,
    ] = Field(
        description="Request details grouped by status: successful, incomplete, errored"
    )

    @classmethod
    def update_estimate(
        cls,
        args: BenchmarkerArgs,
        state: EstimatedBenchmarkState,
        response: GenerationResponse | None,
        request: GenerationRequest,
        request_info: RequestInfo,
        scheduler_state: SchedulerState,
    ):
        """
        Update generative benchmark estimates with new request data.

        Handles warmup/cooldown filtering, request sampling via reservoir
        sampling, and delegates metric updates to child metric classes.

        :param args: Benchmark configuration arguments
        :param state: Current estimated benchmark state to update
        :param response: Response received from the backend
        :param request: Original request sent to the backend
        :param request_info: Metadata about the request execution
        :param scheduler_state: Current state of the scheduler
        """
        if (
            request_info.status == "cancelled"
            and request_info.timings.resolve_start is None
        ):
            # Cancelled requests that never started should be ignored
            return

        # Update child metric groups
        BenchmarkSchedulerStats.update_estimate(state, request_info)
        GenerativeMetrics.update_estimate(
            state, response, request, request_info, scheduler_state
        )

        # Store requests and sampling info, update counts
        if "requests_completed" not in state:
            state["requests_completed"] = []
            state["samples_completed"] = []
            state["requests_errored"] = []
            state["samples_errored"] = []
            state["requests_incomplete"] = []
            state["samples_incomplete"] = []
        in_warmup = state.set_metric(
            group=EstimatedBenchmarkState.benchmark_state_group,
            key="in_warmup",
            value=args.is_in_warmup(request_info, scheduler_state),
        )
        in_cooldown = state.set_metric(
            group=EstimatedBenchmarkState.benchmark_state_group,
            key="in_cooldown",
            value=args.is_in_cooldown(request_info, scheduler_state),
        )
        state[f"{EstimatedBenchmarkState.benchmark_state_group}_status"] = (
            "in_cooldown"
            if in_cooldown
            else "in_warmup"
            if in_warmup
            else "in_progress"
        )

        if (
            request_info.status not in {"completed", "errored", "cancelled"}
            or in_warmup
            or in_cooldown
        ):
            # Must be fully resolved to be added
            return

        state.set_metric(
            group=EstimatedBenchmarkState.benchmark_state_group,
            key="updated",
            value=True,
        )

        if response is None:
            response = GenerationResponse(
                request_id=request.request_id, request_args=str(request.arguments)
            )

        stats = response.compile_stats(
            request, request_info, args.prefer_response_metrics
        )

        # Determine status and get corresponding lists
        if request_info.status == "completed":
            requests_list = state["requests_completed"]
            samples_list = state["samples_completed"]
        elif request_info.status == "errored":
            requests_list = state["requests_errored"]
            samples_list = state["samples_errored"]
        else:  # cancelled (incomplete)
            requests_list = state["requests_incomplete"]
            samples_list = state["samples_incomplete"]

        # Add to requests list
        requests_list.append(stats)
        current_index = len(requests_list) - 1

        # Handle request sampling logic
        if args.sample_requests is None:
            # No sampling, add index to samples list
            samples_list.append(current_index)
        elif args.sample_requests > 0 and len(samples_list) < args.sample_requests:
            # Space in samples list, add index
            samples_list.append(current_index)
        elif (
            args.sample_requests > 0
            and (replace_index := random.randrange(len(requests_list)))
            < args.sample_requests
        ):
            # No space, adding based on reservoir sampling
            samples_list[replace_index] = current_index
        # Sampling set to 0, don't keep any requests
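        # Reservoir sampling keeps each resolved request in the sample with
        # equal probability k/n (k = sample_requests, n = requests seen so
        # far), so the retained subset stays uniform without storing every
        # request; see the standalone sketch after this class.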

    @classmethod
    def compile(
        cls,
        args: BenchmarkerArgs,
        estimated_state: EstimatedBenchmarkState,
        scheduler_state: SchedulerState,
        profile: Profile,
        requests: Iterable,
        backend: BackendInterface,
        environment: Environment,
        strategy: SchedulingStrategy,
        constraints: dict[str, dict[str, Any]],
    ) -> GenerativeBenchmark:
        """
        Compile final generative benchmark from accumulated state.

        :param args: Benchmark configuration arguments
        :param estimated_state: Accumulated benchmark state from execution
        :param scheduler_state: Final state of the scheduler
        :param profile: Benchmark profile configuration
        :param requests: Collection of requests executed
        :param backend: Backend interface used for execution
        :param environment: Execution environment configuration
        :param strategy: Scheduling strategy used
        :param constraints: Execution constraints applied
        :return: Compiled generative benchmark instance
        """
        return GenerativeBenchmark(
            run_id=args.run_id,
            run_index=args.run_index,
            scheduler=SchedulerDict(
                strategy=strategy,
                constraints={
                    key: InfoMixin.extract_from_obj(val)
                    for key, val in constraints.items()
                },
                state=scheduler_state,
            ),
            benchmarker=BenchmarkerDict(
                profile=profile,
                requests=InfoMixin.extract_from_obj(requests),
                backend=backend.info,
                environment=environment.info,
            ),
            run_stats=BenchmarkSchedulerStats.compile(estimated_state, scheduler_state),
            start_time=scheduler_state.start_time or -1.0,
            end_time=scheduler_state.end_time or -1.0,
            metrics=GenerativeMetrics.compile(
                completed=estimated_state.get("requests_completed", []),
                errored=estimated_state.get("requests_errored", []),
                incomplete=estimated_state.get("requests_incomplete", []),
            ),
            request_totals=StatusBreakdown[int, int, int, int](
                successful=len(estimated_state.get("requests_completed", [])),
                incomplete=len(estimated_state.get("requests_incomplete", [])),
                errored=len(estimated_state.get("requests_errored", [])),
                total=(
                    len(estimated_state.get("requests_completed", []))
                    + len(estimated_state.get("requests_incomplete", []))
                    + len(estimated_state.get("requests_errored", []))
                ),
            ),
            requests=StatusBreakdown[
                list[GenerativeRequestStats],
                list[GenerativeRequestStats],
                list[GenerativeRequestStats],
                None,
            ](
                successful=estimated_state.get("requests_completed", []),
                incomplete=estimated_state.get("requests_incomplete", []),
                errored=estimated_state.get("requests_errored", []),
                total=None,
            ),
        )
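
# A compact, runnable sketch (illustrative, not the package's implementation)
# of the reservoir-sampling scheme update_estimate applies above: keep the
# first k item indices, then replace a random slot with probability k/n as
# each new item arrives. The function name is a hypothetical helper; it reuses
# this module's existing `random` import.
def _sketch_reservoir_indices(total_items: int, k: int, seed: int = 42) -> list[int]:
    """Return a uniform sample of k indices out of range(total_items)."""
    rng = random.Random(seed)
    samples: list[int] = []
    for index in range(total_items):
        if len(samples) < k:
            samples.append(index)  # fill phase: keep the first k indices
        elif (slot := rng.randrange(index + 1)) < k:
            samples[slot] = index  # replace phase: probability k / (index + 1)
    return samples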


class BenchmarkGenerativeTextArgs(StandardBaseModel):
    """
    Configuration arguments for generative text benchmark execution.

    Defines all parameters for benchmark setup including target endpoint, data
    sources, backend configuration, processing pipeline, output formatting, and
    execution constraints. Supports loading from scenario files and merging with
    runtime overrides.
    """

    @classmethod
    def create(
        cls, scenario: Path | str | None, **kwargs: dict[str, Any]
    ) -> BenchmarkGenerativeTextArgs:
        """
        Create benchmark args from scenario file and/or keyword arguments.

        :param scenario: Path to scenario file or name of built-in scenario
        :param kwargs: Additional keyword arguments to override scenario values
        :return: Configured benchmark args instance
        :raises ValueError: If scenario is not found or file format is unsupported
        """
        constructor_kwargs = {}

        if scenario is not None:
            if isinstance(scenario, str) and scenario in (
                builtin_scenarios := get_builtin_scenarios()
            ):
                scenario_path = builtin_scenarios[scenario]
            elif Path(scenario).exists() and Path(scenario).is_file():
                scenario_path = Path(scenario)
            else:
                raise ValueError(f"Scenario '{scenario}' not found.")

            with scenario_path.open() as file:
                if scenario_path.suffix == ".json":
                    scenario_data = json.load(file)
                elif scenario_path.suffix in {".yaml", ".yml"}:
                    scenario_data = yaml.safe_load(file)
                else:
                    raise ValueError(
                        f"Unsupported scenario file format: {scenario_path.suffix}"
                    )
            if "args" in scenario_data:
                # loading from a report file
                scenario_data = scenario_data["args"]
            constructor_kwargs.update(scenario_data)

        for key, value in kwargs.items():
            if value != cls.get_default(key):
                constructor_kwargs[key] = value

        return cls.model_validate(constructor_kwargs)
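
    # A hypothetical scenario file that create() could load (field names match
    # this model; the values are illustrative assumptions, not a shipped
    # scenario):
    #   {"target": "http://localhost:8000", "profile": "sweep", "max_seconds": 60}
    # Keyword overrides are applied only when they differ from the model
    # defaults, so flags left at their defaults never mask scenario values.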

    @classmethod
    def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any:
        """
        Get default value for a model field.

        :param field: Name of the field to retrieve default for
        :return: Default value for the specified field
        :raises ValueError: If field is not found in model
        """
        if field not in BenchmarkGenerativeTextArgs.model_fields:
            raise ValueError(
                f"Field '{field}' not found in BenchmarkGenerativeTextArgs"
            )

        field_info = BenchmarkGenerativeTextArgs.model_fields[field]
        factory = field_info.default_factory

        if factory is None:
            return field_info.default

        if len(inspect.signature(factory).parameters) == 0:
            return factory()  # type: ignore[call-arg]  # confirmed correct at runtime by the check above
        else:
            return factory({})  # type: ignore[call-arg]  # confirmed correct at runtime by the check above

    model_config = ConfigDict(
        extra="ignore",
        use_enum_values=True,
        from_attributes=True,
        arbitrary_types_allowed=True,
    )

    # Required
    target: str = Field(description="Target endpoint URL for benchmark execution")
    data: list[Any] = Field(
        description="List of dataset sources or data files",
        default_factory=list,
        min_length=1,
    )
    # Benchmark configuration
    profile: StrategyType | ProfileType | Profile = Field(
        default="sweep", description="Benchmark profile or scheduling strategy type"
    )
    rate: float | list[float] | None = Field(
        default=None, description="Request rate(s) for rate-based scheduling"
    )
    # Backend configuration
    backend: BackendType | Backend = Field(
        default="openai_http", description="Backend type or instance for execution"
    )
    backend_kwargs: dict[str, Any] | None = Field(
        default=None, description="Additional backend configuration arguments"
    )
    model: str | None = Field(default=None, description="Model identifier for backend")
    # Data configuration
    processor: str | Path | PreTrainedTokenizerBase | None = Field(
        default=None, description="Tokenizer path, name, or instance for processing"
    )
    processor_args: dict[str, Any] | None = Field(
        default=None, description="Additional tokenizer configuration arguments"
    )
    data_args: list[dict[str, Any]] | None = Field(
        default_factory=list, description="Per-dataset configuration arguments"
    )
    data_samples: int = Field(
        default=-1, description="Number of samples to use from datasets (-1 for all)"
    )
    data_column_mapper: (
        DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"]
    ) = Field(
        default="generative_column_mapper",
        description="Column mapping preprocessor for dataset fields",
    )
    data_request_formatter: DatasetPreprocessor | dict[str, str] | str = Field(
        default="chat_completions",
        description="Request formatting preprocessor or template name",
    )
    data_collator: Callable | Literal["generative"] | None = Field(
        default="generative", description="Data collator for batch processing"
    )
    data_sampler: Sampler[int] | Literal["shuffle"] | None = Field(
        default=None, description="Data sampler for request ordering"
    )
    data_num_workers: int | None = Field(
        default=None, description="Number of workers for data loading"
    )
    dataloader_kwargs: dict[str, Any] | None = Field(
        default=None, description="Additional dataloader configuration arguments"
    )
    random_seed: int = Field(default=42, description="Random seed for reproducibility")
    # Output configuration
    output_path: str | Path | None = Field(
        default_factory=Path.cwd, description="Directory path for output files"
    )
    output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field(
        default_factory=lambda: ["console", "json"],
        description="Output format names or configuration mappings",
    )
    # Benchmarker configuration
    benchmark_cls: type[GenerativeBenchmark] = Field(
        default=GenerativeBenchmark,
        description="Benchmark class to use for result compilation",
    )
    sample_requests: int | None = Field(
        default=10,
        description="Number of requests to sample for detailed metrics (None for all)",
    )
    warmup: float | None = Field(
        default=None,
        description="Warmup period in seconds, requests, or fraction (0-1)",
    )
    cooldown: float | None = Field(
        default=None,
        description="Cooldown period in seconds, requests, or fraction (0-1)",
    )
    prefer_response_metrics: bool = Field(
        default=True,
        description="Whether to prefer backend response metrics over request metrics",
    )
    # Constraints configuration
    max_seconds: int | float | None = Field(
        default=None, description="Maximum benchmark execution time in seconds"
    )
    max_requests: int | None = Field(
        default=None, description="Maximum number of requests to execute"
    )
    max_errors: int | None = Field(
        default=None, description="Maximum number of errors before stopping"
    )
    max_error_rate: float | None = Field(
        default=None, description="Maximum error rate (0-1) before stopping"
    )
    max_global_error_rate: float | None = Field(
        default=None, description="Maximum global error rate (0-1) before stopping"
    )
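
    # Minimal construction sketch (the values are illustrative assumptions):
    #   args = BenchmarkGenerativeTextArgs(
    #       target="http://localhost:8000", data=["prompts.jsonl"], max_seconds=60
    #   )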

    @model_serializer
    def serialize_model(self):
        """
        Custom serialization logic for benchmark args.

        Converts complex types to serializable formats including Profile to type
        string, Backend to type string, and Path objects to strings.

        :return: Dictionary representation suitable for JSON/YAML serialization
        """
        return {
            # target - serialize as is
            "target": self.target,
            "data": [
                item if isinstance(item, str | type(None)) else str(item)
                for item in self.data
            ],  # data - for each item in the list, if not a str or None, save str(item)
            "profile": (
                self.profile.type_
                if isinstance(self.profile, Profile)
                else self.profile
            ),  # profile - if instance of Profile, then save as profile.type_
            "rate": self.rate,
            "backend": (
                self.backend.type_
                if isinstance(self.backend, Backend)
                else self.backend
            ),  # backend - if instance of Backend, then save as backend.type_
            "backend_kwargs": self.backend_kwargs,
            "model": self.model,
            "processor": (
                self.processor
                if isinstance(self.processor, str)
                else str(self.processor)
                if self.processor is not None
                else None
            ),  # processor - if not str, then save as str(processor)
            "processor_args": self.processor_args,
            "data_args": self.data_args,
            "data_samples": self.data_samples,
            "data_column_mapper": (
                self.data_column_mapper
                if isinstance(self.data_column_mapper, dict | str)
                else {}
            ),  # data_column_mapper - if not dict or str, then save as an empty dict
            "data_request_formatter": (
                self.data_request_formatter
                if isinstance(self.data_request_formatter, dict | str)
                else {}
            ),  # data_request_formatter - if not dict or str, then save as empty dict
            "data_collator": (
                self.data_collator if isinstance(self.data_collator, str) else None
            ),  # data_collator - if not str, then save as None
            "data_sampler": (
                self.data_sampler if isinstance(self.data_sampler, str) else None
            ),  # data_sampler - if not str, then save as None
            "data_num_workers": self.data_num_workers,
            "dataloader_kwargs": self.dataloader_kwargs,
            "random_seed": self.random_seed,
            "output_path": (
                str(self.output_path) if self.output_path is not None else None
            ),  # output_path - if not None, then ensure it's a str
            "output_formats": self.output_formats,
            # benchmark_cls - don't save at all (excluded)
            "sample_requests": self.sample_requests,
            "warmup": self.warmup,
            "cooldown": self.cooldown,
            "prefer_response_metrics": self.prefer_response_metrics,
            "max_seconds": self.max_seconds,
            "max_requests": self.max_requests,
            "max_errors": self.max_errors,
            "max_error_rate": self.max_error_rate,
            "max_global_error_rate": self.max_global_error_rate,
        }


class GenerativeBenchmarksReport(StandardBaseModel):
    """Container for multiple benchmark results with load/save functionality."""

    DEFAULT_FILE: ClassVar[str] = "benchmarks.json"

    @staticmethod
    def load_file(
        path: str | Path, type_: Literal["json", "yaml"] | None = None
    ) -> GenerativeBenchmarksReport:
        """
        Load a report from a file.

        :param path: The path to load the report from.
        :param type_: File type override, auto-detected from extension if None.
        :return: The loaded report.
        :raises ValueError: If file type is unsupported.
        """
        path = Path(path) if not isinstance(path, Path) else path

        if path.is_dir():
            path = path / GenerativeBenchmarksReport.DEFAULT_FILE

        path.parent.mkdir(parents=True, exist_ok=True)
        path_suffix = path.suffix.lower()[1:]

        with path.open("r") as file:
            if (type_ or path_suffix) == "json":
                model_dict = json.loads(file.read())
            elif (type_ or path_suffix) in ["yaml", "yml"]:
                model_dict = yaml.safe_load(file)
            else:
                raise ValueError(f"Unsupported file type: {type_} for {path}.")

        return GenerativeBenchmarksReport.model_validate(model_dict)
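
    # Round-trip sketch (the paths are illustrative assumptions):
    #   report = GenerativeBenchmarksReport.load_file("results/benchmarks.json")
    #   report.save_file("results/benchmarks.yaml")  # re-emit the same data as YAML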

    args: BenchmarkGenerativeTextArgs = Field(
        description="The benchmark arguments used for all benchmarks in the report."
    )
    benchmarks: list[GenerativeBenchmark] = Field(
        description="The list of completed benchmarks contained within the report.",
        default_factory=list,
    )

    def save_file(
        self, path: str | Path | None, type_: Literal["json", "yaml"] | None = None
    ) -> Path:
        """
        Save the report to a file.

        :param path: The path to save the report to.
        :param type_: File type override, auto-detected from extension if None.
        :return: The path to the saved report.
        :raises ValueError: If file type is unsupported.
        """
        if path is None:
            path = Path.cwd()
        elif not isinstance(path, Path):
            path = Path(path)

        if path.is_dir():
            path = path / GenerativeBenchmarksReport.DEFAULT_FILE

        path.parent.mkdir(parents=True, exist_ok=True)
        path_suffix = path.suffix.lower()[1:]
        model_dict = self.model_dump()

        if (type_ or path_suffix) == "json":
            save_str = json.dumps(model_dict)
        elif (type_ or path_suffix) in ["yaml", "yml"]:
            save_str = yaml.dump(model_dict)
        else:
            raise ValueError(f"Unsupported file type: {type_} for {path}.")

        with path.open("w") as file:
            file.write(save_str)

        return path