guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a169__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
guidellm/benchmark/schemas.py (added)
@@ -0,0 +1,2086 @@
+ """
+ Benchmark data models and metrics for generative AI performance measurement.
+
+ Provides comprehensive data structures for capturing, storing, and analyzing
+ benchmark results from scheduler-driven generative AI workload executions.
+ Core abstractions include base benchmark interfaces, generative-specific
+ metrics with token/latency distributions, request-level statistics tracking,
+ and multi-benchmark reporting capabilities. These models enable detailed
+ performance analysis including throughput, latency, concurrency patterns, and
+ domain-specific metrics for text, image, video, and audio generation tasks.
+ """
+
+ from __future__ import annotations
+
+ import inspect
+ import json
+ import random
+ import time
+ import uuid
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable, Iterable
+ from pathlib import Path
+ from typing import Any, ClassVar, Literal, TypeVar, cast
+
+ import yaml
+ from pydantic import ConfigDict, Field, computed_field, model_serializer
+ from torch.utils.data import Sampler
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.backends import Backend, BackendType
+ from guidellm.benchmark.profile import Profile, ProfileType
+ from guidellm.benchmark.scenarios import get_builtin_scenarios
+ from guidellm.data import DatasetPreprocessor
+ from guidellm.scheduler import (
+     BackendInterface,
+     Environment,
+     SchedulerState,
+     SchedulingStrategy,
+     StrategyType,
+ )
+ from guidellm.schemas import (
+     GenerationRequest,
+     GenerationResponse,
+     GenerativeRequestStats,
+     RequestInfo,
+     UsageMetrics,
+ )
+ from guidellm.utils import (
+     InfoMixin,
+     StandardBaseDict,
+     StandardBaseModel,
+     StatusBreakdown,
+     StatusDistributionSummary,
+ )
+
+ __all__ = [
+     "Benchmark",
+     "BenchmarkGenerativeTextArgs",
+     "BenchmarkSchedulerStats",
+     "BenchmarkT",
+     "BenchmarkerArgs",
+     "BenchmarkerDict",
+     "EstimatedBenchmarkState",
+     "GenerativeAudioMetricsSummary",
+     "GenerativeBenchmark",
+     "GenerativeBenchmarksReport",
+     "GenerativeImageMetricsSummary",
+     "GenerativeMetrics",
+     "GenerativeMetricsSummary",
+     "GenerativeTextMetricsSummary",
+     "GenerativeVideoMetricsSummary",
+     "SchedulerDict",
+ ]
+
+
+ class EstimatedBenchmarkState(dict[str, Any]):
+     """
+     Accumulator for real-time benchmark metrics during scheduler execution.
+
+     Tracks incremental metrics, running averages, and time-based statistics as
+     requests are processed. Maintains grouped metrics for benchmark state,
+     benchmark-level metrics, and scheduler-level metrics with support for
+     average, rate, and time-averaged metric calculations.
+
+     :cvar benchmark_state_group: Metric group key for benchmark state tracking
+     :cvar benchmark_metrics_group: Metric group key for benchmark-level metrics
+     :cvar scheduler_state_group: Metric group key for scheduler-level metrics
+     """
+
+     benchmark_state_group: ClassVar[Literal["benchmark_state"]] = "benchmark_state"
+     benchmark_metrics_group: ClassVar[Literal["benchmark_metrics"]] = (
+         "benchmark_metrics"
+     )
+     scheduler_state_group: ClassVar[Literal["scheduler_state"]] = "scheduler_state"
+
+     def get_metric(
+         self,
+         group: str,
+         key: str,
+         default: int | float | None = None,
+     ) -> int | float | None:
+         """
+         Retrieve a grouped metric value by group and key.
+
+         :param group: Metric group identifier
+         :param key: Metric key within the group
+         :param default: Value returned if metric doesn't exist
+         :return: The metric value or default if not found
+         """
+         return self.get(f"{group}_{key}", default)
+
+     def set_metric(
+         self,
+         group: str,
+         key: str,
+         value: bool | int | float | None,
+         start_val: bool | int | float | None = None,
+     ) -> bool | int | float | None:
+         """
+         Set a grouped metric value, optionally adjusting by a starting value.
+
+         :param group: Metric group identifier
+         :param key: Metric key within the group
+         :param value: Metric value to set
+         :param start_val: Optional starting value to subtract from the metric value
+         :return: The adjusted metric value or None if value is None
+         """
+         if value is None:
+             return None
+
+         if start_val is not None:
+             value -= start_val
+         self[f"{group}_{key}"] = value
+
+         return value
+
+     def add_avg_metric(
+         self,
+         group: str,
+         key: str,
+         value: bool | int | float | None,
+         start_val: bool | int | float | None = 0.0,
+         count: int | None = 1,
+     ):
+         """
+         Add a value to a running average metric calculation.
+
+         :param group: Metric group identifier
+         :param key: Metric key within the group
+         :param value: Value to add to the average
+         :param start_val: Optional starting value to subtract before adding
+         :param count: Number of observations this value represents
+         """
+         if value is None or count is None:
+             return
+
+         if start_val is not None:
+             value -= start_val
+
+         total_key = f"{group}_{key}_total"
+         count_key = f"{group}_{key}_count"
+         self[total_key] = self.get(total_key, 0) + value
+         self[count_key] = self.get(count_key, 0) + count
+
+         average = self[total_key] / self[count_key] if self[count_key] > 0 else 0.0
+         self.set_metric(
+             group=group,
+             key=key,
+             value=average,
+         )
+
+     def add_avg_rate_metric(
+         self,
+         group: str,
+         key: str,
+         value: bool | int | float | None,
+         start_val: bool | int | float | None = 0.0,
+         start_time: float | None = None,
+         end_time: float | None = None,
+         numerator_type: Literal["avg", "total", "count"] = "total",
+     ):
+         """
+         Add a value to a rate-based average metric calculation.
+
+         :param group: Metric group identifier
+         :param key: Metric key within the group
+         :param value: Value to add to the average
+         :param start_val: Optional starting value to subtract before adding
+         :param start_time: Start time for rate calculation, defaults to current time
+         :param end_time: End time for rate calculation, defaults to current time
+         :param numerator_type: Type of numerator for rate calculation
+         """
+         if value is None:
+             return
+
+         self.add_avg_metric(
+             group=group,
+             key=key,
+             value=value,
+             start_val=start_val,
+         )
+         start_time_key = f"{group}_{key}_start_time"
+         if self.get(start_time_key) is None:
+             if start_time is None:
+                 start_time = time.time()
+             self[start_time_key] = start_time
+         else:
+             self[start_time_key] = start_time or self[start_time_key]
+
+         end_time = end_time or time.time()
+         elapsed_time = end_time - self[start_time_key]
+
+         if elapsed_time > 0:
+             numerator_key = (
+                 f"{group}_{key}_{numerator_type}"
+                 if numerator_type != "avg"
+                 else f"{group}_{key}"
+             )
+             rate = self[numerator_key] / elapsed_time
+             self.set_metric(
+                 group=group,
+                 key=f"{key}_per_second",
+                 value=rate,
+             )
+
+     def add_time_averaged_metric(
+         self,
+         group: str,
+         key: str,
+         value: bool | int | float | None,
+         recorded_time: float | None = None,
+     ):
+         """
+         Add a value to a time-weighted average metric calculation.
+
+         :param group: Metric group identifier
+         :param key: Metric key within the group
+         :param value: Value to add to the time-weighted average
+         :param recorded_time: Time of the observation, defaults to current time
+         """
+         if value is None:
+             return
+
+         if recorded_time is None:
+             recorded_time = time.time()
+
+         time_avg_numerator_key = f"{group}_{key}_time_avg_numerator"
+         time_avg_denominator_key = f"{group}_{key}_time_avg_denominator"
+         last_recorded_time_key = f"{group}_{key}_last_recorded_time"
+         last_recorded_value_key = f"{group}_{key}_last_recorded_value"
+
+         if last_recorded_time_key not in self:
+             self[last_recorded_time_key] = recorded_time
+             self[last_recorded_value_key] = value
+             self[time_avg_numerator_key] = value
+             self[time_avg_denominator_key] = 0.0
+         else:
+             time_delta = recorded_time - self[last_recorded_time_key]
+             self[time_avg_numerator_key] += self[last_recorded_value_key] * time_delta
+             self[time_avg_denominator_key] += time_delta
+             self[last_recorded_time_key] = recorded_time
+             self[last_recorded_value_key] = value
+
+         if self[time_avg_denominator_key] > 0:
+             average = self[time_avg_numerator_key] / self[time_avg_denominator_key]
+         else:
+             average = value
+
+         self.set_metric(
+             group=group,
+             key=key,
+             value=average,
+         )
+
+
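The accumulator stores everything under flat `{group}_{key}` entries, from which the helpers derive totals, counts, and averages. A minimal usage sketch of the class above (the group name and numbers are illustrative, not from the package):

state = EstimatedBenchmarkState()

# Running average: two latency observations (seconds)
state.add_avg_metric(group="benchmark_metrics", key="latency", value=0.4)
state.add_avg_metric(group="benchmark_metrics", key="latency", value=0.6)
state.get_metric(group="benchmark_metrics", key="latency")  # 0.5
# Backed by flat keys: benchmark_metrics_latency_total == 1.0,
# benchmark_metrics_latency_count == 2

# Time-weighted average: concurrency 2 held for 10s, then 4 observed
state.add_time_averaged_metric(
    group="benchmark_metrics", key="concurrency", value=2, recorded_time=100.0
)
state.add_time_averaged_metric(
    group="benchmark_metrics", key="concurrency", value=4, recorded_time=110.0
)
# numerator = 2 (seed value) + 2 * 10; denominator = 10 -> average = 2.2
state.get_metric(group="benchmark_metrics", key="concurrency")  # 2.2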
+ class BenchmarkerArgs(StandardBaseDict):
+     """
+     Configuration parameters for benchmark execution and request sampling.
+
+     Defines run identification, request sampling strategy, warmup/cooldown phases,
+     and metric preferences for benchmark executions. Provides methods to determine
+     whether a request falls within warmup or cooldown periods based on time,
+     request count, or percentage-based thresholds.
+     """
+
+     run_id: str = Field(
+         default_factory=lambda: str(uuid.uuid4()),
+         description="Unique identifier for the benchmark run",
+     )
+     run_index: int = Field(default=0, description="Index of the benchmark run")
+     sample_requests: int | None = Field(
+         default=20,
+         description=(
+             "Number of requests to sample and keep in the final benchmark for metrics"
+         ),
+     )
+     warmup: int | float | None = Field(
+         default=None, description="Warmup time before benchmarking starts"
+     )
+     cooldown: int | float | None = Field(
+         default=None, description="Cooldown time after benchmarking ends"
+     )
+     prefer_response_metrics: bool = Field(
+         default=True,
+         description="Whether to prefer response metrics over request metrics",
+     )
+
+     def is_in_warmup(
+         self, request_info: RequestInfo, scheduler_state: SchedulerState
+     ) -> bool:
+         """
+         Check if a request is in the warmup phase.
+
+         :param request_info: Information about the current request
+         :param scheduler_state: Current state of the scheduler
+         :return: True if the request is in warmup phase, False otherwise
+         """
+         if self.warmup is not None and 0 < self.warmup < 1:
+             # Percentage-based warmup
+             return (
+                 scheduler_state.remaining_fraction is not None
+                 and scheduler_state.remaining_fraction > (1 - self.warmup)
+             )
+
+         if self.warmup is not None and self.warmup > 1:
+             # Count/time-based warmup
+             if scheduler_state.processed_requests < self.warmup:
+                 return True
+
+             current_time = request_info.timings.targeted_start
+             return (
+                 current_time is not None
+                 and (current_time - scheduler_state.start_time) < self.warmup
+             )
+
+         return False
+
+     def is_in_cooldown(
+         self, request_info: RequestInfo, scheduler_state: SchedulerState
+     ) -> bool:
+         """
+         Check if a request is in the cooldown phase.
+
+         :param request_info: Information about the current request
+         :param scheduler_state: Current state of the scheduler
+         :return: True if the request is in cooldown phase, False otherwise
+         """
+         if self.cooldown is not None and 0 < self.cooldown < 1:
+             # Percentage-based cooldown
+             return (
+                 scheduler_state.remaining_fraction is not None
+                 and scheduler_state.remaining_fraction < self.cooldown
+             )
+
+         if self.cooldown is not None and self.cooldown > 1:
+             # Count/time-based cooldown
+             if (
+                 scheduler_state.remaining_requests is not None
+                 and scheduler_state.remaining_requests <= self.cooldown
+             ):
+                 return True
+
+             current_time = (
+                 request_info.timings.resolve_end or request_info.timings.targeted_start
+             )
+             return (
+                 current_time is not None
+                 and scheduler_state.remaining_duration is not None
+                 and scheduler_state.remaining_duration < self.cooldown
+             )
+
+         return False
+
+
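Note the dual semantics of `warmup` and `cooldown` in the class above: a value strictly between 0 and 1 is treated as a fraction of the run, while a value greater than 1 is checked against both request counts and elapsed/remaining seconds. A sketch of how that reads in practice (values are illustrative):

args = BenchmarkerArgs(warmup=0.1, cooldown=30)

# warmup=0.1: a request is in warmup while more than 90% of the run remains
# (scheduler_state.remaining_fraction > 1 - 0.1)

# cooldown=30: a request is in cooldown once 30 or fewer requests remain,
# or once fewer than 30 seconds of remaining duration are left

# Both checks consult the live scheduler state:
# args.is_in_warmup(request_info, scheduler_state)
# args.is_in_cooldown(request_info, scheduler_state)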
+ class Benchmark(ABC):
+     """
+     Abstract base interface for benchmark result implementations.
+
+     Defines the contract for benchmark classes to provide run metrics sampling,
+     request metrics sampling, real-time estimate updates, and final compilation
+     of benchmark results from scheduler execution data.
+     """
+
+     @abstractmethod
+     def get_run_metrics_sample(
+         self,
+     ) -> dict[Literal["start_time", "end_time", "duration"], float]:
+         """
+         Get a sample of run-level timing metrics.
+
+         :return: Dictionary containing start_time, end_time, and duration metrics
+         """
+         ...
+
+     @abstractmethod
+     def get_request_metrics_sample(
+         self,
+     ) -> dict[
+         Literal[
+             "request_count",
+             "request_latency",
+             "request_throughput",
+             "request_concurrency",
+         ],
+         float,
+     ]:
+         """
+         Get a sample of request-level performance metrics.
+
+         :return: Dictionary containing request count, latency, throughput, and
+             concurrency metrics
+         """
+         ...
+
+     @classmethod
+     @abstractmethod
+     def update_estimate(
+         cls,
+         args: BenchmarkerArgs,
+         state: EstimatedBenchmarkState,
+         response: Any,
+         request: Any,
+         request_info: RequestInfo,
+         scheduler_state: SchedulerState,
+     ):
+         """
+         Update real-time benchmark estimates with new request data.
+
+         :param args: Benchmark configuration arguments
+         :param state: Current estimated benchmark state to update
+         :param response: Response received from the backend
+         :param request: Original request sent to the backend
+         :param request_info: Metadata about the request execution
+         :param scheduler_state: Current state of the scheduler
+         """
+         ...
+
+     @classmethod
+     @abstractmethod
+     def compile(
+         cls,
+         args: BenchmarkerArgs,
+         estimated_state: EstimatedBenchmarkState,
+         scheduler_state: SchedulerState,
+         profile: Profile,
+         requests: Iterable,
+         backend: BackendInterface,
+         environment: Environment,
+         strategy: SchedulingStrategy,
+         constraints: dict[str, dict[str, Any]],
+     ) -> Any:
+         """
+         Compile final benchmark results from accumulated state.
+
+         :param args: Benchmark configuration arguments
+         :param estimated_state: Accumulated benchmark state from execution
+         :param scheduler_state: Final state of the scheduler
+         :param profile: Benchmark profile configuration
+         :param requests: Collection of requests executed
+         :param backend: Backend interface used for execution
+         :param environment: Execution environment configuration
+         :param strategy: Scheduling strategy used
+         :param constraints: Execution constraints applied
+         :return: Compiled benchmark results instance
+         """
+         ...
+
+
+ BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)
+
+
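Concrete benchmarks implement the four hooks above. A skeletal subclass, purely to show the shape of the contract (the name and bodies are hypothetical, not part of the package):

class MinimalBenchmark(Benchmark):
    # Hypothetical implementation: records nothing beyond run boundaries.

    def __init__(self, start: float = 0.0, end: float = 0.0):
        self.start, self.end = start, end

    def get_run_metrics_sample(self):
        return {
            "start_time": self.start,
            "end_time": self.end,
            "duration": self.end - self.start,
        }

    def get_request_metrics_sample(self):
        return {
            "request_count": 0.0,
            "request_latency": 0.0,
            "request_throughput": 0.0,
            "request_concurrency": 0.0,
        }

    @classmethod
    def update_estimate(
        cls, args, state, response, request, request_info, scheduler_state
    ):
        # Mark the shared state as touched; real implementations accumulate here
        state.set_metric(group=state.benchmark_state_group, key="updated", value=True)

    @classmethod
    def compile(
        cls, args, estimated_state, scheduler_state, profile, requests,
        backend, environment, strategy, constraints,
    ):
        return cls(scheduler_state.start_time, scheduler_state.end_time or 0.0)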
+ class BenchmarkSchedulerStats(StandardBaseDict):
+     """Scheduler timing and performance statistics."""
+
+     group_name: ClassVar[Literal["scheduler_stats"]] = "scheduler_stats"
+
+     start_time: float = Field(
+         description="Unix timestamp when the benchmark run started"
+     )
+     end_time: float = Field(description="Unix timestamp when the benchmark run ended")
+     requests_made: StatusBreakdown[int, int, int, int] = Field(
+         description="Request counts by status: successful, incomplete, errored, total"
+     )
+     queued_time_avg: float = Field(
+         description="Avg time requests spent in the queue (seconds)"
+     )
+     worker_resolve_start_delay_avg: float = Field(
+         description="Avg delay before worker begins resolving req after dequeue (sec)"
+     )
+     worker_resolve_time_avg: float = Field(
+         description="Avg time for worker to resolve requests (seconds)"
+     )
+     worker_resolve_end_delay_avg: float = Field(
+         description="Avg delay after request end until worker resolves (seconds)"
+     )
+     finalized_delay_avg: float = Field(
+         description="Avg delay after resolve until finalized within scheduler (sec)"
+     )
+     worker_targeted_start_delay_avg: float = Field(
+         description="Avg delay from targeted start to actual worker start (seconds)"
+     )
+     request_start_delay_avg: float = Field(
+         description="Avg delay after resolve until request start (seconds)"
+     )
+     request_time_avg: float = Field(description="Avg request processing time (seconds)")
+     request_targeted_start_delay_avg: float = Field(
+         description="Avg delay from targeted start to actual request start"
+     )
+
+     @classmethod
+     def update_estimate(cls, state: EstimatedBenchmarkState, request_info: RequestInfo):
+         """
+         Update estimated scheduler statistics with request timing information.
+
+         :param state: Current estimated benchmark state to update
+         :param request_info: Metadata about the request execution with timing data
+         """
+         state.set_metric(group=cls.group_name, key="updated", value=True)
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="queued_time",
+             value=request_info.timings.dequeued,
+             start_val=request_info.timings.queued,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="worker_resolve_start_delay",
+             value=request_info.timings.resolve_start,
+             start_val=request_info.timings.scheduled_at,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="worker_resolve_time",
+             value=request_info.timings.resolve_end,
+             start_val=request_info.timings.resolve_start,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="worker_resolve_end_delay",
+             value=request_info.timings.request_end,
+             start_val=request_info.timings.resolve_end,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="finalized_delay",
+             value=request_info.timings.finalized,
+             start_val=request_info.timings.resolve_end,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="worker_targeted_start_delay",
+             value=request_info.timings.resolve_start,
+             start_val=request_info.timings.targeted_start,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="request_start_delay",
+             value=request_info.timings.request_start,
+             start_val=request_info.timings.resolve_start,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="request_time",
+             value=request_info.timings.request_end,
+             start_val=request_info.timings.request_start,
+         )
+         state.add_avg_metric(
+             group=cls.group_name,
+             key="request_targeted_start_delay",
+             value=request_info.timings.request_start,
+             start_val=request_info.timings.targeted_start,
+         )
+
+     @classmethod
+     def compile(
+         cls, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState
+     ) -> BenchmarkSchedulerStats:
+         """
+         Compile final scheduler statistics from accumulated state.
+
+         :param estimated_state: Accumulated benchmark state with scheduler metrics
+         :param scheduler_state: Final state of the scheduler
+         :return: Compiled scheduler statistics instance
+         """
+         return BenchmarkSchedulerStats(
+             start_time=scheduler_state.start_time,
+             end_time=scheduler_state.end_time or scheduler_state.start_time,
+             requests_made=StatusBreakdown[int, int, int, int](
+                 successful=scheduler_state.successful_requests,
+                 incomplete=scheduler_state.cancelled_requests,
+                 errored=scheduler_state.errored_requests,
+                 total=(
+                     scheduler_state.successful_requests
+                     + scheduler_state.cancelled_requests
+                     + scheduler_state.errored_requests
+                 ),
+             ),
+             queued_time_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="queued_time", default=-1.0
+                 ),
+             ),
+             worker_resolve_start_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="worker_resolve_start_delay", default=-1.0
+                 ),
+             ),
+             worker_resolve_time_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="worker_resolve_time", default=-1.0
+                 ),
+             ),
+             worker_resolve_end_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="worker_resolve_end_delay", default=-1.0
+                 ),
+             ),
+             finalized_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="finalized_delay", default=-1.0
+                 ),
+             ),
+             worker_targeted_start_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name,
+                     key="worker_targeted_start_delay",
+                     default=-1.0,
+                 ),
+             ),
+             request_start_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="request_start_delay", default=-1.0
+                 ),
+             ),
+             request_time_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name, key="request_time", default=-1.0
+                 ),
+             ),
+             request_targeted_start_delay_avg=cast(
+                 "float",
+                 estimated_state.get_metric(
+                     group=cls.group_name,
+                     key="request_targeted_start_delay",
+                     default=-1.0,
+                 ),
+             ),
+         )
+
+
+ class GenerativeMetricsSummary(StandardBaseDict):
+     """
+     Statistical summaries for input, output, and total metrics.
+
+     Provides distribution summaries across successful, incomplete, and errored
+     requests for absolute values, per-second rates, and concurrency levels.
+     """
+
+     input: StatusDistributionSummary = Field(
+         description="Distribution of input metric values"
+     )
+     input_per_second: StatusDistributionSummary = Field(
+         description="Distribution of input metric rates per second"
+     )
+     input_concurrency: StatusDistributionSummary = Field(
+         description="Distribution of concurrent input metric values"
+     )
+
+     output: StatusDistributionSummary = Field(
+         description="Distribution of output metric values"
+     )
+     output_per_second: StatusDistributionSummary = Field(
+         description="Distribution of output metric rates per second"
+     )
+     output_concurrency: StatusDistributionSummary = Field(
+         description="Distribution of concurrent output metric values"
+     )
+
+     total: StatusDistributionSummary = Field(
+         description="Distribution of total metric values (input + output)"
+     )
+     total_per_second: StatusDistributionSummary = Field(
+         description="Distribution of total metric rates per second"
+     )
+     total_concurrency: StatusDistributionSummary = Field(
+         description="Distribution of concurrent total metric values"
+     )
+
+     @classmethod
+     def compile(
+         cls,
+         request_types: list[Literal["successful", "incomplete", "error"]],
+         request_times: list[tuple[float, float]],
+         input_values: list[int | float],
+         output_values: list[int | float],
+     ) -> GenerativeMetricsSummary:
+         """
+         Compile generative metrics summary from request data.
+
+         :param request_types: Status types for each request
+         :param request_times: Start and end times for each request
+         :param input_values: Input metric values for each request
+         :param output_values: Output metric values for each request
+         :return: Compiled generative metrics summary
+         """
+         total_values = [
+             input_val + output_val
+             for input_val, output_val in zip(input_values, output_values, strict=False)
+         ]
+
+         return GenerativeMetricsSummary(
+             input=StatusDistributionSummary.from_values(
+                 value_types=request_types,
+                 values=input_values,
+             ),
+             input_per_second=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="rate",
+                 weights=input_values,
+             ),
+             input_concurrency=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="concurrency",
+                 weights=input_values,
+             ),
+             output=StatusDistributionSummary.from_values(
+                 value_types=request_types,
+                 values=output_values,
+             ),
+             output_per_second=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="rate",
+                 weights=output_values,
+             ),
+             output_concurrency=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="concurrency",
+                 weights=output_values,
+             ),
+             total=StatusDistributionSummary.from_values(
+                 value_types=request_types,
+                 values=total_values,
+             ),
+             total_per_second=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="rate",
+                 weights=total_values,
+             ),
+             total_concurrency=StatusDistributionSummary.from_request_times(
+                 request_types=request_types,
+                 requests=request_times,
+                 distribution_type="concurrency",
+                 weights=total_values,
+             ),
+         )
+
+
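Each summary fans one pair of per-request value lists out into nine distributions (values, rates, and concurrency for input, output, and total). An illustrative call with made-up numbers:

summary = GenerativeMetricsSummary.compile(
    request_types=["successful", "successful", "error"],
    request_times=[(0.0, 1.0), (0.5, 2.0), (1.0, 1.5)],  # (start, end) pairs
    input_values=[128, 256, 64],   # e.g. input tokens per request
    output_values=[32, 48, 0],
)
# summary.total is built from element-wise sums: [160, 304, 64]
# summary.input_per_second / summary.input_concurrency weight the request
# time ranges by the input values ("rate" vs "concurrency" distributions)
summary.input.successful.mean  # mean over the two successful requests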
+ class GenerativeTextMetricsSummary(StandardBaseDict):
+     """
+     Text-specific metric summaries for generative benchmarks.
+
+     Tracks token, word, and character-level metrics across input, output, and
+     total usage for text generation workloads.
+     """
+
+     tokens: GenerativeMetricsSummary = Field(
+         description="Token count metrics and distributions"
+     )
+     words: GenerativeMetricsSummary = Field(
+         description="Word count metrics and distributions"
+     )
+     characters: GenerativeMetricsSummary = Field(
+         description="Character count metrics and distributions"
+     )
+
+     @classmethod
+     def compile(
+         cls,
+         request_types: list[Literal["successful", "incomplete", "error"]],
+         request_times: list[tuple[float, float]],
+         input_metrics: list[UsageMetrics],
+         output_metrics: list[UsageMetrics],
+     ) -> GenerativeTextMetricsSummary:
+         """
+         Compile text metrics summary from request usage data.
+
+         :param request_types: Status types for each request
+         :param request_times: Start and end times for each request
+         :param input_metrics: Input usage metrics for each request
+         :param output_metrics: Output usage metrics for each request
+         :return: Compiled text metrics summary
+         """
+         return GenerativeTextMetricsSummary(
+             tokens=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.text_tokens or 0 for metrics in input_metrics],
+                 output_values=[metrics.text_tokens or 0 for metrics in output_metrics],
+             ),
+             words=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.text_words or 0 for metrics in input_metrics],
+                 output_values=[metrics.text_words or 0 for metrics in output_metrics],
+             ),
+             characters=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[
+                     metrics.text_characters or 0 for metrics in input_metrics
+                 ],
+                 output_values=[
+                     metrics.text_characters or 0 for metrics in output_metrics
+                 ],
+             ),
+         )
+
+
+ class GenerativeImageMetricsSummary(StandardBaseDict):
+     """
+     Image-specific metric summaries for generative benchmarks.
+
+     Tracks token, image count, pixel, and byte-level metrics across input, output,
+     and total usage for image generation workloads.
+     """
+
+     tokens: GenerativeMetricsSummary = Field(
+         description="Image token count metrics and distributions"
+     )
+     images: GenerativeMetricsSummary = Field(
+         description="Image count metrics and distributions"
+     )
+     pixels: GenerativeMetricsSummary = Field(
+         description="Pixel count metrics and distributions"
+     )
+     bytes: GenerativeMetricsSummary = Field(
+         description="Byte size metrics and distributions"
+     )
+
+     @classmethod
+     def compile(
+         cls,
+         request_types: list[Literal["successful", "incomplete", "error"]],
+         request_times: list[tuple[float, float]],
+         input_metrics: list[UsageMetrics],
+         output_metrics: list[UsageMetrics],
+     ) -> GenerativeImageMetricsSummary:
+         """
+         Compile image metrics summary from request usage data.
+
+         :param request_types: Status types for each request
+         :param request_times: Start and end times for each request
+         :param input_metrics: Input usage metrics for each request
+         :param output_metrics: Output usage metrics for each request
+         :return: Compiled image metrics summary
+         """
+         return GenerativeImageMetricsSummary(
+             tokens=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.image_tokens or 0 for metrics in input_metrics],
+                 output_values=[metrics.image_tokens or 0 for metrics in output_metrics],
+             ),
+             images=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.image_count or 0 for metrics in input_metrics],
+                 output_values=[metrics.image_count or 0 for metrics in output_metrics],
+             ),
+             pixels=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.image_pixels or 0 for metrics in input_metrics],
+                 output_values=[metrics.image_pixels or 0 for metrics in output_metrics],
+             ),
+             bytes=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.image_bytes or 0 for metrics in input_metrics],
+                 output_values=[metrics.image_bytes or 0 for metrics in output_metrics],
+             ),
+         )
+
+
+ class GenerativeVideoMetricsSummary(StandardBaseDict):
+     """
+     Video-specific metric summaries for generative benchmarks.
+
+     Tracks token, frame count, duration, and byte-level metrics across input,
+     output, and total usage for video generation workloads.
+     """
+
+     tokens: GenerativeMetricsSummary = Field(
+         description="Video token count metrics and distributions"
+     )
+     frames: GenerativeMetricsSummary = Field(
+         description="Frame count metrics and distributions"
+     )
+     seconds: GenerativeMetricsSummary = Field(
+         description="Duration metrics in seconds and distributions"
+     )
+     bytes: GenerativeMetricsSummary = Field(
+         description="Byte size metrics and distributions"
+     )
+
+     @classmethod
+     def compile(
+         cls,
+         request_types: list[Literal["successful", "incomplete", "error"]],
+         request_times: list[tuple[float, float]],
+         input_metrics: list[UsageMetrics],
+         output_metrics: list[UsageMetrics],
+     ) -> GenerativeVideoMetricsSummary:
+         """
+         Compile video metrics summary from request usage data.
+
+         :param request_types: Status types for each request
+         :param request_times: Start and end times for each request
+         :param input_metrics: Input usage metrics for each request
+         :param output_metrics: Output usage metrics for each request
+         :return: Compiled video metrics summary
+         """
+         return GenerativeVideoMetricsSummary(
+             tokens=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.video_tokens or 0 for metrics in input_metrics],
+                 output_values=[metrics.video_tokens or 0 for metrics in output_metrics],
+             ),
+             frames=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.video_frames or 0 for metrics in input_metrics],
+                 output_values=[metrics.video_frames or 0 for metrics in output_metrics],
+             ),
+             seconds=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.video_seconds or 0 for metrics in input_metrics],
+                 output_values=[
+                     metrics.video_seconds or 0 for metrics in output_metrics
+                 ],
+             ),
+             bytes=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.video_bytes or 0 for metrics in input_metrics],
+                 output_values=[metrics.video_bytes or 0 for metrics in output_metrics],
+             ),
+         )
+
+
+ class GenerativeAudioMetricsSummary(StandardBaseDict):
+     """
+     Audio-specific metric summaries for generative benchmarks.
+
+     Tracks token, sample count, duration, and byte-level metrics across input,
+     output, and total usage for audio generation workloads.
+     """
+
+     tokens: GenerativeMetricsSummary = Field(
+         description="Audio token count metrics and distributions"
+     )
+     samples: GenerativeMetricsSummary = Field(
+         description="Sample count metrics and distributions"
+     )
+     seconds: GenerativeMetricsSummary = Field(
+         description="Duration metrics in seconds and distributions"
+     )
+     bytes: GenerativeMetricsSummary = Field(
+         description="Byte size metrics and distributions"
+     )
+
+     @classmethod
+     def compile(
+         cls,
+         request_types: list[Literal["successful", "incomplete", "error"]],
+         request_times: list[tuple[float, float]],
+         input_metrics: list[UsageMetrics],
+         output_metrics: list[UsageMetrics],
+     ) -> GenerativeAudioMetricsSummary:
+         """
+         Compile audio metrics summary from request usage data.
+
+         :param request_types: Status types for each request
+         :param request_times: Start and end times for each request
+         :param input_metrics: Input usage metrics for each request
+         :param output_metrics: Output usage metrics for each request
+         :return: Compiled audio metrics summary
+         """
+         return GenerativeAudioMetricsSummary(
+             tokens=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.audio_tokens or 0 for metrics in input_metrics],
+                 output_values=[metrics.audio_tokens or 0 for metrics in output_metrics],
+             ),
+             samples=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.audio_samples or 0 for metrics in input_metrics],
+                 output_values=[
+                     metrics.audio_samples or 0 for metrics in output_metrics
+                 ],
+             ),
+             seconds=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.audio_seconds or 0 for metrics in input_metrics],
+                 output_values=[
+                     metrics.audio_seconds or 0 for metrics in output_metrics
+                 ],
+             ),
+             bytes=GenerativeMetricsSummary.compile(
+                 request_types=request_types,
+                 request_times=request_times,
+                 input_values=[metrics.audio_bytes or 0 for metrics in input_metrics],
+                 output_values=[metrics.audio_bytes or 0 for metrics in output_metrics],
+             ),
+         )
+
+
+ class GenerativeMetrics(StandardBaseDict):
+     """Comprehensive metrics for generative AI benchmarks."""
+
+     # Request stats
+     requests_per_second: StatusDistributionSummary = Field(
+         description="Distribution of requests per second across benchmark execution"
+     )
+     request_concurrency: StatusDistributionSummary = Field(
+         description="Distribution of concurrent request counts during execution"
+     )
+     request_latency: StatusDistributionSummary = Field(
+         description="Distribution of request latencies for completed requests"
+     )
+     request_streaming_iterations_count: StatusDistributionSummary = Field(
+         description="Distribution of stream iterations for completed requests"
+     )
+
+     # General token stats
+     prompt_token_count: StatusDistributionSummary = Field(
+         description="Distribution of prompt token counts by request status"
+     )
+     output_token_count: StatusDistributionSummary = Field(
+         description="Distribution of output token counts by request status"
+     )
+     total_token_count: StatusDistributionSummary = Field(
+         description="Distribution of total token counts by request status"
+     )
+     time_to_first_token_ms: StatusDistributionSummary = Field(
+         description="Distribution of first token latencies in milliseconds"
+     )
+     time_per_output_token_ms: StatusDistributionSummary = Field(
+         description="Distribution of average time per output token in milliseconds"
+     )
+     inter_token_latency_ms: StatusDistributionSummary = Field(
+         description="Distribution of inter-token latencies in milliseconds"
+     )
+     output_tokens_wo_first_per_iteration: StatusDistributionSummary = Field(
+         description=(
+             "Distribution of output tokens (without first) generated per "
+             "streaming iteration"
+         )
+     )
+     output_tokens_per_second: StatusDistributionSummary = Field(
+         description="Distribution of output token generation rates"
+     )
+     output_tokens_per_iteration: StatusDistributionSummary = Field(
+         description="Distribution of output tokens generated per streaming iteration"
+     )
+     tokens_per_second: StatusDistributionSummary = Field(
+         description="Distribution of total token throughput including prompt and output"
+     )
+
+     # Domain specific stats
+     text: GenerativeTextMetricsSummary = Field(
+         description="Text-specific metrics for tokens, words, and characters"
+     )
+     image: GenerativeImageMetricsSummary = Field(
+         description="Image-specific metrics for tokens, images, pixels, and bytes"
+     )
+     video: GenerativeVideoMetricsSummary = Field(
+         description="Video-specific metrics for tokens, frames, duration, and bytes"
+     )
+     audio: GenerativeAudioMetricsSummary = Field(
+         description="Audio-specific metrics for tokens, samples, duration, and bytes"
+     )
+
+     @classmethod
+     def update_estimate(
+         cls,
+         state: EstimatedBenchmarkState,
+         response: GenerationResponse | None,
+         request: GenerationRequest,
+         request_info: RequestInfo,
+         scheduler_state: SchedulerState,
+     ):
+         """
+         Update real-time generative metrics estimates with new request data.
+
+         :param state: Current estimated benchmark state to update
+         :param response: Response received from the backend
+         :param request: Original request sent to the backend
+         :param request_info: Metadata about the request execution
+         :param scheduler_state: Current state of the scheduler
+         """
+         benchmark_start_time = scheduler_state.start_time
+         request_start_time = (
+             request_info.timings.request_start or request_info.timings.resolve_start
+         )
+         request_end_time = (
+             request_info.timings.request_end or request_info.timings.resolve_end
+         )
+         event_occurence_time = (
+             request_info.timings.queued
+             if request_info.status == "queued"
+             else (
+                 request_info.timings.dequeued
+                 if request_info.status == "pending"
+                 else request_start_time
+                 if request_info.status == "in_progress"
+                 else request_end_time
+             )
+         )
+         benchmark_duration = (
+             event_occurence_time - benchmark_start_time
+             if event_occurence_time
+             else None
+         )
+         request_duration = (
+             (request_end_time - request_start_time)
+             if request_end_time and request_start_time else None
+         )
+
+         # Always track concurrency
+         if event_occurence_time is not None:
+             state.add_time_averaged_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key="concurrency_requests",
+                 value=scheduler_state.processing_requests,
+                 recorded_time=event_occurence_time,
+             )
+
+         if request_info.status not in {"completed", "errored", "cancelled"}:
+             return
+
+         state.set_metric(
+             group=EstimatedBenchmarkState.benchmark_metrics_group,
+             key="updated",
+             value=True,
+         )
+
+         for prefix in (request_info.status, "total"):
+             requests_count = (
+                 scheduler_state.successful_requests
+                 if prefix == "completed"
+                 else scheduler_state.errored_requests
+                 if prefix == "errored"
+                 else scheduler_state.cancelled_requests
+                 if prefix == "cancelled"
+                 else scheduler_state.processed_requests
+             )
+             input_tokens = (
+                 (response.input_metrics.total_tokens if response else None)
+                 or request.input_metrics.total_tokens
+                 or 0
+             )
+             output_tokens = (
+                 (response.output_metrics.total_tokens if response else None)
+                 or request.output_metrics.total_tokens
+                 or 0
+             )
+
+             # Request distribution stats
+             state.set_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_requests",
+                 value=requests_count,
+             )
+             state.set_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_requests_per_second",
+                 value=(
+                     requests_count / benchmark_duration if benchmark_duration else None
+                 ),
+             )
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_request_latency",
+                 value=request_duration,
+             )
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_request_streaming_iterations",
+                 value=request_info.timings.iterations or 0,
+             )
+
+             # Token iteration stats
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key="output_tokens_iterations",
+                 value=output_tokens,
+                 count=request_info.timings.iterations or 1,
+             )
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key="output_tokens_wo_first_iterations",
+                 value=output_tokens - 1 if output_tokens > 1 else 0,
+                 count=request_info.timings.iterations or 1,
+             )
+
+             # Token metrics stats
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_time_to_first_token",
+                 value=request_info.timings.first_iteration,
+                 start_val=request_start_time,
+             )
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_inter_token_latency",
+                 value=request_info.timings.last_iteration,
+                 start_val=request_info.timings.first_iteration,
+                 count=(output_tokens or 1) - 1,
+             )
+             state.add_avg_metric(
+                 group=EstimatedBenchmarkState.benchmark_metrics_group,
+                 key=f"{prefix}_time_per_output_token",
+                 value=request_duration,
+                 count=output_tokens or 0,
+             )
+
+             # Input/output throughput stats
+             if event_occurence_time is not None:
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="input_tokens",
+                     value=input_tokens,
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="output_tokens",
+                     value=output_tokens,
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="total_tokens",
+                     value=input_tokens + output_tokens,
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="input_text_tokens",
+                     value=(
+                         (response.input_metrics.text_tokens if response else None)
+                         or request.input_metrics.text_tokens
+                         or 0
+                     ),
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="input_images",
+                     value=(
+                         (response.input_metrics.image_count if response else None)
+                         or request.input_metrics.image_count
+                         or 0
+                     ),
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="input_video_frames",
+                     value=(
+                         (response.input_metrics.video_frames if response else None)
+                         or request.input_metrics.video_frames
+                         or 0
+                     ),
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+                 state.add_avg_rate_metric(
+                     group=EstimatedBenchmarkState.benchmark_metrics_group,
+                     key="input_audio_seconds",
+                     value=request.input_metrics.audio_seconds or 0,
+                     start_time=benchmark_start_time,
+                     end_time=event_occurence_time,
+                 )
+
+ @classmethod
1311
+ def compile(
1312
+ cls,
1313
+ completed: list[GenerativeRequestStats],
1314
+ errored: list[GenerativeRequestStats],
1315
+ incomplete: list[GenerativeRequestStats],
1316
+ ) -> GenerativeMetrics:
1317
+ """
1318
+ Compile final generative metrics from request statistics.
1319
+
1320
+ :param completed: Successfully completed request statistics
1321
+ :param errored: Failed request statistics
1322
+ :param incomplete: Incomplete/cancelled request statistics
1323
+ :return: Compiled generative metrics with full distributions
1324
+ """
1325
+ requests = completed + errored + incomplete
1326
+ request_types = cast(
1327
+ "list[Literal['successful', 'error', 'incomplete']]",
1328
+ ["successful"] * len(completed)
1329
+ + ["error"] * len(errored)
1330
+ + ["incomplete"] * len(incomplete),
1331
+ )
1332
+ request_times = [
1333
+ (
1334
+ req.info.timings.request_start or req.info.timings.resolve_start or 0,
1335
+ req.info.timings.request_end or req.info.timings.resolve_end or 0,
1336
+ )
1337
+ for req in requests
1338
+ ]
+        input_metrics = [req.input_metrics for req in requests]
+        output_metrics = [req.output_metrics for req in requests]
+
+        return GenerativeMetrics(
+            # Request stats
+            requests_per_second=StatusDistributionSummary.from_request_times(
+                request_types=request_types,
+                requests=request_times,
+                distribution_type="rate",
+            ),
+            request_concurrency=StatusDistributionSummary.from_request_times(
+                request_types=request_types,
+                requests=request_times,
+                distribution_type="concurrency",
+            ),
+            request_latency=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.request_latency or 0.0 for req in requests],
+            ),
+            request_streaming_iterations_count=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[float(req.info.timings.iterations or 0) for req in requests],
+            ),
+            # General token stats
+            prompt_token_count=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[float(req.prompt_tokens or 0) for req in requests],
+            ),
+            output_token_count=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[float(req.output_tokens or 0) for req in requests],
+            ),
+            total_token_count=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[float(req.total_tokens or 0) for req in requests],
+            ),
+            time_to_first_token_ms=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.time_to_first_token_ms or 0.0 for req in requests],
+            ),
+            time_per_output_token_ms=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.time_per_output_token_ms or 0.0 for req in requests],
+            ),
+            inter_token_latency_ms=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.inter_token_latency_ms or 0.0 for req in requests],
+            ),
+            output_tokens_wo_first_per_iteration=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[
+                    max(0.0, (req.output_metrics.total_tokens or 1.0) - 1.0)
+                    for req in requests
+                ],
+                weights=[req.info.timings.iterations or 1 for req in requests],
+            ),
+            output_tokens_per_second=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.output_tokens_per_second or 0.0 for req in requests],
+            ),
+            output_tokens_per_iteration=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.output_tokens_per_iteration or 0.0 for req in requests],
+                weights=[req.info.timings.iterations or 1 for req in requests],
+            ),
+            tokens_per_second=StatusDistributionSummary.from_values(
+                value_types=request_types,
+                values=[req.tokens_per_second or 0.0 for req in requests],
+            ),
+            # Domain-specific stats
+            text=GenerativeTextMetricsSummary.compile(
+                request_types=request_types,
+                request_times=request_times,
+                input_metrics=input_metrics,
+                output_metrics=output_metrics,
+            ),
+            image=GenerativeImageMetricsSummary.compile(
+                request_types=request_types,
+                request_times=request_times,
+                input_metrics=input_metrics,
+                output_metrics=output_metrics,
+            ),
+            video=GenerativeVideoMetricsSummary.compile(
+                request_types=request_types,
+                request_times=request_times,
+                input_metrics=input_metrics,
+                output_metrics=output_metrics,
+            ),
+            audio=GenerativeAudioMetricsSummary.compile(
+                request_types=request_types,
+                request_times=request_times,
+                input_metrics=input_metrics,
+                output_metrics=output_metrics,
+            ),
+        )
+
+
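The two per-iteration distributions above (output_tokens_wo_first_per_iteration and output_tokens_per_iteration) pass weights= alongside values=, so each request contributes in proportion to its streaming iteration count rather than counting once. Assuming StatusDistributionSummary.from_values applies weights in the standard weighted-average sense, the effect looks like this sketch with made-up numbers:

    # Illustration only: per-iteration values for two requests,
    # weighted by each request's iteration count.
    values = [10.0, 4.0]
    weights = [3, 1]
    weighted_mean = sum(v * w for v, w in zip(values, weights)) / sum(weights)
    print(weighted_mean)  # 8.5 -> the request with more iterations dominates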
+class SchedulerDict(StandardBaseDict):
+    """Scheduler configuration and execution state dictionary."""
+
+    strategy: SchedulingStrategy = Field(
+        description="Scheduling strategy used for request distribution"
+    )
+    constraints: dict[str, dict[str, Any]] = Field(
+        description="Execution constraints applied during benchmarking"
+    )
+    state: SchedulerState = Field(
+        description="Final state of the scheduler after execution"
+    )
+
+
+class BenchmarkerDict(StandardBaseDict):
+    """Benchmarker configuration and component settings dictionary."""
+
+    profile: Profile = Field(description="Benchmark profile configuration")
+    requests: dict[str, Any] = Field(
+        description="Request configuration and dataset information"
+    )
+    backend: dict[str, Any] = Field(
+        description="Backend configuration and connection details"
+    )
+    environment: dict[str, Any] = Field(
+        description="Execution environment configuration"
+    )
+
+
+class GenerativeBenchmark(Benchmark, StandardBaseDict):
+    """Complete generative AI benchmark results with specialized metrics."""
+
+    group_name: ClassVar[Literal["generative_benchmark"]] = "generative_benchmark"
+
+    type_: Literal["generative_benchmark"] = "generative_benchmark"  # type: ignore[assignment]
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for this benchmark execution",
+    )
+    run_id: str = Field(
+        description="Identifier for the benchmarker run containing this benchmark"
+    )
+    run_index: int = Field(
+        description="Sequential index of this benchmark within the benchmarker run"
+    )
+    scheduler: SchedulerDict = Field(
+        description="Scheduler configuration and execution state"
+    )
+    benchmarker: BenchmarkerDict = Field(
+        description="Benchmarker configuration and component settings"
+    )
+    run_stats: BenchmarkSchedulerStats = Field(
+        description="Scheduler timing and performance statistics"
+    )
+    start_time: float = Field(
+        default=-1.0, description="Unix timestamp when the first request was initiated"
+    )
+    end_time: float = Field(
+        default=-1.0, description="Unix timestamp when the last request completed"
+    )
+
+    def get_run_metrics_sample(
+        self,
+    ) -> dict[Literal["start_time", "end_time", "duration"], float]:
+        """Return the run-level timing values as a flat sample dictionary."""
+        return {
+            "start_time": self.start_time,
+            "end_time": self.end_time,
+            "duration": self.duration,
+        }
+
+    def get_request_metrics_sample(
+        self,
+    ) -> dict[
+        Literal[
+            "request_count",
+            "request_latency",
+            "request_throughput",
+            "request_concurrency",
+        ],
+        float,
+    ]:
+        """Return headline metrics (count plus mean latency, throughput, and
+        concurrency) for successful requests."""
+        return {
+            "request_count": self.request_totals.successful,
+            "request_latency": self.metrics.request_latency.successful.mean,
+            "request_throughput": self.metrics.requests_per_second.successful.mean,
+            "request_concurrency": self.metrics.request_concurrency.successful.mean,
+        }
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def duration(self) -> float:
+        """
+        Benchmark execution duration in seconds.
+
+        :return: Time elapsed from first request start to last request completion.
+        """
+        return self.end_time - self.start_time
+
+    metrics: GenerativeMetrics = Field(
+        description="Performance metrics and statistical distributions"
+    )
+    request_totals: StatusBreakdown[int, int, int, int] = Field(
+        description="Request counts by status: successful, incomplete, errored, total"
+    )
+    requests: StatusBreakdown[
+        list[GenerativeRequestStats],
+        list[GenerativeRequestStats],
+        list[GenerativeRequestStats],
+        None,
+    ] = Field(
+        description="Request details grouped by status: successful, incomplete, errored"
+    )
+
+    @classmethod
+    def update_estimate(
+        cls,
+        args: BenchmarkerArgs,
+        state: EstimatedBenchmarkState,
+        response: GenerationResponse | None,
+        request: GenerationRequest,
+        request_info: RequestInfo,
+        scheduler_state: SchedulerState,
+    ) -> None:
+        """
+        Update generative benchmark estimates with new request data.
+
+        Handles warmup/cooldown filtering, request sampling via reservoir sampling,
+        and delegates metric updates to child metric classes.
+
+        :param args: Benchmark configuration arguments
+        :param state: Current estimated benchmark state to update
+        :param response: Response received from the backend
+        :param request: Original request sent to the backend
+        :param request_info: Metadata about the request execution
+        :param scheduler_state: Current state of the scheduler
+        """
+        if (
+            request_info.status == "cancelled"
+            and request_info.timings.resolve_start is None
+        ):
+            # Cancelled requests that never started should be ignored
+            return
+
+        # Update child metric groups
+        BenchmarkSchedulerStats.update_estimate(state, request_info)
+        GenerativeMetrics.update_estimate(
+            state, response, request, request_info, scheduler_state
+        )
+
+        # Store requests and sampling info, update counts
+        if "requests_completed" not in state:
+            state["requests_completed"] = []
+            state["samples_completed"] = []
+            state["requests_errored"] = []
+            state["samples_errored"] = []
+            state["requests_incomplete"] = []
+            state["samples_incomplete"] = []
+        in_warmup = state.set_metric(
+            group=EstimatedBenchmarkState.benchmark_state_group,
+            key="in_warmup",
+            value=args.is_in_warmup(request_info, scheduler_state),
+        )
+        in_cooldown = state.set_metric(
+            group=EstimatedBenchmarkState.benchmark_state_group,
+            key="in_cooldown",
+            value=args.is_in_cooldown(request_info, scheduler_state),
+        )
+        state[f"{EstimatedBenchmarkState.benchmark_state_group}_status"] = (
+            "in_cooldown"
+            if in_cooldown
+            else "in_warmup"
+            if in_warmup
+            else "in_progress"
+        )
+
+        if (
+            request_info.status not in {"completed", "errored", "cancelled"}
+            or in_warmup
+            or in_cooldown
+        ):
+            # Must be fully resolved to be added
+            return
+
+        state.set_metric(
+            group=EstimatedBenchmarkState.benchmark_state_group,
+            key="updated",
+            value=True,
+        )
+
+        if response is None:
+            response = GenerationResponse(
+                request_id=request.request_id, request_args=str(request.arguments)
+            )
+
+        stats = response.compile_stats(
+            request, request_info, args.prefer_response_metrics
+        )
+
+        # Determine status and get the corresponding lists
+        if request_info.status == "completed":
+            requests_list = state["requests_completed"]
+            samples_list = state["samples_completed"]
+        elif request_info.status == "errored":
+            requests_list = state["requests_errored"]
+            samples_list = state["samples_errored"]
+        else:  # cancelled (incomplete)
+            requests_list = state["requests_incomplete"]
+            samples_list = state["samples_incomplete"]
+
+        # Add to the requests list
+        requests_list.append(stats)
+        current_index = len(requests_list) - 1
+
+        # Handle request sampling logic
+        if args.sample_requests is None:
+            # No sampling configured: keep the index of every request
+            samples_list.append(current_index)
+        elif args.sample_requests > 0 and len(samples_list) < args.sample_requests:
+            # Reservoir not yet full: keep the index
+            samples_list.append(current_index)
+        elif (
+            args.sample_requests > 0
+            and (replace_index := random.randrange(len(requests_list)))
+            < args.sample_requests
+        ):
+            # Reservoir full: replace a random slot (reservoir sampling)
+            samples_list[replace_index] = current_index
+        # Sampling set to 0: don't keep any requests
+
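The sampling branches above implement reservoir sampling (Algorithm R) over request indices: the first sample_requests requests are kept outright, and each later request replaces a uniformly chosen slot with probability sample_requests / len(requests_list). Because requests_list already contains the current request when randrange(len(requests_list)) is drawn, the draw covers index + 1 candidates, which is exactly Algorithm R. A self-contained sketch of the same logic (the function name here is illustrative, not part of the module):

    import random

    def reservoir_indices(total: int, k: int) -> list[int]:
        # Keep a uniform random sample of k indices out of `total` items.
        samples: list[int] = []
        for index in range(total):
            if len(samples) < k:
                samples.append(index)  # reservoir not yet full
            elif (slot := random.randrange(index + 1)) < k:
                samples[slot] = index  # replace with probability k / (index + 1)
        return samples

    print(sorted(reservoir_indices(1000, 10)))  # 10 uniformly sampled indices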
+    @classmethod
+    def compile(
+        cls,
+        args: BenchmarkerArgs,
+        estimated_state: EstimatedBenchmarkState,
+        scheduler_state: SchedulerState,
+        profile: Profile,
+        requests: Iterable,
+        backend: BackendInterface,
+        environment: Environment,
+        strategy: SchedulingStrategy,
+        constraints: dict[str, dict[str, Any]],
+        data: list[Any],
+    ) -> GenerativeBenchmark:
+        """
+        Compile the final generative benchmark from accumulated state.
+
+        :param args: Benchmark configuration arguments
+        :param estimated_state: Accumulated benchmark state from execution
+        :param scheduler_state: Final state of the scheduler
+        :param profile: Benchmark profile configuration
+        :param requests: Collection of requests executed
+        :param backend: Backend interface used for execution
+        :param environment: Execution environment configuration
+        :param strategy: Scheduling strategy used
+        :param constraints: Execution constraints applied
+        :param data: Dataset sources used for the run, recorded in benchmarker info
+        :return: Compiled generative benchmark instance
+        """
+        return GenerativeBenchmark(
+            run_id=args.run_id,
+            run_index=args.run_index,
+            scheduler=SchedulerDict(
+                strategy=strategy,
+                constraints={
+                    key: InfoMixin.extract_from_obj(val)
+                    for key, val in constraints.items()
+                },
+                state=scheduler_state,
+            ),
+            benchmarker=BenchmarkerDict(
+                profile=profile,
+                requests={"data": data},
+                backend=backend.info,
+                environment=environment.info,
+            ),
+            run_stats=BenchmarkSchedulerStats.compile(estimated_state, scheduler_state),
+            start_time=scheduler_state.start_time or -1.0,
+            end_time=scheduler_state.end_time or -1.0,
+            metrics=GenerativeMetrics.compile(
+                completed=estimated_state.get("requests_completed", []),
+                errored=estimated_state.get("requests_errored", []),
+                incomplete=estimated_state.get("requests_incomplete", []),
+            ),
+            request_totals=StatusBreakdown[int, int, int, int](
+                successful=len(estimated_state.get("requests_completed", [])),
+                incomplete=len(estimated_state.get("requests_incomplete", [])),
+                errored=len(estimated_state.get("requests_errored", [])),
+                total=(
+                    len(estimated_state.get("requests_completed", []))
+                    + len(estimated_state.get("requests_incomplete", []))
+                    + len(estimated_state.get("requests_errored", []))
+                ),
+            ),
+            requests=StatusBreakdown[
+                list[GenerativeRequestStats],
+                list[GenerativeRequestStats],
+                list[GenerativeRequestStats],
+                None,
+            ](
+                successful=estimated_state.get("requests_completed", []),
+                incomplete=estimated_state.get("requests_incomplete", []),
+                errored=estimated_state.get("requests_errored", []),
+                total=None,
+            ),
+        )
+
+
+class BenchmarkGenerativeTextArgs(StandardBaseModel):
+    """
+    Configuration arguments for generative text benchmark execution.
+
+    Defines all parameters for benchmark setup including target endpoint, data
+    sources, backend configuration, processing pipeline, output formatting, and
+    execution constraints. Supports loading from scenario files and merging with
+    runtime overrides.
+    """
+
+    @classmethod
+    def create(
+        cls, scenario: Path | str | None, **kwargs: Any
+    ) -> BenchmarkGenerativeTextArgs:
+        """
+        Create benchmark args from a scenario file and/or keyword arguments.
+
+        :param scenario: Path to a scenario file or name of a built-in scenario
+        :param kwargs: Additional keyword arguments that override scenario values
+        :return: Configured benchmark args instance
+        :raises ValueError: If the scenario is not found or the file format is
+            unsupported
+        """
+        constructor_kwargs = {}
+
+        if scenario is not None:
+            if isinstance(scenario, str) and scenario in (
+                builtin_scenarios := get_builtin_scenarios()
+            ):
+                scenario_path = builtin_scenarios[scenario]
+            elif Path(scenario).exists() and Path(scenario).is_file():
+                scenario_path = Path(scenario)
+            else:
+                raise ValueError(f"Scenario '{scenario}' not found.")
+
+            with scenario_path.open() as file:
+                if scenario_path.suffix == ".json":
+                    scenario_data = json.load(file)
+                elif scenario_path.suffix in {".yaml", ".yml"}:
+                    scenario_data = yaml.safe_load(file)
+                else:
+                    raise ValueError(
+                        f"Unsupported scenario file format: {scenario_path.suffix}"
+                    )
+            if "args" in scenario_data:
+                # Report files embed the original args under an "args" key
+                scenario_data = scenario_data["args"]
+            constructor_kwargs.update(scenario_data)
+
+        # Only apply overrides that differ from the field defaults
+        for key, value in kwargs.items():
+            if value != cls.get_default(key):
+                constructor_kwargs[key] = value
+
+        return cls.model_validate(constructor_kwargs)
+
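A usage sketch for create: the chat built-in ships with the package (guidellm/benchmark/scenarios/chat.json); the target and constraint values below are illustrative, not defaults.

    args = BenchmarkGenerativeTextArgs.create(
        scenario="chat",                 # built-in name or a JSON/YAML path
        target="http://localhost:8000",  # example endpoint
        max_seconds=120,                 # kept because it differs from the default
    )

Overrides merge per field: a kwarg equal to its field default is dropped, so scenario values win unless a flag was explicitly changed.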
+    @classmethod
+    def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any:
+        """
+        Get the default value for a model field.
+
+        :param field: Name of the field to retrieve the default for
+        :return: Default value for the specified field
+        :raises ValueError: If the field is not found in the model
+        """
+        if field not in BenchmarkGenerativeTextArgs.model_fields:
+            raise ValueError(
+                f"Field '{field}' not found in BenchmarkGenerativeTextArgs"
+            )
+
+        field_info = BenchmarkGenerativeTextArgs.model_fields[field]
+        factory = field_info.default_factory
+
+        if factory is None:
+            return field_info.default
+
+        if len(inspect.signature(factory).parameters) == 0:
+            return factory()  # type: ignore[call-arg]  # Confirmed correct at runtime above
+        else:
+            return factory({})  # type: ignore[call-arg]  # Confirmed correct at runtime above
+
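Given the field declarations that follow, the expected behavior is, for example:

    BenchmarkGenerativeTextArgs.get_default("random_seed")     # 42 (plain default)
    BenchmarkGenerativeTextArgs.get_default("output_formats")  # ["console", "json"] (zero-arg factory)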
+    model_config = ConfigDict(
+        extra="ignore",
+        use_enum_values=True,
+        from_attributes=True,
+        arbitrary_types_allowed=True,
+    )
+
+    # Required
+    target: str = Field(description="Target endpoint URL for benchmark execution")
+    data: list[Any] = Field(
+        description="List of dataset sources or data files",
+        default_factory=list,
+        min_length=1,
+    )
+    # Benchmark configuration
+    profile: StrategyType | ProfileType | Profile = Field(
+        default="sweep", description="Benchmark profile or scheduling strategy type"
+    )
+    rate: float | list[float] | None = Field(
+        default=None, description="Request rate(s) for rate-based scheduling"
+    )
+    # Backend configuration
+    backend: BackendType | Backend = Field(
+        default="openai_http", description="Backend type or instance for execution"
+    )
+    backend_kwargs: dict[str, Any] | None = Field(
+        default=None, description="Additional backend configuration arguments"
+    )
+    model: str | None = Field(default=None, description="Model identifier for backend")
+    # Data configuration
+    processor: str | Path | PreTrainedTokenizerBase | None = Field(
+        default=None, description="Tokenizer path, name, or instance for processing"
+    )
+    processor_args: dict[str, Any] | None = Field(
+        default=None, description="Additional tokenizer configuration arguments"
+    )
+    data_args: list[dict[str, Any]] | None = Field(
+        default_factory=list, description="Per-dataset configuration arguments"
+    )
+    data_samples: int = Field(
+        default=-1, description="Number of samples to use from datasets (-1 for all)"
+    )
+    data_column_mapper: (
+        DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"]
+    ) = Field(
+        default="generative_column_mapper",
+        description="Column mapping preprocessor for dataset fields",
+    )
+    data_request_formatter: DatasetPreprocessor | dict[str, str] | str = Field(
+        default="chat_completions",
+        description="Request formatting preprocessor or template name",
+    )
+    data_collator: Callable | Literal["generative"] | None = Field(
+        default="generative", description="Data collator for batch processing"
+    )
+    data_sampler: Sampler[int] | Literal["shuffle"] | None = Field(
+        default=None, description="Data sampler for request ordering"
+    )
+    data_num_workers: int | None = Field(
+        default=None, description="Number of workers for data loading"
+    )
+    dataloader_kwargs: dict[str, Any] | None = Field(
+        default=None, description="Additional dataloader configuration arguments"
+    )
+    random_seed: int = Field(default=42, description="Random seed for reproducibility")
+    # Output configuration
+    output_path: str | Path | None = Field(
+        default_factory=Path.cwd, description="Directory path for output files"
+    )
+    output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field(
+        default_factory=lambda: ["console", "json"],
+        description="Output format names or configuration mappings",
+    )
+    # Benchmarker configuration
+    benchmark_cls: type[GenerativeBenchmark] = Field(
+        default=GenerativeBenchmark,
+        description="Benchmark class to use for result compilation",
+    )
+    sample_requests: int | None = Field(
+        default=10,
+        description="Number of requests to sample for detailed metrics (None for all)",
+    )
+    warmup: float | None = Field(
+        default=None,
+        description="Warmup period in seconds, requests, or fraction (0-1)",
+    )
+    cooldown: float | None = Field(
+        default=None,
+        description="Cooldown period in seconds, requests, or fraction (0-1)",
+    )
+    prefer_response_metrics: bool = Field(
+        default=True,
+        description="Whether to prefer backend response metrics over request metrics",
+    )
+    # Constraints configuration
+    max_seconds: int | float | None = Field(
+        default=None, description="Maximum benchmark execution time in seconds"
+    )
+    max_requests: int | None = Field(
+        default=None, description="Maximum number of requests to execute"
+    )
+    max_errors: int | None = Field(
+        default=None, description="Maximum number of errors before stopping"
+    )
+    max_error_rate: float | None = Field(
+        default=None, description="Maximum error rate (0-1) before stopping"
+    )
+    max_global_error_rate: float | None = Field(
+        default=None, description="Maximum global error rate (0-1) before stopping"
+    )
+
+    @model_serializer
+    def serialize_model(self):
+        """
+        Custom serialization logic for benchmark args.
+
+        Converts complex types to serializable formats: Profile and Backend
+        instances collapse to their type strings, and Path objects become
+        strings.
+
+        :return: Dictionary representation suitable for JSON/YAML serialization
+        """
+        return {
+            # target - serialize as is
+            "target": self.target,
+            "data": [
+                item if isinstance(item, str | type(None)) else str(item)
+                for item in self.data
+            ],  # data - save each non-str, non-None item as str(item)
+            "profile": (
+                self.profile.type_
+                if isinstance(self.profile, Profile)
+                else self.profile
+            ),  # profile - if an instance of Profile, save profile.type_
+            "rate": self.rate,
+            "backend": (
+                self.backend.type_
+                if isinstance(self.backend, Backend)
+                else self.backend
+            ),  # backend - if an instance of Backend, save backend.type_
+            "backend_kwargs": self.backend_kwargs,
+            "model": self.model,
+            "processor": (
+                self.processor
+                if isinstance(self.processor, str)
+                else str(self.processor)
+                if self.processor is not None
+                else None
+            ),  # processor - if not a str, save as str(processor)
+            "processor_args": self.processor_args,
+            "data_args": self.data_args,
+            "data_samples": self.data_samples,
+            "data_column_mapper": (
+                self.data_column_mapper
+                if isinstance(self.data_column_mapper, dict | str)
+                else {}
+            ),  # data_column_mapper - if not a dict or str, save an empty dict
+            "data_request_formatter": (
+                self.data_request_formatter
+                if isinstance(self.data_request_formatter, dict | str)
+                else {}
+            ),  # data_request_formatter - if not a dict or str, save an empty dict
+            "data_collator": (
+                self.data_collator if isinstance(self.data_collator, str) else None
+            ),  # data_collator - if not a str, save as None
+            "data_sampler": (
+                self.data_sampler if isinstance(self.data_sampler, str) else None
+            ),  # data_sampler - if not a str, save as None
+            "data_num_workers": self.data_num_workers,
+            "dataloader_kwargs": self.dataloader_kwargs,
+            "random_seed": self.random_seed,
+            "output_path": (
+                str(self.output_path) if self.output_path is not None else None
+            ),  # output_path - if not None, ensure it's a str
+            "output_formats": self.output_formats,
+            # benchmark_cls - not serialized (excluded)
+            "sample_requests": self.sample_requests,
+            "warmup": self.warmup,
+            "cooldown": self.cooldown,
+            "prefer_response_metrics": self.prefer_response_metrics,
+            "max_seconds": self.max_seconds,
+            "max_requests": self.max_requests,
+            "max_errors": self.max_errors,
+            "max_error_rate": self.max_error_rate,
+            "max_global_error_rate": self.max_global_error_rate,
+        }
+
+
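Because every complex field collapses to a str, dict, list, or None, the serialized form is JSON- and YAML-safe without custom encoders, and it validates back into an args instance (benchmark_cls is omitted and simply falls back to its default). A round-trip sketch, reusing the args instance from the earlier example:

    dumped = args.model_dump()  # routed through serialize_model above
    restored = BenchmarkGenerativeTextArgs.model_validate(dumped)
    assert restored.target == args.target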
+class GenerativeBenchmarksReport(StandardBaseModel):
+    """Container for multiple benchmark results with load/save functionality."""
+
+    DEFAULT_FILE: ClassVar[str] = "benchmarks.json"
+
+    @staticmethod
+    def load_file(
+        path: str | Path, type_: Literal["json", "yaml"] | None = None
+    ) -> GenerativeBenchmarksReport:
+        """
+        Load a report from a file.
+
+        :param path: The path to load the report from.
+        :param type_: File type override, auto-detected from extension if None.
+        :return: The loaded report.
+        :raises ValueError: If the file type is unsupported.
+        """
+        path = Path(path) if not isinstance(path, Path) else path
+
+        if path.is_dir():
+            path = path / GenerativeBenchmarksReport.DEFAULT_FILE
+
+        path_suffix = path.suffix.lower()[1:]
+
+        with path.open("r") as file:
+            if (type_ or path_suffix) == "json":
+                model_dict = json.loads(file.read())
+            elif (type_ or path_suffix) in ["yaml", "yml"]:
+                model_dict = yaml.safe_load(file)
+            else:
+                raise ValueError(f"Unsupported file type: {type_} for {path}.")
+
+        return GenerativeBenchmarksReport.model_validate(model_dict)
+
+    args: BenchmarkGenerativeTextArgs = Field(
+        description="The benchmark arguments used for all benchmarks in the report."
+    )
+    benchmarks: list[GenerativeBenchmark] = Field(
+        description="The list of completed benchmarks contained within the report.",
+        default_factory=list,
+    )
+
+    def save_file(
+        self, path: str | Path | None, type_: Literal["json", "yaml"] | None = None
+    ) -> Path:
+        """
+        Save the report to a file.
+
+        :param path: The path to save the report to; defaults to the current
+            working directory if None.
+        :param type_: File type override, auto-detected from extension if None.
+        :return: The path to the saved report.
+        :raises ValueError: If the file type is unsupported.
+        """
+        if path is None:
+            path = Path.cwd()
+        elif not isinstance(path, Path):
+            path = Path(path)
+
+        if path.is_dir():
+            path = path / GenerativeBenchmarksReport.DEFAULT_FILE
+
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path_suffix = path.suffix.lower()[1:]
+        model_dict = self.model_dump()
+
+        if (type_ or path_suffix) == "json":
+            save_str = json.dumps(model_dict)
+        elif (type_ or path_suffix) in ["yaml", "yml"]:
+            save_str = yaml.dump(model_dict)
+        else:
+            raise ValueError(f"Unsupported file type: {type_} for {path}.")
+
+        with path.open("w") as file:
+            file.write(save_str)
+
+        return path
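save_file and load_file give a symmetric persistence path: a directory resolves to DEFAULT_FILE, and the format follows the file extension unless type_ overrides it. A usage sketch with an illustrative path, again reusing args from above:

    report = GenerativeBenchmarksReport(args=args)       # benchmarks defaults to []
    saved = report.save_file("results/benchmarks.json")  # parent directory is created
    loaded = GenerativeBenchmarksReport.load_file(saved)
    assert loaded.args.target == args.target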