guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,40 @@
1
+ """
2
+ Builtin benchmark scenario definitions and discovery utilities.
3
+
4
+ This module provides access to predefined benchmark scenarios stored as JSON files
5
+ within the scenarios directory. It enables discovery and retrieval of builtin
6
+ scenarios by name or filename, supporting both stem names (without extension) and
7
+ full filenames for flexible scenario loading.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from functools import cache
13
+ from pathlib import Path
14
+ from typing import Annotated
15
+
16
+ __all__ = ["SCENARIO_DIR", "get_builtin_scenarios"]
17
+
18
+ SCENARIO_DIR: Annotated[
19
+ Path,
20
+ "Directory path containing builtin scenario JSON files",
21
+ ] = Path(__file__).parent
22
+
23
+
24
@cache
def get_builtin_scenarios() -> dict[str, Path]:
    """
    Discover the builtin scenario files located in the scenarios directory.

    Every ``*.json`` file found directly inside ``SCENARIO_DIR`` is registered
    twice: once under its stem (filename without extension) and once under its
    full filename, so either spelling can be used for lookup. The function is
    wrapped in ``functools.cache``, so the directory is scanned on the first call
    only and the same dictionary object is handed back on later calls.

    :return: Dictionary mapping scenario names and filenames to their Path objects
    """
    # For each file the stem key is inserted before the filename key, matching
    # the insertion order of an explicit loop over the glob results.
    return {
        lookup_key: scenario_path
        for scenario_path in SCENARIO_DIR.glob("*.json")
        for lookup_key in (scenario_path.stem, scenario_path.name)
    }
@@ -0,0 +1,6 @@
1
+ {
2
+ "profile": "sweep",
3
+ "data": [
4
+ "prompt_tokens=512,prompt_tokens_stdev=128,prompt_tokens_min=1,prompt_tokens_max=1024,output_tokens=256,output_tokens_stdev=64,output_tokens_min=1,output_tokens_max=1024"
5
+ ]
6
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "profile": "sweep",
3
+ "data": [
4
+ "prompt_tokens=4096,prompt_tokens_stdev=512,prompt_tokens_min=2048,prompt_tokens_max=6144,output_tokens=512,output_tokens_stdev=128,output_tokens_min=1,output_tokens_max=1024"
5
+ ]
6
+ }
@@ -0,0 +1,66 @@
1
+ """
2
+ Benchmark schemas for performance measurement and result analysis.
3
+
4
+ This module consolidates the complete benchmark schema ecosystem, providing both
5
+ base abstractions for benchmark execution and domain-specific implementations
6
+ for generative AI tasks. It exports core configuration objects, accumulator
7
+ interfaces for real-time metric collection, benchmark result containers with
8
+ statistical summaries, and reporting utilities. The schemas support flexible
9
+ scheduling strategies, comprehensive metric tracking including latency and
10
+ throughput distributions, and multi-modal generative benchmarks for text, image,
11
+ video, and audio generation tasks.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from .base import (
17
+ Benchmark,
18
+ BenchmarkAccumulator,
19
+ BenchmarkAccumulatorT,
20
+ BenchmarkConfig,
21
+ BenchmarkT,
22
+ )
23
+ from .generative import (
24
+ BenchmarkGenerativeTextArgs,
25
+ GenerativeAudioMetricsSummary,
26
+ GenerativeBenchmark,
27
+ GenerativeBenchmarkAccumulator,
28
+ GenerativeBenchmarkMetadata,
29
+ GenerativeBenchmarksReport,
30
+ GenerativeBenchmarkTimings,
31
+ GenerativeImageMetricsSummary,
32
+ GenerativeMetrics,
33
+ GenerativeMetricsAccumulator,
34
+ GenerativeMetricsSummary,
35
+ GenerativeRequestsAccumulator,
36
+ GenerativeTextMetricsSummary,
37
+ GenerativeVideoMetricsSummary,
38
+ RunningMetricStats,
39
+ SchedulerMetrics,
40
+ SchedulerMetricsAccumulator,
41
+ )
42
+
43
+ __all__ = [
44
+ "Benchmark",
45
+ "BenchmarkAccumulator",
46
+ "BenchmarkAccumulatorT",
47
+ "BenchmarkConfig",
48
+ "BenchmarkGenerativeTextArgs",
49
+ "BenchmarkT",
50
+ "GenerativeAudioMetricsSummary",
51
+ "GenerativeBenchmark",
52
+ "GenerativeBenchmarkAccumulator",
53
+ "GenerativeBenchmarkMetadata",
54
+ "GenerativeBenchmarkTimings",
55
+ "GenerativeBenchmarksReport",
56
+ "GenerativeImageMetricsSummary",
57
+ "GenerativeMetrics",
58
+ "GenerativeMetricsAccumulator",
59
+ "GenerativeMetricsSummary",
60
+ "GenerativeRequestsAccumulator",
61
+ "GenerativeTextMetricsSummary",
62
+ "GenerativeVideoMetricsSummary",
63
+ "RunningMetricStats",
64
+ "SchedulerMetrics",
65
+ "SchedulerMetricsAccumulator",
66
+ ]
@@ -0,0 +1,402 @@
1
+ """
2
+ Base schemas for benchmark execution, metric accumulation, and result compilation.
3
+
4
+ Defines abstract interfaces and configuration models for coordinating benchmark
5
+ execution with schedulers. The module centers around three key abstractions:
6
+ BenchmarkConfig encapsulates execution parameters and constraints; BenchmarkAccumulator
7
+ tracks incremental metrics during scheduler runs; and Benchmark compiles final results
8
+ with comprehensive latency, throughput, and concurrency distributions. Supports
9
+ configurable warmup/cooldown phases, transient period handling, and flexible metric
10
+ sampling strategies.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import uuid
16
+ from abc import ABC, abstractmethod
17
+ from typing import Any, Generic, Literal, TypeVar
18
+
19
+ from pydantic import Field, NonNegativeFloat, NonNegativeInt
20
+
21
+ from guidellm.benchmark.profiles import Profile
22
+ from guidellm.scheduler import (
23
+ MultiTurnRequestT,
24
+ RequestT,
25
+ ResponseT,
26
+ SchedulerState,
27
+ SchedulingStrategy,
28
+ )
29
+ from guidellm.schemas import (
30
+ RequestInfo,
31
+ StandardBaseDict,
32
+ StandardBaseModel,
33
+ StatusDistributionSummary,
34
+ )
35
+
36
+ __all__ = [
37
+ "Benchmark",
38
+ "BenchmarkAccumulator",
39
+ "BenchmarkAccumulatorT",
40
+ "BenchmarkConfig",
41
+ "BenchmarkT",
42
+ ]
43
+
44
# Bounds are written as string forward references because the classes they
# name are declared further down in this module.
BenchmarkAccumulatorT = TypeVar(
    "BenchmarkAccumulatorT",
    bound="BenchmarkAccumulator[Any, Any]",
)
"Generic type variable for benchmark accumulator implementations"

BenchmarkT = TypeVar(
    "BenchmarkT",
    bound="Benchmark",
)
"Generic type variable for benchmark result implementations"
51
+
52
+
53
class TransientPhaseConfig(StandardBaseModel):
    """
    Configure warmup and cooldown phases for benchmark execution.

    Supports flexible phase definition through percentage or absolute value
    specifications with multiple interpretation modes. Phases can be bounded
    by duration, request count, or both, enabling precise control over transient
    periods that should be excluded from final benchmark metrics.
    """

    @classmethod
    def create_from_value(
        cls, value: int | float | dict | TransientPhaseConfig | None
    ) -> TransientPhaseConfig:
        """
        Create configuration from flexible input formats.

        :param value: Configuration as int/float (percent if <1.0, absolute
            otherwise), dict (validated to model), TransientPhaseConfig instance,
            or None for defaults
        :return: Configured TransientPhaseConfig instance
        :raises ValueError: If value type is unsupported
        """
        if value is None:
            return TransientPhaseConfig()

        if isinstance(value, TransientPhaseConfig):
            # Already a config instance; returned as-is (not copied)
            return value

        if isinstance(value, dict):
            return TransientPhaseConfig.model_validate(value)

        if isinstance(value, int | float):
            # Numbers below 1.0 populate ``percent``; 1.0 and above populate
            # ``value``. Exactly one of the two keys is non-None.
            kwargs = {
                "percent": value if value < 1.0 else None,
                "value": value if value >= 1.0 else None,
            }
            return TransientPhaseConfig.model_validate(kwargs)

        raise ValueError(f"Unsupported type for TransientPhaseConfig: {type(value)}")

    percent: NonNegativeFloat | None = Field(
        default=None,
        description=(
            "Phase size as percentage (0.0-1.0) of total duration/requests; "
            "interpretation depends on mode. Takes precedence over value when target "
            "mode is available, otherwise falls back to value"
        ),
        lt=1.0,
    )
    value: NonNegativeInt | NonNegativeFloat | None = Field(
        default=None,
        description=(
            "Phase size as absolute duration (seconds) or request count; "
            "interpretation depends on mode. Used when percent is unset or "
            "target mode unavailable"
        ),
    )
    mode: Literal[
        "duration", "requests", "prefer_duration", "prefer_requests", "both"
    ] = Field(
        default="prefer_duration",
        description=(
            "Interpretation mode: 'duration' for time-based phases, 'requests' for "
            "count-based phases, 'prefer_duration'/'prefer_requests' for fallback "
            "behavior, 'both' requires satisfying both conditions"
        ),
    )

    def compute_limits(
        self,
        max_requests: int | float | None,
        max_seconds: float | None,
        enforce_preference: bool = True,
    ) -> tuple[float | None, int | None]:
        """
        Calculate phase boundaries from benchmark constraints.

        A limit is only produced for a dimension when the mode allows it and the
        matching budget is known. Within a dimension, ``percent`` is used when set
        (scaled by the budget), otherwise ``value`` is used as an absolute amount.

        :param max_requests: Total request budget for benchmark execution
        :param max_seconds: Total duration budget for benchmark execution
        :param enforce_preference: Whether to enforce preferred mode when both
            duration and request constraints are available
        :return: Tuple of (phase duration in seconds, phase request count)
        """
        duration: float | None = None
        requests: int | None = None

        if self.mode != "requests" and max_seconds is not None:
            if self.percent is not None:
                duration = self.percent * max_seconds
            elif self.value is not None:
                duration = float(self.value)

        if self.mode != "duration" and max_requests is not None:
            if self.percent is not None:
                # Truncates toward zero when the scaled count is fractional
                requests = int(self.percent * max_requests)
            elif self.value is not None:
                requests = int(self.value)

        if enforce_preference:
            # In the "prefer_*" modes the preferred limit, when available,
            # suppresses the other one; otherwise the other acts as fallback.
            if self.mode == "prefer_duration" and duration is not None:
                requests = None
            elif self.mode == "prefer_requests" and requests is not None:
                duration = None

        return duration, requests

    def compute_transition_time(
        self, info: RequestInfo, state: SchedulerState, period: Literal["start", "end"]
    ) -> tuple[bool, float | None]:
        """
        Determine transition timestamp for entering or exiting phase.

        The result is ``(False, None)`` when no limit applies to this phase,
        ``(True, None)`` when a limit applies but its timestamp cannot be
        determined from the given request and state, and ``(True, timestamp)``
        when the transition timestamp is known.

        :param info: RequestInfo for current request to calculate against
        :param state: SchedulerState with current progress metrics and scheduler info
        :param period: Phase period, either "start" for warmup or "end" for cooldown
        :return: Tuple of (phase active flag, transition timestamp if applicable)
        """
        phase_duration, phase_requests = self.compute_limits(
            max_requests=state.progress.total_requests,
            max_seconds=state.progress.total_duration,
        )
        # Each value below is None when that limit is not configured and -1.0
        # (sentinel) when it is configured but not resolvable from this request.
        duration_transition_time: float | None = None
        request_transition_time: float | None = None

        # Calculate transition times for the phase based on phase limits and period
        # Potential phases: start (warmup) -> active -> end (cooldown)
        # Warmup transition times: (start, start + duration)
        # Active transition times: (start + duration, end - duration)
        # Cooldown transition times: (end - duration, end)
        if period == "start":
            if phase_duration is not None:
                # Duration was set and calculating for "warmup" / start phase
                # Phase is active for [start, start + duration]
                duration_transition_time = state.start_time + phase_duration
            if phase_requests is not None:
                # Requests was set and calculating for "warmup" / start phase
                # Phase is active for requests [0, phase_requests]
                # Grab start time of the next request as transition time
                # (all requests up to and including phase_requests are in warmup)
                request_transition_time = (
                    info.started_at
                    if info.started_at is not None
                    and state.processed_requests == phase_requests + 1
                    else -1.0
                )
        elif period == "end":
            if phase_duration is not None:
                # Duration was set and calculating for "cooldown" / end phase
                # Phase is active for [end - duration, end]
                duration_transition_time = (
                    state.start_time + state.progress.total_duration - phase_duration
                    if state.progress.total_duration is not None
                    else -1.0
                )
            if phase_requests is not None:
                # Requests was set and calculating for "cooldown" / end phase
                # Phase is active for requests [total - phase_requests, total]
                # Grab completion time of the request right before cooldown starts
                # (all requests from that point onward are in cooldown)
                request_transition_time = (
                    info.completed_at
                    if info.completed_at is not None
                    and state.progress.remaining_requests is not None
                    and state.progress.remaining_requests == phase_requests + 1
                    else -1.0
                )

        transition_active: bool = False
        transition_time: float | None = None

        if request_transition_time == -1.0 or duration_transition_time == -1.0:
            # Transition defined but not yet reached or passed
            transition_active = True
        elif (
            request_transition_time is not None and duration_transition_time is not None
        ):
            # Both limits defined; need to satisfy both (min for end, max for start)
            transition_active = True
            transition_time = (
                min(request_transition_time, duration_transition_time)
                if period == "end"
                else max(request_transition_time, duration_transition_time)
            )
        elif (
            request_transition_time is not None or duration_transition_time is not None
        ):
            # One limit defined; satisfy that one. Select with an explicit None
            # check rather than ``or`` so a request timestamp of 0.0 (falsy) is
            # not mistaken for "unset" and silently replaced by None.
            transition_active = True
            transition_time = (
                request_transition_time
                if request_transition_time is not None
                else duration_transition_time
            )

        return transition_active, transition_time
246
+
247
+
248
class BenchmarkConfig(StandardBaseDict):
    """
    Encapsulate execution parameters and constraints for benchmark runs.

    Defines comprehensive configuration including scheduler strategy, constraint
    sets, transient phase handling, metric sampling preferences, and execution
    metadata. Coordinates profile, request, backend, and environment configurations
    to enable reproducible benchmark execution with precise control over metric
    collection.
    """

    # --- Identity of this run within a benchmark series ---
    id_: str = Field(
        description="Unique identifier for this benchmark execution",
        default_factory=lambda: str(uuid.uuid4()),
    )
    run_id: str = Field(
        description="Identifier grouping related benchmark runs in a series",
    )
    run_index: int = Field(
        description="Zero-based index of this run within the benchmark series",
    )

    # --- Scheduling behaviour ---
    strategy: SchedulingStrategy = Field(
        description="Scheduler strategy controlling request execution patterns",
    )
    constraints: dict[str, dict[str, Any]] = Field(
        description="Constraint definitions applied to scheduler strategy execution",
    )

    # --- Metric collection preferences ---
    sample_requests: int | None = Field(
        description="Request count for statistical sampling in final metrics",
        default=20,
    )
    warmup: TransientPhaseConfig = Field(
        description="Warmup phase configuration excluding initial transient period",
        default_factory=TransientPhaseConfig,
    )
    cooldown: TransientPhaseConfig = Field(
        description="Cooldown phase configuration excluding final transient period",
        default_factory=TransientPhaseConfig,
    )
    prefer_response_metrics: bool = Field(
        description="Prioritize response-based metrics over request-based metrics",
        default=True,
    )

    # --- Surrounding context recorded alongside the run ---
    profile: Profile = Field(
        description="Profile instance coordinating multi-strategy execution",
    )
    requests: dict[str, Any] = Field(
        description="Request generation configuration and dataset metadata",
    )
    backend: dict[str, Any] = Field(
        description="Backend connection parameters and service configuration",
    )
    environment: dict[str, Any] = Field(
        description="Execution environment details and system metadata",
    )
303
+
304
+
305
class BenchmarkAccumulator(StandardBaseDict, ABC, Generic[RequestT, ResponseT]):
    """
    Track and accumulate benchmark metrics during scheduler execution.

    Maintains incremental metric estimates as requests are processed, enabling
    real-time progress monitoring and efficient metric compilation. Subclasses
    implement specific metric calculation strategies based on request/response
    characteristics and scheduler state evolution.
    """

    config: BenchmarkConfig = Field(
        description="Benchmark execution configuration and constraints",
    )

    @abstractmethod
    def update_estimate(
        self,
        response: ResponseT | None,
        request: RequestT | MultiTurnRequestT[RequestT],
        info: RequestInfo,
        scheduler_state: SchedulerState,
    ):
        """
        Fold one request update into the running metric estimates.

        Abstract hook: concrete accumulators must override it. The base
        implementation does nothing and returns None.

        :param response: Response produced by the backend, or None when no
            response is available for the request
        :param request: The request (or multi-turn request) that was submitted
        :param info: Timing, status, and execution metadata for the request
        :param scheduler_state: Scheduler state at the time of the update
        """
336
+
337
+
338
class Benchmark(StandardBaseDict, ABC, Generic[BenchmarkAccumulatorT]):
    """
    Compile and expose final benchmark execution metrics.

    Defines the interface for benchmark result implementations capturing
    comprehensive performance metrics including latency distributions, throughput
    measurements, and concurrency patterns. Subclasses implement compilation
    logic to transform accumulated metrics and scheduler state into structured
    results with statistical summaries.
    """

    # --- Timing (abstract; supplied by concrete benchmark classes) ---

    @property
    @abstractmethod
    def start_time(self) -> float:
        """
        :return: Timestamp at which the benchmark started, in seconds since epoch
        """

    @property
    @abstractmethod
    def end_time(self) -> float:
        """
        :return: Timestamp at which the benchmark completed, in seconds since epoch
        """

    @property
    @abstractmethod
    def duration(self) -> float:
        """
        :return: Length of the benchmark execution in seconds
        """

    # --- Distribution summaries (abstract) ---

    @property
    @abstractmethod
    def request_latency(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of request latencies
        """

    @property
    @abstractmethod
    def request_throughput(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of throughput measurements
        """

    @property
    @abstractmethod
    def request_concurrency(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of concurrent request counts
        """

    # --- Construction ---

    @classmethod
    @abstractmethod
    def compile(
        cls, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState
    ) -> Any:
        """
        Build the final benchmark result from accumulated metrics.

        Abstract hook: concrete benchmark classes must override it. The base
        implementation does nothing and returns None.

        :param accumulator: Accumulator holding the metrics and state collected
            during execution
        :param scheduler_state: Scheduler state after execution has completed
        :return: Compiled benchmark instance with complete statistical results
        """
@@ -0,0 +1,55 @@
1
+ """
2
+ Generative AI benchmark schemas for performance measurement and analysis.
3
+
4
+ This module provides the complete schema ecosystem for executing, tracking, and
5
+ analyzing generative AI benchmarks. It encompasses configuration entrypoints for
6
+ benchmark setup, real-time metric accumulators for execution monitoring,
7
+ comprehensive result containers with statistical summaries, and multi-benchmark
8
+ reporting capabilities. The schemas support domain-specific metrics for text,
9
+ image, video, and audio generation tasks, enabling detailed performance analysis
10
+ including throughput, latency distributions, concurrency patterns, and scheduler
11
+ behavior tracking across successful, incomplete, and errored requests.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from .accumulator import (
17
+ GenerativeBenchmarkAccumulator,
18
+ GenerativeBenchmarkTimings,
19
+ GenerativeMetricsAccumulator,
20
+ GenerativeRequestsAccumulator,
21
+ RunningMetricStats,
22
+ SchedulerMetricsAccumulator,
23
+ )
24
+ from .benchmark import GenerativeBenchmark
25
+ from .entrypoints import BenchmarkGenerativeTextArgs
26
+ from .metrics import (
27
+ GenerativeAudioMetricsSummary,
28
+ GenerativeImageMetricsSummary,
29
+ GenerativeMetrics,
30
+ GenerativeMetricsSummary,
31
+ GenerativeTextMetricsSummary,
32
+ GenerativeVideoMetricsSummary,
33
+ SchedulerMetrics,
34
+ )
35
+ from .report import GenerativeBenchmarkMetadata, GenerativeBenchmarksReport
36
+
37
+ __all__ = [
38
+ "BenchmarkGenerativeTextArgs",
39
+ "GenerativeAudioMetricsSummary",
40
+ "GenerativeBenchmark",
41
+ "GenerativeBenchmarkAccumulator",
42
+ "GenerativeBenchmarkMetadata",
43
+ "GenerativeBenchmarkTimings",
44
+ "GenerativeBenchmarksReport",
45
+ "GenerativeImageMetricsSummary",
46
+ "GenerativeMetrics",
47
+ "GenerativeMetricsAccumulator",
48
+ "GenerativeMetricsSummary",
49
+ "GenerativeRequestsAccumulator",
50
+ "GenerativeTextMetricsSummary",
51
+ "GenerativeVideoMetricsSummary",
52
+ "RunningMetricStats",
53
+ "SchedulerMetrics",
54
+ "SchedulerMetricsAccumulator",
55
+ ]