guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Builtin benchmark scenario definitions and discovery utilities.
|
|
3
|
+
|
|
4
|
+
This module provides access to predefined benchmark scenarios stored as JSON files
|
|
5
|
+
within the scenarios directory. It enables discovery and retrieval of builtin
|
|
6
|
+
scenarios by name or filename, supporting both stem names (without extension) and
|
|
7
|
+
full filenames for flexible scenario loading.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from functools import cache
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Annotated
|
|
15
|
+
|
|
16
|
+
__all__ = ["SCENARIO_DIR", "get_builtin_scenarios"]
|
|
17
|
+
|
|
18
|
+
SCENARIO_DIR: Annotated[
|
|
19
|
+
Path,
|
|
20
|
+
"Directory path containing builtin scenario JSON files",
|
|
21
|
+
] = Path(__file__).parent
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@cache
def get_builtin_scenarios() -> dict[str, Path]:
    """
    Discover the builtin scenario files shipped in the scenarios directory.

    Every ``*.json`` file directly inside ``SCENARIO_DIR`` is registered under
    two keys: its stem (filename without extension) and its full filename. The
    result is memoized by ``functools.cache``, so the directory is scanned once
    and the same dictionary object is handed back on later calls.

    :return: Dictionary mapping scenario names and filenames to their Path objects
    """
    return {
        lookup_key: scenario_path
        for scenario_path in SCENARIO_DIR.glob("*.json")
        # Stem first, then full filename, for each discovered file
        for lookup_key in (scenario_path.stem, scenario_path.name)
    }
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Benchmark schemas for performance measurement and result analysis.
|
|
3
|
+
|
|
4
|
+
This module consolidates the complete benchmark schema ecosystem, providing both
|
|
5
|
+
base abstractions for benchmark execution and domain-specific implementations
|
|
6
|
+
for generative AI tasks. It exports core configuration objects, accumulator
|
|
7
|
+
interfaces for real-time metric collection, benchmark result containers with
|
|
8
|
+
statistical summaries, and reporting utilities. The schemas support flexible
|
|
9
|
+
scheduling strategies, comprehensive metric tracking including latency and
|
|
10
|
+
throughput distributions, and multi-modal generative benchmarks for text, image,
|
|
11
|
+
video, and audio generation tasks.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from .base import (
|
|
17
|
+
Benchmark,
|
|
18
|
+
BenchmarkAccumulator,
|
|
19
|
+
BenchmarkAccumulatorT,
|
|
20
|
+
BenchmarkConfig,
|
|
21
|
+
BenchmarkT,
|
|
22
|
+
)
|
|
23
|
+
from .generative import (
|
|
24
|
+
BenchmarkGenerativeTextArgs,
|
|
25
|
+
GenerativeAudioMetricsSummary,
|
|
26
|
+
GenerativeBenchmark,
|
|
27
|
+
GenerativeBenchmarkAccumulator,
|
|
28
|
+
GenerativeBenchmarkMetadata,
|
|
29
|
+
GenerativeBenchmarksReport,
|
|
30
|
+
GenerativeBenchmarkTimings,
|
|
31
|
+
GenerativeImageMetricsSummary,
|
|
32
|
+
GenerativeMetrics,
|
|
33
|
+
GenerativeMetricsAccumulator,
|
|
34
|
+
GenerativeMetricsSummary,
|
|
35
|
+
GenerativeRequestsAccumulator,
|
|
36
|
+
GenerativeTextMetricsSummary,
|
|
37
|
+
GenerativeVideoMetricsSummary,
|
|
38
|
+
RunningMetricStats,
|
|
39
|
+
SchedulerMetrics,
|
|
40
|
+
SchedulerMetricsAccumulator,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
__all__ = [
|
|
44
|
+
"Benchmark",
|
|
45
|
+
"BenchmarkAccumulator",
|
|
46
|
+
"BenchmarkAccumulatorT",
|
|
47
|
+
"BenchmarkConfig",
|
|
48
|
+
"BenchmarkGenerativeTextArgs",
|
|
49
|
+
"BenchmarkT",
|
|
50
|
+
"GenerativeAudioMetricsSummary",
|
|
51
|
+
"GenerativeBenchmark",
|
|
52
|
+
"GenerativeBenchmarkAccumulator",
|
|
53
|
+
"GenerativeBenchmarkMetadata",
|
|
54
|
+
"GenerativeBenchmarkTimings",
|
|
55
|
+
"GenerativeBenchmarksReport",
|
|
56
|
+
"GenerativeImageMetricsSummary",
|
|
57
|
+
"GenerativeMetrics",
|
|
58
|
+
"GenerativeMetricsAccumulator",
|
|
59
|
+
"GenerativeMetricsSummary",
|
|
60
|
+
"GenerativeRequestsAccumulator",
|
|
61
|
+
"GenerativeTextMetricsSummary",
|
|
62
|
+
"GenerativeVideoMetricsSummary",
|
|
63
|
+
"RunningMetricStats",
|
|
64
|
+
"SchedulerMetrics",
|
|
65
|
+
"SchedulerMetricsAccumulator",
|
|
66
|
+
]
|
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base schemas for benchmark execution, metric accumulation, and result compilation.
|
|
3
|
+
|
|
4
|
+
Defines abstract interfaces and configuration models for coordinating benchmark
|
|
5
|
+
execution with schedulers. The module centers around three key abstractions:
|
|
6
|
+
BenchmarkConfig encapsulates execution parameters and constraints; BenchmarkAccumulator
|
|
7
|
+
tracks incremental metrics during scheduler runs; and Benchmark compiles final results
|
|
8
|
+
with comprehensive latency, throughput, and concurrency distributions. Supports
|
|
9
|
+
configurable warmup/cooldown phases, transient period handling, and flexible metric
|
|
10
|
+
sampling strategies.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import uuid
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from typing import Any, Generic, Literal, TypeVar
|
|
18
|
+
|
|
19
|
+
from pydantic import Field, NonNegativeFloat, NonNegativeInt
|
|
20
|
+
|
|
21
|
+
from guidellm.benchmark.profiles import Profile
|
|
22
|
+
from guidellm.scheduler import (
|
|
23
|
+
MultiTurnRequestT,
|
|
24
|
+
RequestT,
|
|
25
|
+
ResponseT,
|
|
26
|
+
SchedulerState,
|
|
27
|
+
SchedulingStrategy,
|
|
28
|
+
)
|
|
29
|
+
from guidellm.schemas import (
|
|
30
|
+
RequestInfo,
|
|
31
|
+
StandardBaseDict,
|
|
32
|
+
StandardBaseModel,
|
|
33
|
+
StatusDistributionSummary,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"Benchmark",
|
|
38
|
+
"BenchmarkAccumulator",
|
|
39
|
+
"BenchmarkAccumulatorT",
|
|
40
|
+
"BenchmarkConfig",
|
|
41
|
+
"BenchmarkT",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
BenchmarkAccumulatorT = TypeVar(
|
|
45
|
+
"BenchmarkAccumulatorT", bound="BenchmarkAccumulator[Any, Any]"
|
|
46
|
+
)
|
|
47
|
+
"Generic type variable for benchmark accumulator implementations"
|
|
48
|
+
|
|
49
|
+
BenchmarkT = TypeVar("BenchmarkT", bound="Benchmark")
|
|
50
|
+
"Generic type variable for benchmark result implementations"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class TransientPhaseConfig(StandardBaseModel):
    """
    Configure warmup and cooldown phases for benchmark execution.

    Supports flexible phase definition through percentage or absolute value
    specifications with multiple interpretation modes. Phases can be bounded
    by duration, request count, or both, enabling precise control over transient
    periods that should be excluded from final benchmark metrics.
    """

    @classmethod
    def create_from_value(
        cls, value: int | float | dict | TransientPhaseConfig | None
    ) -> TransientPhaseConfig:
        """
        Create configuration from flexible input formats.

        :param value: Configuration as int/float (percent if <1.0, absolute
            otherwise), dict (validated to model), TransientPhaseConfig instance
            (returned as-is), or None for defaults
        :return: Configured TransientPhaseConfig instance
        :raises ValueError: If value type is unsupported
        """
        if value is None:
            return TransientPhaseConfig()

        if isinstance(value, TransientPhaseConfig):
            return value

        if isinstance(value, dict):
            return TransientPhaseConfig.model_validate(value)

        if isinstance(value, int | float):
            # Numbers below 1.0 are a fraction of the run; 1.0 and above are
            # absolute seconds / request counts. Exactly one key is populated.
            kwargs = {
                "percent": value if value < 1.0 else None,
                "value": value if value >= 1.0 else None,
            }
            return TransientPhaseConfig.model_validate(kwargs)

        raise ValueError(f"Unsupported type for TransientPhaseConfig: {type(value)}")

    percent: NonNegativeFloat | None = Field(
        default=None,
        description=(
            "Phase size as percentage (0.0-1.0) of total duration/requests; "
            "interpretation depends on mode. Takes precedence over value when target "
            "mode is available, otherwise falls back to value"
        ),
        lt=1.0,
    )
    value: NonNegativeInt | NonNegativeFloat | None = Field(
        default=None,
        description=(
            "Phase size as absolute duration (seconds) or request count; "
            "interpretation depends on mode. Used when percent is unset or "
            "target mode unavailable"
        ),
    )
    mode: Literal[
        "duration", "requests", "prefer_duration", "prefer_requests", "both"
    ] = Field(
        default="prefer_duration",
        description=(
            "Interpretation mode: 'duration' for time-based phases, 'requests' for "
            "count-based phases, 'prefer_duration'/'prefer_requests' for fallback "
            "behavior, 'both' requires satisfying both conditions"
        ),
    )

    def compute_limits(
        self,
        max_requests: int | float | None,
        max_seconds: float | None,
        enforce_preference: bool = True,
    ) -> tuple[float | None, int | None]:
        """
        Calculate phase boundaries from benchmark constraints.

        A duration limit is only produced when the mode is not "requests" and
        max_seconds is given; a request limit is only produced when the mode is
        not "duration" and max_requests is given. In both cases percent is used
        when set, otherwise value.

        :param max_requests: Total request budget for benchmark execution
        :param max_seconds: Total duration budget for benchmark execution
        :param enforce_preference: Whether to enforce preferred mode when both
            duration and request constraints are available
        :return: Tuple of (phase duration in seconds, phase request count)
        """
        duration: float | None = None
        requests: int | None = None

        if self.mode != "requests" and max_seconds is not None:
            if self.percent is not None:
                duration = self.percent * max_seconds
            elif self.value is not None:
                duration = float(self.value)

        if self.mode != "duration" and max_requests is not None:
            if self.percent is not None:
                requests = int(self.percent * max_requests)
            elif self.value is not None:
                requests = int(self.value)

        if enforce_preference:
            # For the "prefer_*" modes keep only the preferred limit when it
            # could be computed; the other limit remains as the fallback.
            if self.mode == "prefer_duration" and duration is not None:
                requests = None
            elif self.mode == "prefer_requests" and requests is not None:
                duration = None

        return duration, requests

    def compute_transition_time(
        self, info: RequestInfo, state: SchedulerState, period: Literal["start", "end"]
    ) -> tuple[bool, float | None]:
        """
        Determine transition timestamp for entering or exiting phase.

        :param info: RequestInfo for current request to calculate against
        :param state: SchedulerState with current progress metrics and scheduler info
        :param period: Phase period, either "start" for warmup or "end" for cooldown
        :return: Tuple of (phase active flag, transition timestamp if applicable).
            The timestamp is None when no limit is configured, or when a limit is
            configured but its transition point is not available for this request
        """
        phase_duration, phase_requests = self.compute_limits(
            max_requests=state.progress.total_requests,
            max_seconds=state.progress.total_duration,
        )
        # Sentinel marking "limit is configured, but no transition time can be
        # derived from this request/state" (distinct from None = not configured)
        unresolved = -1.0
        duration_transition_time: float | None = None
        request_transition_time: float | None = None

        # Calculate transition times for the phase based on phase limits and period
        # Potential phases: start (warmup) -> active -> end (cooldown)
        # Warmup transition times: (start, start + duration)
        # Active transition times: (start + duration, end - duration)
        # Cooldown transition times: (end - duration, end)
        if period == "start":
            if phase_duration is not None:
                # Duration was set and calculating for "warmup" / start phase
                # Phase is active for [start, start + duration]
                duration_transition_time = state.start_time + phase_duration
            if phase_requests is not None:
                # Requests was set and calculating for "warmup" / start phase
                # Phase is active for requests [0, phase_requests]
                # Grab start time of the next request as transition time
                # (all requests up to and including phase_requests are in warmup)
                request_transition_time = (
                    info.started_at
                    if info.started_at is not None
                    and state.processed_requests == phase_requests + 1
                    else unresolved
                )
        elif period == "end":
            if phase_duration is not None:
                # Duration was set and calculating for "cooldown" / end phase
                # Phase is active for [end - duration, end]
                duration_transition_time = (
                    state.start_time + state.progress.total_duration - phase_duration
                    if state.progress.total_duration is not None
                    else unresolved
                )
            if phase_requests is not None:
                # Requests was set and calculating for "cooldown" / end phase
                # Phase is active for requests [total - phase_requests, total]
                # Grab completion time of the request right before cooldown starts
                # (all requests from that point onward are in cooldown)
                request_transition_time = (
                    info.completed_at
                    if info.completed_at is not None
                    and state.progress.remaining_requests is not None
                    and state.progress.remaining_requests == phase_requests + 1
                    else unresolved
                )

        if (
            request_transition_time == unresolved
            or duration_transition_time == unresolved
        ):
            # Transition defined but not yet reached or passed
            return True, None

        # Select with explicit None checks rather than truthiness so that a
        # legitimate timestamp of 0.0 is never mistaken for "not configured"
        defined_times = [
            candidate
            for candidate in (request_transition_time, duration_transition_time)
            if candidate is not None
        ]
        if not defined_times:
            # No limit configured for this phase
            return False, None

        if len(defined_times) == 1:
            # One limit defined; satisfy that one
            return True, defined_times[0]

        # Both limits defined; need to satisfy both (min for end, max for start)
        combine = min if period == "end" else max
        return True, combine(defined_times)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class BenchmarkConfig(StandardBaseDict):
    """
    Hold the execution parameters and constraints of a single benchmark run.

    Bundles the identifiers of the run, the scheduling strategy and its
    constraints, the warmup/cooldown transient phase settings, the request
    sampling preferences, and dictionaries describing the profile, requests,
    backend, and environment the run was executed with.
    """

    # --- Identification of this run within its series ---
    id_: str = Field(
        description="Unique identifier for this benchmark execution",
        default_factory=lambda: str(uuid.uuid4()),
    )
    run_id: str = Field(
        description="Identifier grouping related benchmark runs in a series"
    )
    run_index: int = Field(
        description="Zero-based index of this run within the benchmark series"
    )

    # --- Scheduling ---
    strategy: SchedulingStrategy = Field(
        description="Scheduler strategy controlling request execution patterns"
    )
    constraints: dict[str, dict[str, Any]] = Field(
        description="Constraint definitions applied to scheduler strategy execution"
    )

    # --- Metric collection preferences ---
    sample_requests: int | None = Field(
        description="Request count for statistical sampling in final metrics",
        default=20,
    )
    warmup: TransientPhaseConfig = Field(
        description="Warmup phase configuration excluding initial transient period",
        default_factory=TransientPhaseConfig,
    )
    cooldown: TransientPhaseConfig = Field(
        description="Cooldown phase configuration excluding final transient period",
        default_factory=TransientPhaseConfig,
    )
    prefer_response_metrics: bool = Field(
        description="Prioritize response-based metrics over request-based metrics",
        default=True,
    )

    # --- Descriptions of the components used for the run ---
    profile: Profile = Field(
        description="Profile instance coordinating multi-strategy execution"
    )
    requests: dict[str, Any] = Field(
        description="Request generation configuration and dataset metadata"
    )
    backend: dict[str, Any] = Field(
        description="Backend connection parameters and service configuration"
    )
    environment: dict[str, Any] = Field(
        description="Execution environment details and system metadata"
    )
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class BenchmarkAccumulator(StandardBaseDict, ABC, Generic[RequestT, ResponseT]):
    """
    Abstract base for collecting benchmark metrics while a scheduler runs.

    Carries the benchmark configuration and declares the single hook,
    ``update_estimate``, that concrete accumulators implement to fold each
    request update into their running metric estimates.
    """

    config: BenchmarkConfig = Field(
        description="Benchmark execution configuration and constraints"
    )

    @abstractmethod
    def update_estimate(
        self,
        response: ResponseT | None,
        request: RequestT | MultiTurnRequestT[RequestT],
        info: RequestInfo,
        scheduler_state: SchedulerState,
    ):
        """
        Fold one request update into the accumulated metrics.

        :param response: Backend response for the request, or None when no
            response is available
        :param request: The request (or multi-turn request) sent to the backend
        :param info: Timing, status, and execution metadata for the request
        :param scheduler_state: Scheduler state at the time of the update
        """
        ...
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class Benchmark(StandardBaseDict, ABC, Generic[BenchmarkAccumulatorT]):
    """
    Abstract base for the final, compiled results of a benchmark run.

    Declares the read-only timing and distribution properties every benchmark
    result exposes, plus the ``compile`` constructor that subclasses implement
    to build a result from an accumulator and the scheduler's state.
    """

    @property
    @abstractmethod
    def start_time(self) -> float:
        """
        :return: Timestamp at which the benchmark started, in seconds since epoch
        """

    @property
    @abstractmethod
    def end_time(self) -> float:
        """
        :return: Timestamp at which the benchmark completed, in seconds since epoch
        """

    @property
    @abstractmethod
    def duration(self) -> float:
        """
        :return: Length of the benchmark execution in seconds
        """

    @property
    @abstractmethod
    def request_latency(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of request latencies
        """

    @property
    @abstractmethod
    def request_throughput(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of throughput measurements
        """

    @property
    @abstractmethod
    def request_concurrency(self) -> StatusDistributionSummary:
        """
        :return: Distribution summary of concurrent request counts
        """

    @classmethod
    @abstractmethod
    def compile(
        cls, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState
    ) -> Any:
        """
        Build the final benchmark result from accumulated metrics.

        :param accumulator: Accumulator holding the metrics collected during the run
        :param scheduler_state: State of the scheduler once execution has finished
        :return: Compiled benchmark instance with complete statistical results
        """
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generative AI benchmark schemas for performance measurement and analysis.
|
|
3
|
+
|
|
4
|
+
This module provides the complete schema ecosystem for executing, tracking, and
|
|
5
|
+
analyzing generative AI benchmarks. It encompasses configuration entrypoints for
|
|
6
|
+
benchmark setup, real-time metric accumulators for execution monitoring,
|
|
7
|
+
comprehensive result containers with statistical summaries, and multi-benchmark
|
|
8
|
+
reporting capabilities. The schemas support domain-specific metrics for text,
|
|
9
|
+
image, video, and audio generation tasks, enabling detailed performance analysis
|
|
10
|
+
including throughput, latency distributions, concurrency patterns, and scheduler
|
|
11
|
+
behavior tracking across successful, incomplete, and errored requests.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from .accumulator import (
|
|
17
|
+
GenerativeBenchmarkAccumulator,
|
|
18
|
+
GenerativeBenchmarkTimings,
|
|
19
|
+
GenerativeMetricsAccumulator,
|
|
20
|
+
GenerativeRequestsAccumulator,
|
|
21
|
+
RunningMetricStats,
|
|
22
|
+
SchedulerMetricsAccumulator,
|
|
23
|
+
)
|
|
24
|
+
from .benchmark import GenerativeBenchmark
|
|
25
|
+
from .entrypoints import BenchmarkGenerativeTextArgs
|
|
26
|
+
from .metrics import (
|
|
27
|
+
GenerativeAudioMetricsSummary,
|
|
28
|
+
GenerativeImageMetricsSummary,
|
|
29
|
+
GenerativeMetrics,
|
|
30
|
+
GenerativeMetricsSummary,
|
|
31
|
+
GenerativeTextMetricsSummary,
|
|
32
|
+
GenerativeVideoMetricsSummary,
|
|
33
|
+
SchedulerMetrics,
|
|
34
|
+
)
|
|
35
|
+
from .report import GenerativeBenchmarkMetadata, GenerativeBenchmarksReport
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"BenchmarkGenerativeTextArgs",
|
|
39
|
+
"GenerativeAudioMetricsSummary",
|
|
40
|
+
"GenerativeBenchmark",
|
|
41
|
+
"GenerativeBenchmarkAccumulator",
|
|
42
|
+
"GenerativeBenchmarkMetadata",
|
|
43
|
+
"GenerativeBenchmarkTimings",
|
|
44
|
+
"GenerativeBenchmarksReport",
|
|
45
|
+
"GenerativeImageMetricsSummary",
|
|
46
|
+
"GenerativeMetrics",
|
|
47
|
+
"GenerativeMetricsAccumulator",
|
|
48
|
+
"GenerativeMetricsSummary",
|
|
49
|
+
"GenerativeRequestsAccumulator",
|
|
50
|
+
"GenerativeTextMetricsSummary",
|
|
51
|
+
"GenerativeVideoMetricsSummary",
|
|
52
|
+
"RunningMetricStats",
|
|
53
|
+
"SchedulerMetrics",
|
|
54
|
+
"SchedulerMetricsAccumulator",
|
|
55
|
+
]
|