guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/benchmark/outputs/html.py (new file)
@@ -0,0 +1,473 @@
+"""
+HTML output formatter for benchmark results.
+
+Transforms benchmark data into interactive web-based reports by building UI data
+structures, converting keys to camelCase for JavaScript compatibility, and injecting
+formatted data into HTML templates. The formatter processes GenerativeBenchmark
+instances and their associated metrics, creating histogram buckets for distributions,
+formatting percentile statistics for tabular display, and embedding all data as
+JavaScript objects within an HTML template for client-side rendering and visualization.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+import re
+from collections import defaultdict
+from copy import deepcopy
+from math import ceil
+from pathlib import Path
+from typing import Any, ClassVar
+
+from loguru import logger
+from pydantic import BaseModel, Field, computed_field
+
+from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput
+from guidellm.benchmark.schemas import (
+    BenchmarkGenerativeTextArgs,
+    GenerativeBenchmark,
+    GenerativeBenchmarksReport,
+)
+from guidellm.schemas import DistributionSummary, Percentiles
+from guidellm.settings import settings
+from guidellm.utils import camelize_str, recursive_key_update
+from guidellm.utils.text import load_text
+
+__all__ = ["GenerativeBenchmarkerHTML"]
+
+
+@GenerativeBenchmarkerOutput.register("html")
+class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput):
+    """
+    HTML output formatter for benchmark results.
+
+    Generates interactive HTML reports from benchmark data by transforming results
+    into camelCase JSON structures and injecting them into HTML templates. The
+    formatter processes benchmark metrics, creates histogram distributions, and
+    embeds all data into a pre-built HTML template for browser-based visualization.
+    Reports are saved to the specified output path or current working directory.
+
+    :cvar DEFAULT_FILE: Default filename for HTML output when a directory is provided
+    """
+
+    DEFAULT_FILE: ClassVar[str] = "benchmarks.html"
+
+    output_path: Path = Field(
+        default_factory=lambda: Path.cwd(),
+        description=(
+            "Directory or file path for saving the HTML report, "
+            "defaults to current working directory"
+        ),
+    )
+
+    @classmethod
+    def validated_kwargs(
+        cls, output_path: str | Path | None, **_kwargs
+    ) -> dict[str, Any]:
+        """
+        Validate and normalize output path argument.
+
+        :param output_path: Output file or directory path for the HTML report
+        :return: Dictionary containing validated output_path if provided
+        """
+        validated: dict[str, Any] = {}
+        if output_path is not None:
+            validated["output_path"] = (
+                Path(output_path) if not isinstance(output_path, Path) else output_path
+            )
+        return validated
+
+    async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
+        """
+        Generate and save the HTML benchmark report.
+
+        Transforms benchmark data into camelCase JSON format, injects it into the
+        HTML template, and writes the resulting report to the output path. Creates
+        parent directories if they don't exist.
+
+        :param report: Completed benchmark report containing all results
+        :return: Path to the saved HTML report file
+        """
+        output_path = self.output_path
+        if output_path.is_dir():
+            output_path = output_path / self.DEFAULT_FILE
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = _build_ui_data(report.benchmarks, report.args)
+        camel_data = recursive_key_update(deepcopy(data), camelize_str)
+
+        ui_api_data = {
+            f"window.{key} = {{}};": f"window.{key} = {json.dumps(value, indent=2)};\n"
+            for key, value in camel_data.items()
+        }
+
+        _create_html_report(ui_api_data, output_path)
+
+        return output_path
+
+
+class _Bucket(BaseModel):
+    """
+    Histogram bucket for data distribution visualization.
+
+    Represents a single bucket in a histogram with its starting value and count
+    of data points falling within the bucket range. Used to create distribution
+    histograms for metrics like token counts and request timings.
+    """
+
+    value: float | int = Field(description="Starting value of the bucket range")
+    count: int = Field(description="Number of data points falling within this bucket")
+
+    @staticmethod
+    def from_data(
+        data: list[float] | list[int],
+        bucket_width: float | None = None,
+        n_buckets: int | None = None,
+    ) -> tuple[list[_Bucket], float]:
+        """
+        Create histogram buckets from numeric data values.
+
+        Divides the data range into equal-width buckets and counts values within
+        each bucket. Either bucket_width or n_buckets can be specified; if neither
+        is provided, defaults to 10 buckets.
+
+        :param data: Numeric values to bucket
+        :param bucket_width: Width of each bucket, computed if None
+        :param n_buckets: Number of buckets, defaults to 10 if width not specified
+        :return: Tuple of bucket list and computed bucket width
+        """
+        if not data:
+            return [], 1.0
+
+        min_v = min(data)
+        max_v = max(data)
+        range_v = (1 + max_v) - min_v
+
+        if bucket_width is None:
+            if n_buckets is None:
+                n_buckets = 10
+            bucket_width = range_v / n_buckets
+        else:
+            n_buckets = ceil(range_v / bucket_width)
+
+        bucket_counts: defaultdict[float | int, int] = defaultdict(int)
+        for val in data:
+            idx = int((val - min_v) // bucket_width)
+            if idx >= n_buckets:
+                idx = n_buckets - 1
+            bucket_start = min_v + idx * bucket_width
+            bucket_counts[bucket_start] += 1
+
+        buckets = [
+            _Bucket(value=start, count=count)
+            for start, count in sorted(bucket_counts.items())
+        ]
+        return buckets, bucket_width
+
+
+class _TabularDistributionSummary(DistributionSummary):
+    """
+    Distribution summary with tabular percentile representation.
+
+    Extends DistributionSummary to provide percentile data formatted for table
+    display in the HTML report. Filters to show only key percentiles (p50, p90,
+    p95, p99) for concise presentation.
+    """
+
+    @computed_field
+    def percentile_rows(self) -> list[dict[str, str | float]]:
+        """
+        Format percentiles as table rows for UI display.
+
+        :return: List of dictionaries with percentile names and values
+        """
+        rows = [
+            {"percentile": name, "value": value}
+            for name, value in self.percentiles.model_dump().items()
+        ]
+        return list(
+            filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows)
+        )
+
+    def model_dump(self, **kwargs) -> dict:
+        """
+        Override model_dump to filter duplicate consecutive percentile values.
+
+        This prevents visualization errors when distributions have limited data
+        points causing multiple percentiles to collapse to the same value.
+
+        :param kwargs: Arguments to pass to parent model_dump
+        :return: Dictionary with filtered percentiles
+        """
+        data = super().model_dump(**kwargs)
+
+        if "percentiles" in data and data["percentiles"]:
+            filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"])
+            data["percentiles"] = filtered_percentiles
+
+        return data
+
+    @classmethod
+    def from_distribution_summary(
+        cls, distribution: DistributionSummary
+    ) -> _TabularDistributionSummary:
+        """
+        Convert standard DistributionSummary to tabular format.
+
+        :param distribution: Source distribution summary to convert
+        :return: Tabular distribution summary with formatted percentile rows
+        """
+        return cls(**distribution.model_dump())
+
+
+def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path:
+    """
+    Create HTML report by injecting JavaScript data into template.
+
+    Loads the HTML template, injects JavaScript data into the head section, and
+    writes the final report to the specified output path.
+
+    :param js_data: Dictionary mapping placeholder strings to JavaScript code
+    :param output_path: Path where HTML report will be saved
+    :return: Path to the saved report file
+    """
+    html_content = load_text(settings.report_generation.source)
+    report_content = _inject_data(js_data, html_content)
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(report_content)
+    return output_path
+
+
+def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]:
+    """
+    Filter out consecutive duplicate percentile values.
+
+    When distributions have very few data points, multiple percentiles can have
+    the same value, which causes visualization libraries to fail. This function
+    keeps only the largest percentile for consecutive duplicate values, which is
+    more mathematically accurate as higher percentiles have greater statistical
+    significance.
+
+    :param percentiles: Dictionary of percentile names to values
+    :return: Filtered percentiles dictionary with no consecutive duplicates
+    """
+    if not percentiles:
+        return percentiles
+
+    percentile_order = list(Percentiles.model_fields.keys())
+
+    # Iterate in reverse to keep the largest percentile for each value
+    filtered = {}
+    previous_value = None
+
+    for key in reversed(percentile_order):
+        if key in percentiles:
+            current_value = percentiles[key]
+            if previous_value is None or current_value != previous_value:
+                filtered[key] = current_value
+                previous_value = current_value
+
+    # Restore original order
+    return {key: filtered[key] for key in percentile_order if key in filtered}
+
+
+def _inject_data(js_data: dict[str, str], html: str) -> str:
+    """
+    Inject JavaScript data into HTML head section.
+
+    Replaces placeholder strings in the HTML head section with actual JavaScript
+    code containing benchmark data. Returns original HTML if no head section found.
+
+    :param js_data: Dictionary mapping placeholder strings to JavaScript code
+    :param html: HTML template content
+    :return: HTML with injected JavaScript data
+    """
+    head_match = re.search(r"<head[^>]*>(.*?)</head>", html, re.DOTALL | re.IGNORECASE)
+    if not head_match:
+        logger.warning("<head> section missing, returning original HTML.")
+        return html
+
+    head_content = head_match.group(1)
+
+    for placeholder, script in js_data.items():
+        head_content = head_content.replace(placeholder, script)
+
+    new_head = f"<head>{head_content}</head>"
+    return html[: head_match.start()] + new_head + html[head_match.end() :]
+
+
+def _build_ui_data(
+    benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+) -> dict[str, Any]:
+    """
+    Build complete UI data structure from benchmarks.
+
+    Aggregates benchmark results into a structured format for the HTML UI,
+    including run metadata, workload details, and per-benchmark metrics.
+
+    :param benchmarks: List of completed benchmark results
+    :param args: Benchmark configuration arguments
+    :return: Dictionary with run_info, workload_details, and benchmarks sections
+    """
+    return {
+        "run_info": _build_run_info(benchmarks, args),
+        "workload_details": _build_workload_details(benchmarks, args),
+        "benchmarks": _build_benchmarks(benchmarks),
+    }
+
+
+def _build_run_info(
+    benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+) -> dict[str, Any]:
+    """
+    Build run metadata from benchmarks.
+
+    Extracts model name, timestamp, and dataset information from the benchmark
+    configuration and results.
+
+    :param benchmarks: List of completed benchmark results
+    :param args: Benchmark configuration arguments
+    :return: Dictionary with model, task, timestamp, and dataset information
+    """
+    model = args.model or "N/A"
+    timestamp = max(bm.start_time for bm in benchmarks if bm.start_time is not None)
+    return {
+        "model": {"name": model, "size": 0},
+        "task": "N/A",
+        "timestamp": timestamp,
+        "dataset": {"name": "N/A"},
+    }
+
+
+def _build_workload_details(
+    benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+) -> dict[str, Any]:
+    """
+    Build workload details from benchmarks.
+
+    Aggregates prompt and generation samples, token distribution statistics,
+    request timing histograms, and server configuration. Samples up to 5 random
+    prompts and outputs for display.
+
+    :param benchmarks: List of completed benchmark results
+    :param args: Benchmark configuration arguments
+    :return: Dictionary with prompts, generations, request timing, and server info
+    """
+    target = args.target
+    rate_type = benchmarks[0].config.strategy.type_
+    successful_requests = [req for bm in benchmarks for req in bm.requests.successful]
+
+    sample_indices = random.sample(
+        range(len(successful_requests)), min(5, len(successful_requests))
+    )
+    sample_prompts = [
+        req.request_args.replace("\n", " ").replace('"', "'")
+        if (req := successful_requests[i]).request_args
+        else ""
+        for i in sample_indices
+    ]
+    sample_outputs = [
+        req.output.replace("\n", " ").replace('"', "'")
+        if (req := successful_requests[i]).output
+        else ""
+        for i in sample_indices
+    ]
+
+    prompt_tokens = [
+        float(req.prompt_tokens) if req.prompt_tokens is not None else -1
+        for bm in benchmarks
+        for req in bm.requests.successful
+    ]
+    output_tokens = [
+        float(req.output_tokens) if req.output_tokens is not None else -1
+        for bm in benchmarks
+        for req in bm.requests.successful
+    ]
+
+    prompt_token_buckets, _prompt_bucket_width = _Bucket.from_data(prompt_tokens, 1)
+    output_token_buckets, _output_bucket_width = _Bucket.from_data(output_tokens, 1)
+
+    prompt_token_stats = DistributionSummary.from_values(prompt_tokens)
+    output_token_stats = DistributionSummary.from_values(output_tokens)
+
+    min_start_time = benchmarks[0].start_time
+    all_req_times = [
+        req.info.timings.request_start - min_start_time
+        for bm in benchmarks
+        for req in bm.requests.successful
+        if req.info.timings.request_start is not None
+    ]
+
+    number_of_buckets = len(benchmarks)
+    request_buckets, bucket_width = _Bucket.from_data(
+        all_req_times, None, number_of_buckets
+    )
+
+    return {
+        "prompts": {
+            "samples": sample_prompts,
+            "token_distributions": {
+                "statistics": prompt_token_stats.model_dump()
+                if prompt_token_stats
+                else None,
+                "buckets": [b.model_dump() for b in prompt_token_buckets],
+                "bucket_width": 1,
+            },
+        },
+        "generations": {
+            "samples": sample_outputs,
+            "token_distributions": {
+                "statistics": output_token_stats.model_dump()
+                if output_token_stats
+                else None,
+                "buckets": [b.model_dump() for b in output_token_buckets],
+                "bucket_width": 1,
+            },
+        },
+        "requests_over_time": {
+            "requests_over_time": {
+                "buckets": [b.model_dump() for b in request_buckets],
+                "bucket_width": bucket_width,
+            },
+            "num_benchmarks": number_of_buckets,
+        },
+        "rate_type": rate_type,
+        "server": {"target": target},
+    }
+
+
+def _build_benchmarks(benchmarks: list[GenerativeBenchmark]) -> list[dict[str, Any]]:
+    """
+    Build benchmark metrics data for UI display.
+
+    Extracts key performance metrics from each benchmark including requests per
+    second, inter-token latency, time to first token, throughput, and request
+    latency. Formats distribution summaries for tabular display.
+
+    :param benchmarks: List of completed benchmark results
+    :return: List of dictionaries with formatted benchmark metrics
+    """
+    result = []
+    for bm in benchmarks:
+        result.append(
+            {
+                "requests_per_second": bm.metrics.requests_per_second.successful.mean,
+                "itl": _TabularDistributionSummary.from_distribution_summary(
+                    bm.metrics.inter_token_latency_ms.successful
+                ).model_dump(),
+                "ttft": _TabularDistributionSummary.from_distribution_summary(
+                    bm.metrics.time_to_first_token_ms.successful
+                ).model_dump(),
+                "throughput": _TabularDistributionSummary.from_distribution_summary(
+                    bm.metrics.output_tokens_per_second.successful
+                ).model_dump(),
+                "time_per_request": (
+                    _TabularDistributionSummary.from_distribution_summary(
+                        bm.metrics.request_latency.successful
+                    ).model_dump()
+                ),
+            }
+        )
+    return result
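For orientation, the sketch below shows one way the new HTML formatter could be driven directly. It is not part of the diff: the `report` object is assumed to be a completed GenerativeBenchmarksReport from an earlier benchmark run, and the output directory is arbitrary.

```python
# Illustrative sketch only, not part of the package diff. Assumes a completed
# GenerativeBenchmarksReport named `report` produced by a prior benchmark run.
import asyncio
from pathlib import Path

from guidellm.benchmark.outputs.html import GenerativeBenchmarkerHTML
from guidellm.benchmark.schemas import GenerativeBenchmarksReport


async def write_html(report: GenerativeBenchmarksReport) -> Path:
    # Passing a directory makes finalize() fall back to DEFAULT_FILE
    # ("benchmarks.html") inside that directory.
    formatter = GenerativeBenchmarkerHTML(output_path=Path("./results"))
    return await formatter.finalize(report)


# asyncio.run(write_html(report))
```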
guidellm/benchmark/outputs/output.py (new file)
@@ -0,0 +1,169 @@
+"""
+Base output interface for generative benchmarking results.
+
+This module defines the abstract base class for all benchmark output formatters in
+the guidellm system. Output formatters transform benchmark reports into various file
+formats (JSON, CSV, HTML, etc.) enabling flexible result persistence and analysis.
+The module leverages a registry pattern for dynamic format resolution and supports
+both direct instantiation and configuration-based initialization.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import Mapping, Sequence
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+
+from guidellm.benchmark.schemas import GenerativeBenchmarksReport
+from guidellm.utils import RegistryMixin
+
+__all__ = ["GenerativeBenchmarkerOutput"]
+
+
+class GenerativeBenchmarkerOutput(
+    BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC
+):
+    """
+    Abstract base for benchmark output formatters with registry support.
+
+    Defines the interface for transforming benchmark reports into various output
+    formats. Subclasses implement specific formatters (JSON, CSV, HTML) that can be
+    registered and resolved dynamically. Supports flexible initialization from string
+    identifiers, file paths, or configuration dictionaries enabling declarative
+    output configuration in benchmark runs.
+
+    Example:
+    ::
+        # Register and resolve output formats
+        outputs = GenerativeBenchmarkerOutput.resolve(
+            output_formats=["json", "csv"],
+            output_path="./results"
+        )
+
+        # Finalize outputs with benchmark report
+        for output in outputs.values():
+            await output.finalize(report)
+    """
+
+    model_config = ConfigDict(
+        extra="ignore",
+        arbitrary_types_allowed=True,
+        validate_assignment=True,
+        from_attributes=True,
+        use_enum_values=True,
+    )
+
+    @classmethod
+    @abstractmethod
+    def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]:
+        """
+        Validate and normalize initialization arguments for output formatter.
+
+        Processes positional and keyword arguments into a validated parameter
+        dictionary suitable for formatter instantiation. Subclasses implement
+        format-specific validation logic handling their unique parameter patterns.
+
+        :param args: Positional arguments for formatter configuration
+        :param kwargs: Keyword arguments for formatter configuration
+        :return: Validated dictionary of parameters for formatter creation
+        :raises NotImplementedError: Must be implemented by subclasses
+        """
+        ...
+
+    @classmethod
+    def resolve(
+        cls,
+        outputs: (
+            Sequence[str | GenerativeBenchmarkerOutput]
+            | Mapping[str, str | dict[str, Any] | GenerativeBenchmarkerOutput]
+            | None
+        ),
+        output_dir: str | Path | None,
+    ) -> dict[str, GenerativeBenchmarkerOutput]:
+        """
+        Resolve output format specifications into formatter instances.
+
+        Supports multiple input patterns: format identifiers (["json", "csv"]),
+        file paths (["results.json"]), format configurations ({"json": {"indent": 2}}),
+        or pre-instantiated formatters. Registered format types are resolved from the
+        registry and instantiated with validated parameters.
+
+        :param output_formats: Format specifications as sequence of identifiers/paths,
+            mapping of format configurations, or None for no outputs
+        :param output_path: Default output directory path for all formatters
+        :return: Dictionary mapping format keys to instantiated formatter instances
+        :raises TypeError: If format specification type is invalid
+        :raises ValueError: If format resolution or validation fails
+        """
+        if not outputs:
+            return {}
+
+        keys: Sequence[str]
+        values: Sequence[dict[str, Any] | GenerativeBenchmarkerOutput]
+        if isinstance(outputs, Mapping):
+            keys = list(outputs.keys())
+            values = list(outputs.values())  # type: ignore[arg-type]
+        else:
+            keys = []
+            values = []
+
+            for out in outputs:
+                if isinstance(out, str) and "." in out:
+                    # File name, extract extension as type
+                    ext = Path(out).suffix[1:].lower()
+                    keys.append(ext)
+                    values.append({"output_path": Path(output_dir or Path.cwd()) / out})
+                elif isinstance(out, str):
+                    # Assume registered type
+                    keys.append(out)
+                    values.append({})
+                elif isinstance(out, GenerativeBenchmarkerOutput):
+                    # Use class name as key
+                    keys.append(out.__class__.__name__)
+                    values.append(out)
+                else:
+                    raise TypeError(
+                        "output_formats must be a sequence of strings or "
+                        "GenerativeBenchmarkerOutput instances, or a mapping."
+                    )
+
+        resolved: dict[str, GenerativeBenchmarkerOutput] = {}
+        for key, val in zip(keys, values, strict=True):
+            if isinstance(val, GenerativeBenchmarkerOutput):
+                # Already resolved
+                resolved[key] = val
+            else:
+                # Resolve from registry
+                output_class = cls.get_registered_object(key)
+                if output_class is None:
+                    available_formats = (
+                        list(cls.registry.keys()) if cls.registry else []
+                    )
+                    raise ValueError(
+                        f"Output format '{key}' is not registered. "
+                        f"Available formats: {available_formats}"
+                    )
+                kwargs = output_class.validated_kwargs(
+                    **{"output_path": output_dir, **val}  # type: ignore[dict-item]
+                )
+                resolved[key] = output_class(**kwargs)
+
+        return resolved
+
+    @abstractmethod
+    async def finalize(self, report: GenerativeBenchmarksReport) -> Any:
+        """
+        Process and persist benchmark report in the formatter's output format.
+
+        Transforms the provided benchmark report into the target format and writes
+        results to the configured output destination. Implementation details vary by
+        formatter type (file writing, API calls, etc.).
+
+        :param report: Benchmark report containing results to format and output
+        :return: Format-specific output result (file path, response object, etc.)
+        :raises NotImplementedError: Must be implemented by subclasses
+        """
+        ...
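As a closing illustration of the registry-based resolution defined above, here is a hedged sketch. It is not part of the diff: only the "html" key is confirmed registered in this section, and treating "csv" as a registered key is an assumption based on the guidellm/benchmark/outputs/csv.py module in the file list.

```python
# Illustrative sketch only, not part of the package diff.
from pathlib import Path

from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput

# Resolve formatters from simple identifiers and file names. "html" is
# registered by outputs/html.py (shown above); "csv" as a registered key is an
# assumption based on the outputs/csv.py module shipped in this release.
formatters = GenerativeBenchmarkerOutput.resolve(
    outputs=["html", "benchmarks.csv"],
    output_dir=Path("./results"),
)

# Each resolved formatter persists the same report in its own format
# (finalize is async, so this would run inside an event loop):
#     for formatter in formatters.values():
#         await formatter.finalize(report)
```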