guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff covers publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/benchmark/outputs/html.py
@@ -0,0 +1,473 @@
+ """
+ HTML output formatter for benchmark results.
+
+ Transforms benchmark data into interactive web-based reports by building UI data
+ structures, converting keys to camelCase for JavaScript compatibility, and injecting
+ formatted data into HTML templates. The formatter processes GenerativeBenchmark
+ instances and their associated metrics, creating histogram buckets for distributions,
+ formatting percentile statistics for tabular display, and embedding all data as
+ JavaScript objects within an HTML template for client-side rendering and visualization.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import random
+ import re
+ from collections import defaultdict
+ from copy import deepcopy
+ from math import ceil
+ from pathlib import Path
+ from typing import Any, ClassVar
+
+ from loguru import logger
+ from pydantic import BaseModel, Field, computed_field
+
+ from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput
+ from guidellm.benchmark.schemas import (
+     BenchmarkGenerativeTextArgs,
+     GenerativeBenchmark,
+     GenerativeBenchmarksReport,
+ )
+ from guidellm.schemas import DistributionSummary, Percentiles
+ from guidellm.settings import settings
+ from guidellm.utils import camelize_str, recursive_key_update
+ from guidellm.utils.text import load_text
+
+ __all__ = ["GenerativeBenchmarkerHTML"]
+
+
+ @GenerativeBenchmarkerOutput.register("html")
+ class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput):
+     """
+     HTML output formatter for benchmark results.
+
+     Generates interactive HTML reports from benchmark data by transforming results
+     into camelCase JSON structures and injecting them into HTML templates. The
+     formatter processes benchmark metrics, creates histogram distributions, and
+     embeds all data into a pre-built HTML template for browser-based visualization.
+     Reports are saved to the specified output path or current working directory.
+
+     :cvar DEFAULT_FILE: Default filename for HTML output when a directory is provided
+     """
+
+     DEFAULT_FILE: ClassVar[str] = "benchmarks.html"
+
+     output_path: Path = Field(
+         default_factory=lambda: Path.cwd(),
+         description=(
+             "Directory or file path for saving the HTML report, "
+             "defaults to current working directory"
+         ),
+     )
+
+     @classmethod
+     def validated_kwargs(
+         cls, output_path: str | Path | None, **_kwargs
+     ) -> dict[str, Any]:
+         """
+         Validate and normalize output path argument.
+
+         :param output_path: Output file or directory path for the HTML report
+         :return: Dictionary containing validated output_path if provided
+         """
+         validated: dict[str, Any] = {}
+         if output_path is not None:
+             validated["output_path"] = (
+                 Path(output_path) if not isinstance(output_path, Path) else output_path
+             )
+         return validated
+
+     async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
+         """
+         Generate and save the HTML benchmark report.
+
+         Transforms benchmark data into camelCase JSON format, injects it into the
+         HTML template, and writes the resulting report to the output path. Creates
+         parent directories if they don't exist.
+
+         :param report: Completed benchmark report containing all results
+         :return: Path to the saved HTML report file
+         """
+         output_path = self.output_path
+         if output_path.is_dir():
+             output_path = output_path / self.DEFAULT_FILE
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         data = _build_ui_data(report.benchmarks, report.args)
+         camel_data = recursive_key_update(deepcopy(data), camelize_str)
+
+         ui_api_data = {
+             f"window.{key} = {{}};": f"window.{key} = {json.dumps(value, indent=2)};\n"
+             for key, value in camel_data.items()
+         }
+
+         _create_html_report(ui_api_data, output_path)
+
+         return output_path
+
+
+ class _Bucket(BaseModel):
+     """
+     Histogram bucket for data distribution visualization.
+
+     Represents a single bucket in a histogram with its starting value and count
+     of data points falling within the bucket range. Used to create distribution
+     histograms for metrics like token counts and request timings.
+     """
+
+     value: float | int = Field(description="Starting value of the bucket range")
+     count: int = Field(description="Number of data points falling within this bucket")
+
+     @staticmethod
+     def from_data(
+         data: list[float] | list[int],
+         bucket_width: float | None = None,
+         n_buckets: int | None = None,
+     ) -> tuple[list[_Bucket], float]:
+         """
+         Create histogram buckets from numeric data values.
+
+         Divides the data range into equal-width buckets and counts values within
+         each bucket. Either bucket_width or n_buckets can be specified; if neither
+         is provided, defaults to 10 buckets.
+
+         :param data: Numeric values to bucket
+         :param bucket_width: Width of each bucket, computed if None
+         :param n_buckets: Number of buckets, defaults to 10 if width not specified
+         :return: Tuple of bucket list and computed bucket width
+         """
+         if not data:
+             return [], 1.0
+
+         min_v = min(data)
+         max_v = max(data)
+         range_v = (1 + max_v) - min_v
+
+         if bucket_width is None:
+             if n_buckets is None:
+                 n_buckets = 10
+             bucket_width = range_v / n_buckets
+         else:
+             n_buckets = ceil(range_v / bucket_width)
+
+         bucket_counts: defaultdict[float | int, int] = defaultdict(int)
+         for val in data:
+             idx = int((val - min_v) // bucket_width)
+             if idx >= n_buckets:
+                 idx = n_buckets - 1
+             bucket_start = min_v + idx * bucket_width
+             bucket_counts[bucket_start] += 1
+
+         buckets = [
+             _Bucket(value=start, count=count)
+             for start, count in sorted(bucket_counts.items())
+         ]
+         return buckets, bucket_width
+
+
+ class _TabularDistributionSummary(DistributionSummary):
+     """
+     Distribution summary with tabular percentile representation.
+
+     Extends DistributionSummary to provide percentile data formatted for table
+     display in the HTML report. Filters to show only key percentiles (p50, p90,
+     p95, p99) for concise presentation.
+     """
+
+     @computed_field
+     def percentile_rows(self) -> list[dict[str, str | float]]:
+         """
+         Format percentiles as table rows for UI display.
+
+         :return: List of dictionaries with percentile names and values
+         """
+         rows = [
+             {"percentile": name, "value": value}
+             for name, value in self.percentiles.model_dump().items()
+         ]
+         return list(
+             filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows)
+         )
+
+     def model_dump(self, **kwargs) -> dict:
+         """
+         Override model_dump to filter duplicate consecutive percentile values.
+
+         This prevents visualization errors when distributions have limited data
+         points causing multiple percentiles to collapse to the same value.
+
+         :param kwargs: Arguments to pass to parent model_dump
+         :return: Dictionary with filtered percentiles
+         """
+         data = super().model_dump(**kwargs)
+
+         if "percentiles" in data and data["percentiles"]:
+             filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"])
+             data["percentiles"] = filtered_percentiles
+
+         return data
+
+     @classmethod
+     def from_distribution_summary(
+         cls, distribution: DistributionSummary
+     ) -> _TabularDistributionSummary:
+         """
+         Convert standard DistributionSummary to tabular format.
+
+         :param distribution: Source distribution summary to convert
+         :return: Tabular distribution summary with formatted percentile rows
+         """
+         return cls(**distribution.model_dump())
+
+
+ def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path:
+     """
+     Create HTML report by injecting JavaScript data into template.
+
+     Loads the HTML template, injects JavaScript data into the head section, and
+     writes the final report to the specified output path.
+
+     :param js_data: Dictionary mapping placeholder strings to JavaScript code
+     :param output_path: Path where HTML report will be saved
+     :return: Path to the saved report file
+     """
+     html_content = load_text(settings.report_generation.source)
+     report_content = _inject_data(js_data, html_content)
+
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     output_path.write_text(report_content)
+     return output_path
+
+
+ def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]:
+     """
+     Filter out consecutive duplicate percentile values.
+
+     When distributions have very few data points, multiple percentiles can have
+     the same value, which causes visualization libraries to fail. This function
+     keeps only the largest percentile for consecutive duplicate values, which is
+     more mathematically accurate as higher percentiles have greater statistical
+     significance.
+
+     :param percentiles: Dictionary of percentile names to values
+     :return: Filtered percentiles dictionary with no consecutive duplicates
+     """
+     if not percentiles:
+         return percentiles
+
+     percentile_order = list(Percentiles.model_fields.keys())
+
+     # Iterate in reverse to keep the largest percentile for each value
+     filtered = {}
+     previous_value = None
+
+     for key in reversed(percentile_order):
+         if key in percentiles:
+             current_value = percentiles[key]
+             if previous_value is None or current_value != previous_value:
+                 filtered[key] = current_value
+             previous_value = current_value
+
+     # Restore original order
+     return {key: filtered[key] for key in percentile_order if key in filtered}
+
+
+ def _inject_data(js_data: dict[str, str], html: str) -> str:
+     """
+     Inject JavaScript data into HTML head section.
+
+     Replaces placeholder strings in the HTML head section with actual JavaScript
+     code containing benchmark data. Returns original HTML if no head section found.
+
+     :param js_data: Dictionary mapping placeholder strings to JavaScript code
+     :param html: HTML template content
+     :return: HTML with injected JavaScript data
+     """
+     head_match = re.search(r"<head[^>]*>(.*?)</head>", html, re.DOTALL | re.IGNORECASE)
+     if not head_match:
+         logger.warning("<head> section missing, returning original HTML.")
+         return html
+
+     head_content = head_match.group(1)
+
+     for placeholder, script in js_data.items():
+         head_content = head_content.replace(placeholder, script)
+
+     new_head = f"<head>{head_content}</head>"
+     return html[: head_match.start()] + new_head + html[head_match.end() :]
+
+
+ def _build_ui_data(
+     benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+ ) -> dict[str, Any]:
+     """
+     Build complete UI data structure from benchmarks.
+
+     Aggregates benchmark results into a structured format for the HTML UI,
+     including run metadata, workload details, and per-benchmark metrics.
+
+     :param benchmarks: List of completed benchmark results
+     :param args: Benchmark configuration arguments
+     :return: Dictionary with run_info, workload_details, and benchmarks sections
+     """
+     return {
+         "run_info": _build_run_info(benchmarks, args),
+         "workload_details": _build_workload_details(benchmarks, args),
+         "benchmarks": _build_benchmarks(benchmarks),
+     }
+
+
+ def _build_run_info(
+     benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+ ) -> dict[str, Any]:
+     """
+     Build run metadata from benchmarks.
+
+     Extracts model name, timestamp, and dataset information from the benchmark
+     configuration and results.
+
+     :param benchmarks: List of completed benchmark results
+     :param args: Benchmark configuration arguments
+     :return: Dictionary with model, task, timestamp, and dataset information
+     """
+     model = args.model or "N/A"
+     timestamp = max(bm.start_time for bm in benchmarks if bm.start_time is not None)
+     return {
+         "model": {"name": model, "size": 0},
+         "task": "N/A",
+         "timestamp": timestamp,
+         "dataset": {"name": "N/A"},
+     }
+
+
+ def _build_workload_details(
+     benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs
+ ) -> dict[str, Any]:
+     """
+     Build workload details from benchmarks.
+
+     Aggregates prompt and generation samples, token distribution statistics,
+     request timing histograms, and server configuration. Samples up to 5 random
+     prompts and outputs for display.
+
+     :param benchmarks: List of completed benchmark results
+     :param args: Benchmark configuration arguments
+     :return: Dictionary with prompts, generations, request timing, and server info
+     """
+     target = args.target
+     rate_type = benchmarks[0].config.strategy.type_
+     successful_requests = [req for bm in benchmarks for req in bm.requests.successful]
+
+     sample_indices = random.sample(
+         range(len(successful_requests)), min(5, len(successful_requests))
+     )
+     sample_prompts = [
+         req.request_args.replace("\n", " ").replace('"', "'")
+         if (req := successful_requests[i]).request_args
+         else ""
+         for i in sample_indices
+     ]
+     sample_outputs = [
+         req.output.replace("\n", " ").replace('"', "'")
+         if (req := successful_requests[i]).output
+         else ""
+         for i in sample_indices
+     ]
+
+     prompt_tokens = [
+         float(req.prompt_tokens) if req.prompt_tokens is not None else -1
+         for bm in benchmarks
+         for req in bm.requests.successful
+     ]
+     output_tokens = [
+         float(req.output_tokens) if req.output_tokens is not None else -1
+         for bm in benchmarks
+         for req in bm.requests.successful
+     ]
+
+     prompt_token_buckets, _prompt_bucket_width = _Bucket.from_data(prompt_tokens, 1)
+     output_token_buckets, _output_bucket_width = _Bucket.from_data(output_tokens, 1)
+
+     prompt_token_stats = DistributionSummary.from_values(prompt_tokens)
+     output_token_stats = DistributionSummary.from_values(output_tokens)
+
+     min_start_time = benchmarks[0].start_time
+     all_req_times = [
+         req.info.timings.request_start - min_start_time
+         for bm in benchmarks
+         for req in bm.requests.successful
+         if req.info.timings.request_start is not None
+     ]
+
+     number_of_buckets = len(benchmarks)
+     request_buckets, bucket_width = _Bucket.from_data(
+         all_req_times, None, number_of_buckets
+     )
+
+     return {
+         "prompts": {
+             "samples": sample_prompts,
+             "token_distributions": {
+                 "statistics": prompt_token_stats.model_dump()
+                 if prompt_token_stats
+                 else None,
+                 "buckets": [b.model_dump() for b in prompt_token_buckets],
+                 "bucket_width": 1,
+             },
+         },
+         "generations": {
+             "samples": sample_outputs,
+             "token_distributions": {
+                 "statistics": output_token_stats.model_dump()
+                 if output_token_stats
+                 else None,
+                 "buckets": [b.model_dump() for b in output_token_buckets],
+                 "bucket_width": 1,
+             },
+         },
+         "requests_over_time": {
+             "requests_over_time": {
+                 "buckets": [b.model_dump() for b in request_buckets],
+                 "bucket_width": bucket_width,
+             },
+             "num_benchmarks": number_of_buckets,
+         },
+         "rate_type": rate_type,
+         "server": {"target": target},
+     }
+
+
+ def _build_benchmarks(benchmarks: list[GenerativeBenchmark]) -> list[dict[str, Any]]:
+     """
+     Build benchmark metrics data for UI display.
+
+     Extracts key performance metrics from each benchmark including requests per
+     second, inter-token latency, time to first token, throughput, and request
+     latency. Formats distribution summaries for tabular display.
+
+     :param benchmarks: List of completed benchmark results
+     :return: List of dictionaries with formatted benchmark metrics
+     """
+     result = []
+     for bm in benchmarks:
+         result.append(
+             {
+                 "requests_per_second": bm.metrics.requests_per_second.successful.mean,
+                 "itl": _TabularDistributionSummary.from_distribution_summary(
+                     bm.metrics.inter_token_latency_ms.successful
+                 ).model_dump(),
+                 "ttft": _TabularDistributionSummary.from_distribution_summary(
+                     bm.metrics.time_to_first_token_ms.successful
+                 ).model_dump(),
+                 "throughput": _TabularDistributionSummary.from_distribution_summary(
+                     bm.metrics.output_tokens_per_second.successful
+                 ).model_dump(),
+                 "time_per_request": (
+                     _TabularDistributionSummary.from_distribution_summary(
+                         bm.metrics.request_latency.successful
+                     ).model_dump()
+                 ),
+             }
+         )
+     return result
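
For orientation, here is a minimal usage sketch for the formatter above. It is not part of the diff: it assumes a GenerativeBenchmarksReport has already been produced by a benchmark run and that the class is imported from the new guidellm.benchmark.outputs.html module; per DEFAULT_FILE, a directory output_path resolves to <dir>/benchmarks.html.

# Hedged sketch: `report` is assumed to come from an earlier benchmark run;
# only APIs visible in the diff above are used.
import asyncio
from pathlib import Path

from guidellm.benchmark.outputs.html import GenerativeBenchmarkerHTML
from guidellm.benchmark.schemas import GenerativeBenchmarksReport


async def write_html_report(report: GenerativeBenchmarksReport) -> Path:
    # A directory output_path is expanded to <dir>/benchmarks.html via DEFAULT_FILE.
    formatter = GenerativeBenchmarkerHTML(output_path=Path("./results"))
    return await formatter.finalize(report)


def save_report(report: GenerativeBenchmarksReport) -> None:
    saved_to = asyncio.run(write_html_report(report))
    print(f"HTML report written to {saved_to}")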
guidellm/benchmark/outputs/output.py
@@ -0,0 +1,169 @@
+ """
+ Base output interface for generative benchmarking results.
+
+ This module defines the abstract base class for all benchmark output formatters in
+ the guidellm system. Output formatters transform benchmark reports into various file
+ formats (JSON, CSV, HTML, etc.) enabling flexible result persistence and analysis.
+ The module leverages a registry pattern for dynamic format resolution and supports
+ both direct instantiation and configuration-based initialization.
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel, ConfigDict
+
+ from guidellm.benchmark.schemas import GenerativeBenchmarksReport
+ from guidellm.utils import RegistryMixin
+
+ __all__ = ["GenerativeBenchmarkerOutput"]
+
+
+ class GenerativeBenchmarkerOutput(
+     BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC
+ ):
+     """
+     Abstract base for benchmark output formatters with registry support.
+
+     Defines the interface for transforming benchmark reports into various output
+     formats. Subclasses implement specific formatters (JSON, CSV, HTML) that can be
+     registered and resolved dynamically. Supports flexible initialization from string
+     identifiers, file paths, or configuration dictionaries enabling declarative
+     output configuration in benchmark runs.
+
+     Example:
+     ::
+         # Register and resolve output formats
+         outputs = GenerativeBenchmarkerOutput.resolve(
+             outputs=["json", "csv"],
+             output_dir="./results"
+         )
+
+         # Finalize outputs with benchmark report
+         for output in outputs.values():
+             await output.finalize(report)
+     """
+
+     model_config = ConfigDict(
+         extra="ignore",
+         arbitrary_types_allowed=True,
+         validate_assignment=True,
+         from_attributes=True,
+         use_enum_values=True,
+     )
+
+     @classmethod
+     @abstractmethod
+     def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]:
+         """
+         Validate and normalize initialization arguments for output formatter.
+
+         Processes positional and keyword arguments into a validated parameter
+         dictionary suitable for formatter instantiation. Subclasses implement
+         format-specific validation logic handling their unique parameter patterns.
+
+         :param args: Positional arguments for formatter configuration
+         :param kwargs: Keyword arguments for formatter configuration
+         :return: Validated dictionary of parameters for formatter creation
+         :raises NotImplementedError: Must be implemented by subclasses
+         """
+         ...
+
+     @classmethod
+     def resolve(
+         cls,
+         outputs: (
+             Sequence[str | GenerativeBenchmarkerOutput]
+             | Mapping[str, str | dict[str, Any] | GenerativeBenchmarkerOutput]
+             | None
+         ),
+         output_dir: str | Path | None,
+     ) -> dict[str, GenerativeBenchmarkerOutput]:
+         """
+         Resolve output format specifications into formatter instances.
+
+         Supports multiple input patterns: format identifiers (["json", "csv"]),
+         file paths (["results.json"]), format configurations ({"json": {"indent": 2}}),
+         or pre-instantiated formatters. Registered format types are resolved from the
+         registry and instantiated with validated parameters.
+
+         :param outputs: Format specifications as sequence of identifiers/paths,
+             mapping of format configurations, or None for no outputs
+         :param output_dir: Default output directory path for all formatters
+         :return: Dictionary mapping format keys to instantiated formatter instances
+         :raises TypeError: If format specification type is invalid
+         :raises ValueError: If format resolution or validation fails
+         """
+         if not outputs:
+             return {}
+
+         keys: Sequence[str]
+         values: Sequence[dict[str, Any] | GenerativeBenchmarkerOutput]
+         if isinstance(outputs, Mapping):
+             keys = list(outputs.keys())
+             values = list(outputs.values())  # type: ignore[arg-type]
+         else:
+             keys = []
+             values = []
+
+             for out in outputs:
+                 if isinstance(out, str) and "." in out:
+                     # File name, extract extension as type
+                     ext = Path(out).suffix[1:].lower()
+                     keys.append(ext)
+                     values.append({"output_path": Path(output_dir or Path.cwd()) / out})
+                 elif isinstance(out, str):
+                     # Assume registered type
+                     keys.append(out)
+                     values.append({})
+                 elif isinstance(out, GenerativeBenchmarkerOutput):
+                     # Use class name as key
+                     keys.append(out.__class__.__name__)
+                     values.append(out)
+                 else:
+                     raise TypeError(
+                         "output_formats must be a sequence of strings or "
+                         "GenerativeBenchmarkerOutput instances, or a mapping."
+                     )
+
+         resolved: dict[str, GenerativeBenchmarkerOutput] = {}
+         for key, val in zip(keys, values, strict=True):
+             if isinstance(val, GenerativeBenchmarkerOutput):
+                 # Already resolved
+                 resolved[key] = val
+             else:
+                 # Resolve from registry
+                 output_class = cls.get_registered_object(key)
+                 if output_class is None:
+                     available_formats = (
+                         list(cls.registry.keys()) if cls.registry else []
+                     )
+                     raise ValueError(
+                         f"Output format '{key}' is not registered. "
+                         f"Available formats: {available_formats}"
+                     )
+                 kwargs = output_class.validated_kwargs(
+                     **{"output_path": output_dir, **val}  # type: ignore[dict-item]
+                 )
+                 resolved[key] = output_class(**kwargs)
+
+         return resolved
+
+     @abstractmethod
+     async def finalize(self, report: GenerativeBenchmarksReport) -> Any:
+         """
+         Process and persist benchmark report in the formatter's output format.
+
+         Transforms the provided benchmark report into the target format and writes
+         results to the configured output destination. Implementation details vary by
+         formatter type (file writing, API calls, etc.).
+
+         :param report: Benchmark report containing results to format and output
+         :return: Format-specific output result (file path, response object, etc.)
+         :raises NotImplementedError: Must be implemented by subclasses
+         """
+         ...
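
To show how the registry hooks above are meant to be used, here is a hedged sketch of a third-party formatter; the PlainTextSummary class, its "txt" key, and the lines it writes are illustrative assumptions, not part of the released package. It relies only on the register decorator, validated_kwargs, finalize, and the metric access pattern shown in the HTML formatter earlier.

# Hypothetical subclass, sketched from the interface above; not shipped in the wheel.
from __future__ import annotations

from pathlib import Path
from typing import Any

from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput
from guidellm.benchmark.schemas import GenerativeBenchmarksReport


@GenerativeBenchmarkerOutput.register("txt")
class PlainTextSummary(GenerativeBenchmarkerOutput):
    output_path: Path = Path("benchmarks.txt")

    @classmethod
    def validated_kwargs(
        cls, output_path: str | Path | None = None, **_kwargs
    ) -> dict[str, Any]:
        # Mirror the HTML formatter: accept either a file or a directory path.
        return {} if output_path is None else {"output_path": Path(output_path)}

    async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
        output_path = self.output_path
        if output_path.is_dir():
            output_path = output_path / "benchmarks.txt"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Field path for the mean request rate follows the HTML formatter above.
        lines = [
            f"{bm.metrics.requests_per_second.successful.mean:.2f} req/s"
            for bm in report.benchmarks
        ]
        output_path.write_text("\n".join(lines))
        return output_path


# Once registered, resolve() can pick it up by key alongside the built-in formats:
# outputs = GenerativeBenchmarkerOutput.resolve(["txt", "html"], output_dir="./results")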