guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a169__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
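Several entries above are renames rather than additions, e.g. guidellm/config.py → guidellm/settings.py, guidellm/objects/statistics.py → guidellm/utils/statistics.py, and the old guidellm/backend package giving way to guidellm/backends. A minimal compatibility sketch for downstream code, assuming the file renames map one-to-one to import paths (only the module paths below are taken from the file list; nothing else is confirmed by this diff):

    # Hypothetical shim: prefer the 0.4.0a169 module paths, fall back to 0.4.0a21.
    try:
        import guidellm.settings as guidellm_settings  # renamed from guidellm.config
    except ImportError:
        import guidellm.config as guidellm_settings

    try:
        from guidellm import backends as guidellm_backends  # new backends package
    except ImportError:
        from guidellm import backend as guidellm_backends  # old package, removed
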
guidellm/benchmark/benchmark.py (deleted)
@@ -1,837 +0,0 @@
- import random
- import uuid
- from typing import Any, Literal, Optional, TypeVar, Union
-
- from pydantic import Field, computed_field
-
- from guidellm.benchmark.profile import (
-     AsyncProfile,
-     ConcurrentProfile,
-     Profile,
-     SweepProfile,
-     SynchronousProfile,
-     ThroughputProfile,
- )
- from guidellm.objects import (
-     StandardBaseModel,
-     StatusBreakdown,
-     StatusDistributionSummary,
- )
- from guidellm.request import (
-     GenerativeRequestLoaderDescription,
-     RequestLoaderDescription,
- )
- from guidellm.scheduler import (
-     AsyncConstantStrategy,
-     AsyncPoissonStrategy,
-     ConcurrentStrategy,
-     GenerativeRequestsWorkerDescription,
-     SchedulerRequestInfo,
-     SchedulingStrategy,
-     SynchronousStrategy,
-     ThroughputStrategy,
-     WorkerDescription,
- )
-
- __all__ = [
-     "Benchmark",
-     "BenchmarkArgs",
-     "BenchmarkMetrics",
-     "BenchmarkRunStats",
-     "BenchmarkT",
-     "GenerativeBenchmark",
-     "GenerativeMetrics",
-     "GenerativeTextErrorStats",
-     "GenerativeTextResponseStats",
-     "StatusBreakdown",
- ]
-
-
- class BenchmarkArgs(StandardBaseModel):
-     """
-     A serializable model representing the arguments used to specify a benchmark run
-     and how data was collected for it.
-     """
-
-     profile: Union[
-         AsyncProfile,
-         SweepProfile,
-         ConcurrentProfile,
-         ThroughputProfile,
-         SynchronousProfile,
-         Profile,
-     ] = Field(
-         description=(
-             "The profile used for the entire benchmark run that the strategy for "
-             "this benchmark was pulled from."
-         ),
-         discriminator="type_",
-     )
-     strategy_index: int = Field(
-         description=(
-             "The index of the strategy in the profile that was used for this benchmark."
-         )
-     )
-     strategy: Union[
-         ConcurrentStrategy,
-         SchedulingStrategy,
-         ThroughputStrategy,
-         SynchronousStrategy,
-         AsyncPoissonStrategy,
-         AsyncConstantStrategy,
-         SchedulingStrategy,
-     ] = Field(
-         description="The scheduling strategy used to run this benchmark. ",
-         discriminator="type_",
-     )
-     max_number: Optional[int] = Field(
-         description="The maximum number of requests to run for this benchmark, if any."
-     )
-     max_duration: Optional[float] = Field(
-         description="The maximum duration in seconds to run this benchmark, if any."
-     )
-     warmup_number: Optional[int] = Field(
-         description=(
-             "The number of requests to run for the warmup phase of this benchmark, "
-             "if any. These are requests that were not included in the final results."
-         )
-     )
-     warmup_duration: Optional[float] = Field(
-         description=(
-             "The duration in seconds to run for the warmup phase of this benchmark, "
-             "if any. These are requests that were not included in the final results."
-         )
-     )
-     cooldown_number: Optional[int] = Field(
-         description=(
-             "The number of requests to run for the cooldown phase of this benchmark, "
-             "if any. These are requests that were not included in the final results."
-         )
-     )
-     cooldown_duration: Optional[float] = Field(
-         description=(
-             "The duration in seconds to run for the cooldown phase of this benchmark, "
-             "if any. These are requests that were not included in the final results."
-         )
-     )
-
-
- class BenchmarkRunStats(StandardBaseModel):
-     """
-     A serializable model representing the run process statistics for the
-     entire benchmark run across all requests including warmup and cooldown.
-     """
-
-     start_time: float = Field(
-         description="The start time of the benchmark run.",
-     )
-     end_time: float = Field(
-         description="The end time of the benchmark run.",
-     )
-     requests_made: StatusBreakdown[int, int, int, int] = Field(
-         description=(
-             "The number of requests made for the benchmark run broken down by "
-             "status including successful, incomplete, errored, and the sum of all three"
-         )
-     )
-     queued_time_avg: float = Field(
-         description=(
-             "The average time spent in the queue for each request in the benchmark "
-             "run until it was dequeued by a worker."
-         )
-     )
-     scheduled_time_delay_avg: float = Field(
-         description=(
-             "The average time delay between when a request was dequeued and when it "
-             "was scheduled to be processed by a worker in the benchmark run. "
-             "This should be as close to 0 as possible, any additional time is "
-             "overheads from the system or the worker."
-         )
-     )
-     scheduled_time_sleep_avg: float = Field(
-         description=(
-             "The average time spent sleeping til the desired start time was reached "
-             "after being scheduled by the worker in the benchmark run."
-         )
-     )
-     worker_start_delay_avg: float = Field(
-         description=(
-             "The average time delay between when a request was scheduled and when "
-             "the worker started processing it in the benchmark run. "
-             "This should be as close to 0 as possible, any additional time is "
-             "overheads from the system or the worker."
-         )
-     )
-     worker_time_avg: float = Field(
-         description=(
-             "The average time taken by the worker to process each request in the "
-             "benchmark run. This includes the time to generate the response and "
-             "any additional processing time."
-         )
-     )
-     worker_start_time_targeted_delay_avg: float = Field(
-         description=(
-             "The average time delay between when a request was targeted to start "
-             "and when the worker actually started processing it in the benchmark "
-             "run. For async strategies, this represents delays from the ideal "
-             "system. For sync strategies, since those are doubled in queue, "
-             "this should be as close to the time for a request to be processed "
-             "as possible. Any additional time is overhead from the system or "
-             "the worker."
-         )
-     )
-     request_start_time_delay_avg: float = Field(
-         description=(
-             "The average time delay between the actual request being made "
-             "and the time the worker started on the request for all requests "
-             "that completed within the benchmark run. This time should be as close "
-             "to 0 as possible, any additional time is overhead from the system or "
-             "the worker."
-         )
-     )
-     request_start_time_targeted_delay_avg: float = Field(
-         description=(
-             "The average time delay between when the targeted start time and "
-             "the actual start time for each request in the benchmark run. "
-             "For async strategies, this represents delays from the ideal "
-             "system. For sync strategies, this should be as close to the "
-             "time for a request to be processed as possible. Any additional "
-             "time is overhead from the system or the worker."
-         )
-     )
-     request_time_delay_avg: float = Field(
-         description=(
-             "The average time delay between the total request time and the "
-             "worker time. This should be as close to 0 as possible, any additional "
-             "time is overhead from the system or the worker. "
-         )
-     )
-     request_time_avg: float = Field(
-         description=(
-             "The average time spent processing all requests in the benchmark run. "
-             "This is the time from when the actual request was started to when "
-             "it was completed."
-         )
-     )
-
-
- class BenchmarkMetrics(StandardBaseModel):
-     """
-     A serializable model representing the metrics for a benchmark run.
-     """
-
-     requests_per_second: StatusDistributionSummary = Field(
-         description="The distribution of requests per second for the benchmark.",
-     )
-     request_concurrency: StatusDistributionSummary = Field(
-         description="The distribution of requests concurrency for the benchmark.",
-     )
-
-
- class Benchmark(StandardBaseModel):
-     """
-     The base serializable model representing a benchmark run and its results.
-     Specific benchmarker implementations should extend this model to include
-     additional information or metadata as needed.
-
-     Note, requests_per_second and request_concurrency are kept at this level
-     and are expected to be populated by the subclass implementation to ensure
-     the logic for Profiles can include more complicated logic for determining
-     what rates and concurrency values to use for subsequent strategies.
-     """
-
-     type_: Literal["benchmark"] = "benchmark"
-     id_: str = Field(
-         default_factory=lambda: str(uuid.uuid4()),
-         description="The unique identifier for the benchmark.",
-     )
-     run_id: str = Field(
-         description=(
-             "The unique identifier for the encompasing benchmark run that this "
-             "benchmark was a part of."
-         )
-     )
-     args: BenchmarkArgs = Field(
-         description=(
-             "The arguments used to specify how to run the benchmark and collect data."
-         )
-     )
-     run_stats: BenchmarkRunStats = Field(
-         description=(
-             "The process statistics for the entire benchmark run across all requests."
-         )
-     )
-     worker: Union[WorkerDescription] = Field(
-         description=(
-             "The description and specifics for the worker used to resolve requests "
-             "for this benchmark."
-         ),
-     )
-     request_loader: Union[RequestLoaderDescription] = Field(
-         description=(
-             "The description and specifics for the request loader used to create "
-             "requests for this benchmark."
-         ),
-     )
-     extras: dict[str, Any] = Field(
-         description=(
-             "Any additional information or metadata that was passed for this benchmark."
-         )
-     )
-     metrics: BenchmarkMetrics = Field(
-         description=(
-             "The metrics for the benchmark run represented as a distribution of "
-             "various per-request statistics."
-         ),
-     )
-
-
- BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)
-
-
- class GenerativeTextResponseStats(StandardBaseModel):
-     """
-     A serializable model representing the request values, response values, and
-     statistics for a generative text response.
-     """
-
-     type_: Literal["generative_text_response"] = "generative_text_response"
-     request_id: Optional[str] = Field(
-         description="The unique identifier for the request.",
-     )
-     request_type: Literal["text_completions", "chat_completions"] = Field(
-         description="The type of request made to the generative backend."
-     )
-     scheduler_info: SchedulerRequestInfo = Field(
-         description=(
-             "The info about the request from the scheduler about how it was run."
-         ),
-     )
-     prompt: str = Field(
-         description="The text prompt used for the generative request.",
-     )
-     output: str = Field(
-         description="The generated text output from the generative request.",
-     )
-     prompt_tokens: int = Field(
-         description="The number of tokens in the prompt text.",
-     )
-     output_tokens: int = Field(
-         description="The number of tokens in the generated output text.",
-     )
-     start_time: float = Field(
-         description="The time the request started.",
-     )
-     end_time: float = Field(
-         description="The time the request ended.",
-     )
-     first_token_time: float = Field(
-         description="The time the first token was received.",
-     )
-     last_token_time: float = Field(
-         description="The time the last token was received.",
-     )
-
-     @computed_field # type: ignore[misc]
-     @property
-     def request_latency(self) -> float:
-         """
-         :return: The duration of the request in seconds from the start to the end.
-         """
-         return self.end_time - self.start_time
-
-     @computed_field # type: ignore[misc]
-     @property
-     def time_to_first_token_ms(self) -> float:
-         """
-         :return: The time in milliseconds from the start of the request to the first
-             token received.
-         """
-         return 1000 * (self.first_token_time - self.start_time)
-
-     @computed_field # type: ignore[misc]
-     @property
-     def time_per_output_token_ms(self) -> float:
-         """
-         :return: The average time in milliseconds per output token generated.
-             This includes the time to generate the first token and all other tokens.
-         """
-         if self.output_tokens == 0:
-             return 0.0
-
-         return (
-             1000 * (self.last_token_time - self.first_token_time) / self.output_tokens
-         )
-
-     @computed_field # type: ignore[misc]
-     @property
-     def inter_token_latency_ms(self) -> float:
-         """
-         :return: The average time in milliseconds between generating tokens in the
-             output text. Note, does not include the time to generate the first token.
-         """
-         if self.output_tokens <= 1:
-             return 0.0
-
-         return (
-             1000
-             * (self.last_token_time - self.first_token_time)
-             / (self.output_tokens - 1)
-         )
-
-     @computed_field # type: ignore[misc]
-     @property
-     def tokens_per_second(self) -> float:
-         """
-         :return: The average number of tokens generated per second in the prompt and
-             output text.
-         """
-         if (latency := self.request_latency) == 0.0:
-             return 0.0
-
-         return (self.prompt_tokens + self.output_tokens) / latency
-
-     @computed_field # type: ignore[misc]
-     @property
-     def output_tokens_per_second(self) -> float:
-         """
-         :return: The average number of output tokens generated per second.
-         """
-         if (latency := self.request_latency) == 0.0:
-             return 0.0
-
-         return self.output_tokens / latency
-
-
- class GenerativeTextErrorStats(GenerativeTextResponseStats):
-     """
-     A serializable model representing the request values, response values, and
-     statistics for a generative text response that errored.
-     Extends and overrides the GenerativeTextResponseStats model to include the
-     error message and optional properties given the error occurred.
-     """
-
-     type_: Literal["generative_text_error"] = "generative_text_error" # type: ignore[assignment]
-     error: str = Field(
-         description=(
-             "The error message for the error that occurred while making the request."
-         )
-     )
-     output: Optional[str] = Field( # type: ignore[assignment]
-         default=None,
-         description=(
-             "The generated text output from the generative request, if any, "
-             "before the error occurred."
-         ),
-     )
-     first_token_time: Optional[float] = Field( # type: ignore[assignment]
-         default=None,
-         description=(
-             "The time the first token was received, if any, before the error occurred."
-         ),
-     )
-     last_token_time: Optional[float] = Field( # type: ignore[assignment]
-         default=None,
-         description=(
-             "The time the last token was received, if any, before the error occurred."
-         ),
-     )
-
-     @computed_field # type: ignore[misc]
-     @property
-     def time_to_first_token_ms(self) -> Optional[float]: # type: ignore[override]
-         """
-         :return: The time in milliseconds from the start of the request to the first
-             token received. None if the first token was not received.
-         """
-         if self.first_token_time is None:
-             return None
-
-         return super().time_to_first_token_ms
-
-     @computed_field # type: ignore[misc]
-     @property
-     def time_per_output_token_ms(self) -> Optional[float]: # type: ignore[override]
-         """
-         :return: The average time in milliseconds per output token generated.
-             This includes the time to generate the first token and all other tokens.
-             None if the output_tokens is None or 0.
-         """
-         if (
-             self.output_tokens is None
-             or self.output_tokens == 0
-             or self.first_token_time is None
-             or self.last_token_time is None
-         ):
-             return None
-
-         return super().time_per_output_token_ms
-
-     @computed_field # type: ignore[misc]
-     @property
-     def inter_token_latency_ms(self) -> Optional[float]: # type: ignore[override]
-         """
-         :return: The average time in milliseconds between generating tokens in the
-             output text. Note, does not include the time to generate the first token.
-             None if there were no output_tokens or the first token was not received.
-         """
-         if (
-             self.output_tokens is None
-             or self.first_token_time is None
-             or self.last_token_time is None
-         ):
-             return None
-
-         return super().inter_token_latency_ms
-
-     @computed_field # type: ignore[misc]
-     @property
-     def output_tokens_per_second(self) -> Optional[float]: # type: ignore[override]
-         """
-         :return: The average number of tokens generated per second in the output text.
-             Note, does not include the time to generate the first token. None if there
-             were no output_tokens or the first token was not received.
-         """
-         if self.inter_token_latency_ms is None:
-             return None
-
-         return super().output_tokens_per_second
-
-
- class GenerativeMetrics(BenchmarkMetrics):
-     """
-     A serializable model representing the metrics for a generative benchmark run.
-     """
-
-     request_latency: StatusDistributionSummary = Field(
-         description="The distribution of latencies for the completed requests.",
-     )
-     prompt_token_count: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of token counts in the prompts for completed, "
-             "errored, and all requests."
-         )
-     )
-     output_token_count: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of token counts in the outputs for completed, "
-             "errored, and all requests."
-         )
-     )
-     time_to_first_token_ms: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of latencies to receiving the first token in "
-             "milliseconds for completed, errored, and all requests."
-         ),
-     )
-     time_per_output_token_ms: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of latencies per output token in milliseconds for "
-             "completed, errored, and all requests. "
-             "This includes the time to generate the first token and all other tokens."
-         ),
-     )
-     inter_token_latency_ms: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of latencies between tokens in milliseconds for "
-             "completed, errored, and all requests."
-         ),
-     )
-     output_tokens_per_second: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of output tokens per second for completed, "
-             "errored, and all requests."
-         ),
-     )
-     tokens_per_second: StatusDistributionSummary = Field(
-         description=(
-             "The distribution of tokens per second, including prompt and output tokens "
-             "for completed, errored, and all requests."
-         ),
-     )
-
-
- class GenerativeBenchmark(Benchmark):
-     """
-     A serializable model representing a benchmark run and its results for generative
-     requests and responses. Includes the completed and errored requests, the start
-     and end times for the benchmark, and the statistics for the requests and responses.
-     """
-
-     type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment]
-     start_time: float = Field(
-         description="The start time of the first request for the benchmark.",
-     )
-     end_time: float = Field(
-         description="The end time of the last request for the benchmark.",
-     )
-
-     @computed_field # type: ignore[misc]
-     @property
-     def duration(self) -> float:
-         """
-         :return: The duration of the benchmark in seconds from the start of the
-             first request to the end of the last request.
-         """
-         return self.end_time - self.start_time
-
-     worker: GenerativeRequestsWorkerDescription = Field(
-         description=(
-             "The description and specifics for the worker used to resolve requests "
-             "for this benchmark."
-         ),
-     )
-     request_loader: GenerativeRequestLoaderDescription = Field(
-         description=(
-             "The description and specifics for the request loader used to create "
-             "requests for this benchmark."
-         ),
-     )
-     metrics: GenerativeMetrics = Field(
-         description=(
-             "The metrics for the benchmark run represented as a distribution of "
-             "various per-request statistics."
-         ),
-     )
-     # Output is ordered so keep the requests at the end for better readability in files
-     request_totals: StatusBreakdown[int, int, int, int] = Field(
-         description=(
-             "The number of requests made for the benchmark broken down by status "
-             "including successful, incomplete, errored, and the sum of all three"
-         )
-     )
-     request_samples: Optional[StatusBreakdown[int, int, int, None]] = Field(
-         description=(
-             "The number of requests that were randomly sampled for "
-             "the benchmark. None if no sampling was applied."
-         ),
-         default=None,
-     )
-     requests: StatusBreakdown[
-         list[GenerativeTextResponseStats],
-         list[GenerativeTextErrorStats],
-         list[GenerativeTextErrorStats],
-         None,
-     ] = Field(
-         description=(
-             "The breakdown of requests for the benchmark run including successful, "
-             "incomplete, and errored requests."
-         ),
-     )
-
-     def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
-         """
-         Set the sample size for the benchmark. This will randomly sample the
-         requests for each status type to the given sample size or the maximum
-         number of requests for that status type, whichever is smaller.
-         This is applied to requests.successful, requests.errored, and
-         requests.incomplete.
-         If None, no sampling is applied and the state is kept.
-
-         :param sample_size: The number of requests to sample for each status type.
-         :return: The benchmark with the sampled requests.
-         :raises ValueError: If the sample size is invalid.
-         """
-
-         if sample_size is not None:
-             if sample_size < 0 or not isinstance(sample_size, int):
-                 raise ValueError(
-                     f"Sample size must be non-negative integer, given {sample_size}"
-                 )
-
-             sample_size = min(sample_size, len(self.requests.successful))
-             error_sample_size = min(sample_size, len(self.requests.errored))
-             incomplete_sample_size = min(sample_size, len(self.requests.incomplete))
-
-             self.requests.successful = random.sample(
-                 self.requests.successful, sample_size
-             )
-             self.requests.errored = random.sample(
-                 self.requests.errored, error_sample_size
-             )
-             self.requests.incomplete = random.sample(
-                 self.requests.incomplete, incomplete_sample_size
-             )
-             self.request_samples = StatusBreakdown(
-                 successful=len(self.requests.successful),
-                 incomplete=len(self.requests.incomplete),
-                 errored=len(self.requests.errored),
-             )
-
-         return self
-
-     @staticmethod
-     def from_stats(
-         run_id: str,
-         successful: list[GenerativeTextResponseStats],
-         incomplete: list[GenerativeTextErrorStats],
-         errored: list[GenerativeTextErrorStats],
-         args: BenchmarkArgs,
-         run_stats: BenchmarkRunStats,
-         worker: GenerativeRequestsWorkerDescription,
-         requests_loader: GenerativeRequestLoaderDescription,
-         extras: Optional[dict[str, Any]],
-     ) -> "GenerativeBenchmark":
-         """
-         Create a GenerativeBenchmark instance from the given statistics and metadata.
-         Given the completed and errored requests, the benchmark will fill in the
-         remaining statistics for the various metrics required for a benchmark.
-         This is the preferred method for creating a GenerativeBenchmark instance
-         to ensure all statistics are properly calculated and populated.
-
-         :param run_id: The unique identifier for the benchmark run.
-         :param completed: The list of completed requests.
-         :param errored: The list of errored requests.
-         :param args: The arguments used to specify how to run the benchmark
-             and collect data.
-         :param run_stats: The process statistics for the entire benchmark run across
-             all requests.
-         :param worker: The description and specifics for the worker used to resolve
-             requests.
-         :param requests_loader: The description and specifics for the request loader
-             used to create requests.
-         :param extras: Any additional information or metadata that was passed for
-             this benchmark.
-         :return: A GenerativeBenchmark instance with the given statistics and metadata
-             populated and calculated
-         """
-         total = successful + incomplete + errored
-         total_types: list[Literal["successful", "incomplete", "error"]] = [
-             *["successful"] * len(successful), # type: ignore[list-item]
-             *["incomplete"] * len(incomplete), # type: ignore[list-item]
-             *["error"] * len(errored), # type: ignore[list-item]
-         ]
-         start_time = min(req.start_time for req in total)
-         end_time = max(req.end_time for req in total)
-
-         total_with_prompt, total_types_with_prompt = (
-             zip(*filtered)
-             if (
-                 filtered := list(
-                     filter(lambda val: bool(val[0].prompt), zip(total, total_types))
-                 )
-             )
-             else ([], [])
-         )
-         total_with_output_first, total_types_with_output_first = (
-             zip(*filtered)
-             if (
-                 filtered := list(
-                     filter(
-                         lambda val: bool(val[0].output_tokens > 0),
-                         zip(total, total_types),
-                     )
-                 )
-             )
-             else ([], [])
-         )
-         total_with_output_multi, total_types_with_output_multi = (
-             zip(*filtered)
-             if (
-                 filtered := list(
-                     filter(
-                         lambda val: bool(val[0].output_tokens > 1),
-                         zip(total, total_types),
-                     )
-                 )
-             )
-             else ([], [])
-         )
-
-         return GenerativeBenchmark(
-             run_id=run_id,
-             args=args,
-             run_stats=run_stats,
-             extras=extras or {},
-             start_time=start_time,
-             end_time=end_time,
-             worker=worker,
-             request_loader=requests_loader,
-             metrics=GenerativeMetrics(
-                 requests_per_second=StatusDistributionSummary.from_request_times(
-                     request_types=total_types,
-                     requests=[(req.start_time, req.end_time) for req in total],
-                     distribution_type="rate",
-                 ),
-                 request_concurrency=StatusDistributionSummary.from_request_times(
-                     request_types=total_types,
-                     requests=[(req.start_time, req.end_time) for req in total],
-                     distribution_type="concurrency",
-                 ),
-                 request_latency=StatusDistributionSummary.from_values(
-                     value_types=total_types,
-                     values=[req.request_latency for req in total],
-                 ),
-                 prompt_token_count=StatusDistributionSummary.from_values(
-                     value_types=list(total_types_with_prompt),
-                     values=[req.prompt_tokens for req in total_with_prompt],
-                 ),
-                 output_token_count=StatusDistributionSummary.from_values(
-                     value_types=list(total_types_with_output_first),
-                     values=[req.output_tokens for req in total_with_output_first],
-                 ),
-                 time_to_first_token_ms=StatusDistributionSummary.from_values(
-                     value_types=list(total_types_with_output_first),
-                     values=[
-                         req.time_to_first_token_ms or 0
-                         for req in total_with_output_first
-                     ],
-                 ),
-                 time_per_output_token_ms=StatusDistributionSummary.from_values(
-                     value_types=list(total_types_with_output_first),
-                     values=[
-                         req.time_per_output_token_ms or 0
-                         for req in total_with_output_first
-                     ],
-                     weights=[req.output_tokens for req in total_with_output_first],
-                 ),
-                 inter_token_latency_ms=StatusDistributionSummary.from_values(
-                     value_types=list(total_types_with_output_multi),
-                     values=[
-                         req.inter_token_latency_ms or 0
-                         for req in total_with_output_multi
-                     ],
-                     weights=[req.output_tokens - 1 for req in total_with_output_multi],
-                 ),
-                 output_tokens_per_second=StatusDistributionSummary.from_iterable_request_times(
-                     request_types=list(total_types_with_output_first),
-                     requests=[
-                         (req.start_time, req.end_time)
-                         for req in total_with_output_first
-                     ],
-                     first_iter_times=[
-                         req.first_token_time or req.start_time
-                         for req in total_with_output_first
-                     ],
-                     iter_counts=[req.output_tokens for req in total_with_output_first],
-                 ),
-                 tokens_per_second=StatusDistributionSummary.from_iterable_request_times(
-                     request_types=list(total_types_with_output_first),
-                     requests=[
-                         (req.start_time, req.end_time)
-                         for req in total_with_output_first
-                     ],
-                     first_iter_times=[
-                         req.first_token_time or req.start_time
-                         for req in total_with_output_first
-                     ],
-                     iter_counts=[req.output_tokens for req in total_with_output_first],
-                     first_iter_counts=[
-                         # prompt tokens + first token
-                         req.prompt_tokens + 1
-                         for req in total_with_output_first
-                     ],
-                 ),
-             ),
-             request_totals=StatusBreakdown(
-                 successful=len(successful),
-                 incomplete=len(incomplete),
-                 errored=len(errored),
-                 total=len(total),
-             ),
-             requests=StatusBreakdown(
-                 successful=successful,
-                 incomplete=incomplete,
-                 errored=errored,
-             ),
-         )
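
For reference, the per-request timing metrics computed by the removed GenerativeTextResponseStats model reduce to simple arithmetic over four timestamps and two token counts. A standalone sketch with illustrative values (the sample numbers are made up, not taken from any real run):

    # Mirrors the computed fields of the removed model above.
    start_time, end_time = 100.0, 102.5               # request window, seconds
    first_token_time, last_token_time = 100.4, 102.4  # token timestamps, seconds
    prompt_tokens, output_tokens = 32, 21

    request_latency = end_time - start_time                           # 2.5 s
    time_to_first_token_ms = 1000 * (first_token_time - start_time)   # 400.0 ms
    # Average excludes the first token, hence the (n - 1) divisor.
    inter_token_latency_ms = (
        1000 * (last_token_time - first_token_time) / (output_tokens - 1)
    )  # 100.0 ms
    tokens_per_second = (prompt_tokens + output_tokens) / request_latency  # 21.2/s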