guidellm-0.4.0a18-py3-none-any.whl → guidellm-0.4.0a155-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (116)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +451 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +148 -317
  9. guidellm/benchmark/entrypoints.py +466 -128
  10. guidellm/benchmark/output.py +517 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2085 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +109 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +192 -0
  24. guidellm/data/deserializers/synthetic.py +346 -0
  25. guidellm/data/loaders.py +145 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +412 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +29 -0
  30. guidellm/data/processor.py +30 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +10 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/data/utils/functions.py +18 -0
  35. guidellm/extras/__init__.py +4 -0
  36. guidellm/extras/audio.py +215 -0
  37. guidellm/extras/vision.py +242 -0
  38. guidellm/logger.py +2 -2
  39. guidellm/mock_server/__init__.py +8 -0
  40. guidellm/mock_server/config.py +84 -0
  41. guidellm/mock_server/handlers/__init__.py +17 -0
  42. guidellm/mock_server/handlers/chat_completions.py +280 -0
  43. guidellm/mock_server/handlers/completions.py +280 -0
  44. guidellm/mock_server/handlers/tokenizer.py +142 -0
  45. guidellm/mock_server/models.py +510 -0
  46. guidellm/mock_server/server.py +168 -0
  47. guidellm/mock_server/utils.py +302 -0
  48. guidellm/preprocess/dataset.py +23 -26
  49. guidellm/presentation/builder.py +2 -2
  50. guidellm/presentation/data_models.py +25 -21
  51. guidellm/presentation/injector.py +2 -3
  52. guidellm/scheduler/__init__.py +65 -26
  53. guidellm/scheduler/constraints.py +1035 -0
  54. guidellm/scheduler/environments.py +252 -0
  55. guidellm/scheduler/scheduler.py +140 -368
  56. guidellm/scheduler/schemas.py +272 -0
  57. guidellm/scheduler/strategies.py +519 -0
  58. guidellm/scheduler/worker.py +391 -420
  59. guidellm/scheduler/worker_group.py +707 -0
  60. guidellm/schemas/__init__.py +31 -0
  61. guidellm/schemas/info.py +159 -0
  62. guidellm/schemas/request.py +216 -0
  63. guidellm/schemas/response.py +119 -0
  64. guidellm/schemas/stats.py +228 -0
  65. guidellm/{config.py → settings.py} +32 -21
  66. guidellm/utils/__init__.py +95 -8
  67. guidellm/utils/auto_importer.py +98 -0
  68. guidellm/utils/cli.py +46 -2
  69. guidellm/utils/console.py +183 -0
  70. guidellm/utils/encoding.py +778 -0
  71. guidellm/utils/functions.py +134 -0
  72. guidellm/utils/hf_datasets.py +1 -2
  73. guidellm/utils/hf_transformers.py +4 -4
  74. guidellm/utils/imports.py +9 -0
  75. guidellm/utils/messaging.py +1118 -0
  76. guidellm/utils/mixins.py +115 -0
  77. guidellm/utils/pydantic_utils.py +411 -0
  78. guidellm/utils/random.py +3 -4
  79. guidellm/utils/registry.py +220 -0
  80. guidellm/utils/singleton.py +133 -0
  81. guidellm/{objects → utils}/statistics.py +341 -247
  82. guidellm/utils/synchronous.py +159 -0
  83. guidellm/utils/text.py +163 -50
  84. guidellm/utils/typing.py +41 -0
  85. guidellm/version.py +1 -1
  86. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
  87. guidellm-0.4.0a155.dist-info/RECORD +96 -0
  88. guidellm/backend/__init__.py +0 -23
  89. guidellm/backend/backend.py +0 -259
  90. guidellm/backend/openai.py +0 -705
  91. guidellm/backend/response.py +0 -136
  92. guidellm/benchmark/aggregator.py +0 -760
  93. guidellm/benchmark/benchmark.py +0 -837
  94. guidellm/benchmark/scenario.py +0 -104
  95. guidellm/data/prideandprejudice.txt.gz +0 -0
  96. guidellm/dataset/__init__.py +0 -22
  97. guidellm/dataset/creator.py +0 -213
  98. guidellm/dataset/entrypoints.py +0 -42
  99. guidellm/dataset/file.py +0 -92
  100. guidellm/dataset/hf_datasets.py +0 -62
  101. guidellm/dataset/in_memory.py +0 -132
  102. guidellm/dataset/synthetic.py +0 -287
  103. guidellm/objects/__init__.py +0 -18
  104. guidellm/objects/pydantic.py +0 -89
  105. guidellm/request/__init__.py +0 -18
  106. guidellm/request/loader.py +0 -284
  107. guidellm/request/request.py +0 -79
  108. guidellm/request/types.py +0 -10
  109. guidellm/scheduler/queues.py +0 -25
  110. guidellm/scheduler/result.py +0 -155
  111. guidellm/scheduler/strategy.py +0 -495
  112. guidellm-0.4.0a18.dist-info/RECORD +0 -62
  113. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
  114. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
  115. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
  116. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
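
Taken together, the list above is a package-level restructuring: guidellm/backend becomes guidellm/backends, guidellm/config.py becomes guidellm/settings.py, and guidellm/objects/statistics.py moves to guidellm/utils/statistics.py, while the old dataset, objects, and request packages are removed. As a rough guide to what this means for downstream imports, here is a hedged before/after sketch; the symbol names are taken from the deleted aggregator.py shown below, and whether they are re-exported at exactly these new paths is an assumption, not something this diff confirms:

    # Hedged sketch: import-path updates implied by the renames above.
    # Old layout (0.4.0a18):
    #   from guidellm.config import settings
    #   from guidellm.objects import RunningStats, TimeRunningStats
    # New layout (0.4.0a155), assuming the renamed modules export the same
    # symbols (config.py -> settings.py, objects/statistics.py -> utils/statistics.py):
    from guidellm.settings import settings
    from guidellm.utils.statistics import RunningStats, TimeRunningStats

Code pinned to the old paths would need the commented pair replaced by the live pair.
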
guidellm/benchmark/aggregator.py +0 -760
@@ -1,760 +0,0 @@
- import time
- from abc import ABC, abstractmethod
- from pathlib import Path
- from typing import (
-     Any,
-     Generic,
-     Literal,
-     Optional,
-     TypeVar,
-     Union,
- )
-
- from pydantic import Field
-
- from guidellm.backend import ResponseSummary
- from guidellm.benchmark.benchmark import (
-     BenchmarkArgs,
-     BenchmarkRunStats,
-     BenchmarkT,
-     GenerativeBenchmark,
-     GenerativeTextErrorStats,
-     GenerativeTextResponseStats,
- )
- from guidellm.config import settings
- from guidellm.objects import (
-     RunningStats,
-     StandardBaseModel,
-     StatusBreakdown,
-     TimeRunningStats,
- )
- from guidellm.request import (
-     GenerationRequest,
-     GenerativeRequestLoaderDescription,
-     RequestLoaderDescription,
-     RequestT,
-     ResponseT,
- )
- from guidellm.scheduler import (
-     GenerativeRequestsWorkerDescription,
-     SchedulerRequestResult,
-     WorkerDescription,
- )
- from guidellm.utils import check_load_processor
-
- __all__ = [
-     "AggregatorT",
-     "BenchmarkAggregator",
-     "GenerativeBenchmarkAggregator",
- ]
-
-
- class SchedulerRunningStats(StandardBaseModel):
-     """
-     The metrics for the scheduler stored as running statistics for easy calculations
-     of rates, averages, totals, etc.
-     """
-
-     created_requests: RunningStats = Field(
-         description=(
-             "The running statistics for the number of requests created for this "
-             "benchmark run. This includes all requests created, regardless of "
-             "their status."
-         ),
-         default_factory=RunningStats,
-     )
-     queued_requests: RunningStats = Field(
-         description=(
-             "The running statistics for the number of requests pending in queue "
-             "for this benchmark run. This includes requests that are waiting to "
-             "be scheduled."
-         ),
-         default_factory=RunningStats,
-     )
-     scheduled_requests: RunningStats = Field(
-         description=(
-             "The running statistics for the number of requests scheduled (actively "
-             "running but waiting for the desired start time) for this benchmark run."
-         ),
-         default_factory=RunningStats,
-     )
-     processing_requests: RunningStats = Field(
-         description=(
-             "The running statistics for the number of requests actively being "
-             "processed by the worker for this benchmark run."
-         ),
-         default_factory=RunningStats,
-     )
-     completed_requests: RunningStats = Field(
-         description=(
-             "The running statistics for the number of requests completed for this "
-             "benchmark run. This includes requests within the warmup and cooldown "
-             "period, if any, along with the final results."
-         ),
-         default_factory=RunningStats,
-     )
-
-
- class RequestsRunningStats(StandardBaseModel):
-     """
-     The metrics for requests that have succeeded, been canceled, or errored stored
-     as running statistics for easy calculations of rates, averages, totals, etc.
-     """
-
-     totals: StatusBreakdown[RunningStats, RunningStats, RunningStats, RunningStats] = (
-         Field(
-             description=(
-                 "The running statistics for the total number of requests that "
-                 "completed within the benchmark run."
-             ),
-             default_factory=lambda: StatusBreakdown(
-                 successful=RunningStats(),
-                 errored=RunningStats(),
-                 incomplete=RunningStats(),
-                 total=RunningStats(),
-             ),
-         )
-     )
-     queued_time: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time spent in queue for all requests that "
-             "completed within the benchmark run. This is the time from when the "
-             "request was created to when it was dequeued by the worker."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     scheduled_time_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time spent from when a request was "
-             "dequeued by the worker to when it was actually scheduled by the worker"
-             "for all requests that completed within the benchmark run. "
-             "This should be as close to 0 as possible, any additional time is "
-             "overheads from the system or the worker."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     scheduled_time_sleep: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time for each request spent sleeping til "
-             "the desired start time was reached for all requests that completed within "
-             "the benchmark run. This is the time from when the request was scheduled "
-             "to when the desired start time was reached. "
-         ),
-         default_factory=TimeRunningStats,
-     )
-     worker_start_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time delay between when the request was "
-             "scheduled and when the worker actually started processing subtracting any "
-             "sleep time for all requests that completed within the benchmark run. "
-             "This should be as close to 0 as possible, any additional time is "
-             "overheads from the system or the worker."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     worker_time: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time spent processing all requests that "
-             "completed within the benchmark run. This is the time from when the "
-             "request was started to when it was completed."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     worker_start_time_targeted_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the delay between the targeted start time and "
-             "the actual start time for requests that completed within the benchmark "
-             "run. This represents delays from the best case desired start time. "
-             "For async strategies, this represents delays from the ideal system. "
-             "For sync strategies, since those are doubled in queue, this should be "
-             "as close to the time for a request to be processed as possible."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     request_start_time_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the delay between the actual request being "
-             "made and the time the worker started on the request for all requests "
-             "that completed within the benchmark run. This time should be as close to "
-             "0 as possible, any additional time is overhead from the system or "
-             "the worker."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     request_start_time_targeted_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the delay between the targeted start time and "
-             "the actual start time for all requests that completed within the "
-             "benchmark run. This represents delays from the best case desired start "
-             "time. For async strategies, this represents delays from the ideal system. "
-             "For sync strategies, since those are duplicated in queue, this should be "
-             "as close to the time for a request to be processed."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     request_time_delay: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the delay in time between the total request "
-             "time and the worker time. This should be as close to 0 as possible, any "
-             "additional time is overhead from the system or the worker. "
-         ),
-         default_factory=TimeRunningStats,
-     )
-     request_time: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time spent processing all requests that "
-             "completed within the benchmark run. This is the time from when the "
-             "request was created to when it was completed."
-         ),
-         default_factory=TimeRunningStats,
-     )
-
-
- class BenchmarkAggregator(
-     ABC, StandardBaseModel, Generic[BenchmarkT, RequestT, ResponseT]
- ):
-     """
-     A pydantic base class representing the base class for aggregating benchmark results.
-     The purpose is to receive and process results from a Benchmarker as it iterates
-     through a Scheduler for an individual benchmark run.
-     As results are added, lightweight statistics are updated and stored for immediate
-     progress and informational updates to the caller.
-     Once the benchmark run is complete, the `compile` method is called to finalize
-     the benchmark and return a Benchmark object with all the results and statistics
-     fully calculated.
-     """
-
-     type_: Literal["benchmark_aggregator"] = "benchmark_aggregator"
-     run_id: str = Field(
-         description=(
-             "The unique identifier for the encompasing benchmark run that this "
-             "benchmark was a part of."
-         )
-     )
-     args: BenchmarkArgs = Field(
-         description=(
-             "The arguments used to create the benchmark run that this benchmark was "
-             "a part of."
-         )
-     )
-     worker_description: Union[
-         GenerativeRequestsWorkerDescription, WorkerDescription
-     ] = Field(
-         description=(
-             "The description and specifics for the worker used to resolve requests "
-             "for this benchmark."
-         ),
-         discriminator="type_",
-     )
-     request_loader_description: Union[
-         GenerativeRequestLoaderDescription, RequestLoaderDescription
-     ] = Field(
-         description=(
-             "The description and specifics for the request loader used to create "
-             "requests for this benchmark."
-         ),
-         discriminator="type_",
-     )
-     extras: dict[str, Any] = Field(
-         description=(
-             "Any additional information or metadata that was passed for this benchmark."
-         )
-     )
-     in_warmup: bool = Field(
-         description=(
-             "A flag to indicate if the benchmark is currently in the warmup phase."
-         ),
-         default=False,
-         exclude=True,
-     )
-     in_cooldown: bool = Field(
-         description=(
-             "A flag to indicate if the benchmark is currently in the cooldown phase."
-         ),
-         default=False,
-         exclude=True,
-     )
-     scheduler_stats: SchedulerRunningStats = Field(
-         description=(
-             "The running statistics for the scheduler for this benchmark run. "
-             "This includes all requests created, regardless of their status."
-         ),
-         default_factory=SchedulerRunningStats,
-     )
-     requests_stats: RequestsRunningStats = Field(
-         description=(
-             "The running statistics for the requests for this benchmark run. "
-             "This includes all requests created, regardless of their status."
-         ),
-         default_factory=RequestsRunningStats,
-     )
-     results: StatusBreakdown[
-         list[SchedulerRequestResult[RequestT, ResponseT]],
-         list[SchedulerRequestResult[RequestT, ResponseT]],
-         list[SchedulerRequestResult[RequestT, ResponseT]],
-         None,
-     ] = Field(
-         description=(
-             "The completed requests for this benchmark run broken down by status"
-             "and excluding warmup and cooldown requests."
-         ),
-         default_factory=lambda: StatusBreakdown(  # type: ignore[arg-type]
-             successful=[],
-             errored=[],
-             incomplete=[],
-             total=None,
-         ),
-     )
-
-     def add_result(
-         self,
-         result: SchedulerRequestResult[RequestT, ResponseT],
-     ) -> bool:
-         """
-         Add a result to the aggregator. This will update the internal statistics
-         and add the result to the list of results if it is not within the warmup or
-         cooldown period.
-
-         :param result: The result to add to the aggregator.
-         :return: True if the result was added, False if it was added because it
-             did not fit within the warmup or cooldown period, was not requested,
-             or is not finished
-         """
-         # Add scheduler statistics
-         self.scheduler_stats.created_requests += max(
-             0, result.run_info.created_requests
-         )
-         self.scheduler_stats.queued_requests += max(0, result.run_info.queued_requests)
-         self.scheduler_stats.scheduled_requests += max(
-             0, result.run_info.scheduled_requests
-         )
-         self.scheduler_stats.processing_requests += max(
-             0, result.run_info.processing_requests
-         )
-         self.scheduler_stats.completed_requests += max(
-             0, result.run_info.completed_requests
-         )
-
-         if result.type_ != "request_complete" or (
-             result.request_info.canceled and not result.request_info.requested
-         ):
-             # If the result is not completed yet, don't add to the results
-             # If the result was canceled and not started, ignore it
-             return False
-
-         # Add request statistics
-         self.requests_stats.totals.total += 1
-         if result.request_info.canceled:
-             self.requests_stats.totals.incomplete += 1
-         elif result.request_info.errored:
-             self.requests_stats.totals.errored += 1
-         elif result.request_info.completed:
-             self.requests_stats.totals.successful += 1
-         else:
-             raise ValueError(
-                 "Unexpected state: request_info must be either "
-                 "completed, canceled, or errored. "
-                 f"Got {result.request_info}"
-             )
-
-         self.requests_stats.queued_time.update(
-             result.request_info.dequeued_time - result.request_info.queued_time
-         )
-         self.requests_stats.scheduled_time_delay.update(
-             result.request_info.scheduled_time - result.request_info.dequeued_time
-         )
-         sleep_time = max(
-             0.0,
-             result.request_info.targeted_start_time
-             - result.request_info.scheduled_time,
-         )
-         self.requests_stats.scheduled_time_sleep.update(sleep_time)
-         time_to_worker_start = (
-             result.request_info.worker_start - result.request_info.scheduled_time
-         )
-         self.requests_stats.worker_start_delay.update(time_to_worker_start - sleep_time)
-         self.requests_stats.worker_time.update(
-             result.request_info.worker_end - result.request_info.worker_start
-         )
-         self.requests_stats.worker_start_time_targeted_delay.update(
-             result.request_info.worker_start - result.request_info.targeted_start_time
-         )
-         self.requests_stats.request_start_time_delay.update(
-             result.request_info.worker_start - result.request_info.targeted_start_time
-         )
-         self.requests_stats.request_start_time_targeted_delay.update(
-             result.request_info.worker_start - result.request_info.targeted_start_time
-         )
-         self.requests_stats.request_time_delay.update(
-             (result.request_info.worker_end - result.request_info.worker_start)
-             - (result.request_info.worker_end - result.request_info.worker_start)
-         )
-         self.requests_stats.request_time.update(
-             result.request_info.worker_end - result.request_info.worker_start
-         )
-
-         # Add result to the list of results provided we are not in warmup or cooldown
-         total_completed = self.requests_stats.totals.total.total
-         global_start_time = self.requests_stats.totals.total.start_time
-
-         in_warmup_number = (
-             self.args.warmup_number and total_completed <= self.args.warmup_number
-         )
-         in_warmup_duration = (
-             self.args.warmup_duration
-             and result.request_info.worker_start
-             <= (global_start_time + self.args.warmup_duration)
-         )
-
-         if in_warmup_number or in_warmup_duration:
-             self.in_warmup = True
-             return True
-
-         self.in_warmup = False
-         in_cooldown_number = (
-             self.args.cooldown_number
-             and self.args.max_number
-             and total_completed > self.args.max_number - self.args.cooldown_number
-         )
-         in_cooldown_duration = (
-             self.args.cooldown_duration
-             and self.args.max_duration
-             and result.request_info.worker_start
-             > global_start_time + self.args.max_duration - self.args.cooldown_duration
-         )
-
-         if in_cooldown_number or in_cooldown_duration:
-             self.in_cooldown = True
-             return True
-
-         self.in_cooldown = False
-
-         if result.request_info.canceled:
-             self.results.incomplete.append(result)
-         elif result.request_info.errored:
-             self.results.errored.append(result)
-         elif result.request_info.completed:
-             self.results.successful.append(result)
-         else:
-             raise ValueError(
-                 "Unexpected state: request_info must be either "
-                 "completed, canceled, or errored. "
-                 f"Got {result.request_info}"
-             )
-
-         return True
-
-     @abstractmethod
-     def compile(self) -> BenchmarkT:
-         """
-         Compile the benchmark results and statistics into a Benchmark object.
-         This is required to be implemented by subclasses to finalize the benchmark
-         and return the compiled object.
-         """
-         ...
-
-
- AggregatorT = TypeVar("AggregatorT", bound=BenchmarkAggregator)
-
-
- class GenerativeRequestsRunningStats(RequestsRunningStats):
-     """
-     The metrics for generative requests that have succeeded, been canceled, or errored
-     stored as running statistics for easy calculations of rates, averages, totals, etc.
-     """
-
-     time_to_first_token: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time from the start of the request to the "
-             "first token being generated for all requests that completed within the "
-             "benchmark run."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     inter_token_latency: TimeRunningStats = Field(
-         description=(
-             "The running statistics for the time between each token being generated "
-             "for all requests that completed within the benchmark run."
-         ),
-         default_factory=TimeRunningStats,
-     )
-     prompt_tokens: RunningStats = Field(
-         description=(
-             "The running statistics for the token count for the prompt for all "
-             "requests that completed, if available in the response."
-         ),
-         default_factory=RunningStats,
-     )
-     output_tokens: RunningStats = Field(
-         description=(
-             "The running statistics for the token count for the output for all "
-             "requests that completed, if available in the response."
-         ),
-         default_factory=RunningStats,
-     )
-     total_tokens: RunningStats = Field(
-         description=(
-             "The running statistics for the total token count for all requests that "
-             "completed, if available in the response."
-         ),
-         default_factory=RunningStats,
-     )
-
-
- class GenerativeBenchmarkAggregator(
-     BenchmarkAggregator[GenerativeBenchmark, GenerationRequest, ResponseSummary]
- ):
-     type_: Literal["generative_benchmark_aggregator"] = (
-         "generative_benchmark_aggregator"  # type: ignore[assignment]
-     )
-     processor: Optional[Union[str, Path, Any]] = Field(
-         description=(
-             "The tokenizer to use for calculating token counts when none are "
-             "avaiable that match the preferred source."
-         )
-     )
-     processor_args: Optional[dict[str, Any]] = Field(
-         description=(
-             "Additional arguments to pass to the tokenizer if it requires "
-             "any specific configuration for loading or processing."
-         ),
-     )
-     worker_description: GenerativeRequestsWorkerDescription = Field(
-         description=(
-             "The description and specifics for the worker used to resolve requests "
-             "for this benchmark."
-         ),
-         discriminator="type_",
-     )
-     request_loader_description: GenerativeRequestLoaderDescription = Field(
-         description=(
-             "The description and specifics for the request loader used to create "
-             "requests for this benchmark."
-         ),
-         discriminator="type_",
-     )
-     requests_stats: GenerativeRequestsRunningStats = Field(
-         description=(
-             "The running statistics for the requests for this benchmark run. "
-             "This includes all requests created, regardless of their status."
-         ),
-         default_factory=GenerativeRequestsRunningStats,
-     )
-
-     def add_result(
-         self, result: SchedulerRequestResult[GenerationRequest, ResponseSummary]
-     ) -> bool:
-         """
-         Add a result to the aggregator. This will update the internal statistics
-         and add the result to the list of results if it is not within the warmup or
-         cooldown period.
-
-         :param result: The result to add to the aggregator.
-         """
-         if not super().add_result(result):
-             return False
-
-         if result.request is None:
-             raise ValueError("Request is None, cannot add result.")
-
-         if result.response is None:
-             raise ValueError("Response is None, cannot add result.")
-
-         self.requests_stats.request_start_time_delay.update(
-             result.response.start_time - result.request_info.worker_start
-         )
-         self.requests_stats.request_start_time_targeted_delay.update(
-             result.response.start_time - result.request_info.targeted_start_time
-         )
-         self.requests_stats.request_time_delay.update(
-             (result.response.start_time - result.request_info.worker_start)
-             + result.request_info.worker_end
-             - result.response.end_time
-         )
-         self.requests_stats.request_time.update(
-             result.response.end_time - result.response.start_time
-         )
-         if result.response.first_iter_time:
-             self.requests_stats.time_to_first_token.update(
-                 result.response.first_iter_time - result.response.start_time
-             )
-         if result.response.last_iter_time and result.response.first_iter_time:
-             self.requests_stats.inter_token_latency.update(
-                 result.response.last_iter_time - result.response.first_iter_time,
-                 count=(result.response.output_tokens or 1) - 1,
-             )
-         self.requests_stats.prompt_tokens += result.response.request_prompt_tokens or 0
-         self.requests_stats.output_tokens += result.response.request_output_tokens or 0
-         total_tokens = (result.response.request_prompt_tokens or 0) + (
-             result.response.request_output_tokens or 0
-         )
-         self.requests_stats.total_tokens += total_tokens
-
-         return True
-
-     def compile(self) -> GenerativeBenchmark:
-         """
-         Compile the benchmark results and statistics into a GenerativeBenchmark object.
-         This is required to be implemented by subclasses to finalize the benchmark
-         and return the compiled object.
-         """
-         successful, incomplete, errored = self._compile_results()
-
-         return GenerativeBenchmark.from_stats(
-             run_id=self.run_id,
-             successful=successful,
-             incomplete=incomplete,
-             errored=errored,
-             args=self.args,
-             run_stats=BenchmarkRunStats(
-                 start_time=self.requests_stats.totals.total.start_time,
-                 end_time=time.time(),
-                 requests_made=StatusBreakdown(
-                     successful=int(self.requests_stats.totals.successful.total),
-                     errored=int(self.requests_stats.totals.errored.total),
-                     incomplete=int(self.requests_stats.totals.incomplete.total),
-                     total=int(self.requests_stats.totals.total.total),
-                 ),
-                 queued_time_avg=self.requests_stats.queued_time.mean,
-                 scheduled_time_delay_avg=self.requests_stats.scheduled_time_delay.mean,
-                 scheduled_time_sleep_avg=self.requests_stats.scheduled_time_sleep.mean,
-                 worker_start_delay_avg=self.requests_stats.worker_start_delay.mean,
-                 worker_time_avg=self.requests_stats.worker_time.mean,
-                 worker_start_time_targeted_delay_avg=self.requests_stats.worker_start_time_targeted_delay.mean,
-                 request_start_time_delay_avg=self.requests_stats.request_start_time_delay.mean,
-                 request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
-                 request_time_delay_avg=self.requests_stats.request_time_delay.mean,
-                 request_time_avg=self.requests_stats.request_time.mean,
-             ),
-             worker=self.worker_description,
-             requests_loader=self.request_loader_description,
-             extras=self.extras,
-         )
-
-     def _compile_results(
-         self,
-     ) -> tuple[
-         list[GenerativeTextResponseStats],
-         list[GenerativeTextErrorStats],
-         list[GenerativeTextErrorStats],
-     ]:
-         successful: list[GenerativeTextResponseStats] = [
-             GenerativeTextResponseStats(
-                 request_id=result.request.request_id,
-                 request_type=result.request.request_type,
-                 scheduler_info=result.request_info,
-                 prompt=str(result.request.content),
-                 prompt_tokens=self._compile_tokens_count(
-                     value=str(result.request.content),
-                     requests_tokens=result.response.request_prompt_tokens,
-                     response_tokens=result.response.response_prompt_tokens,
-                     preferred_tokens_source=settings.preferred_prompt_tokens_source,
-                     errored=False,
-                 ),
-                 output=result.response.value,
-                 output_tokens=self._compile_tokens_count(
-                     value=result.response.value,
-                     requests_tokens=result.response.request_output_tokens,
-                     response_tokens=result.response.response_output_tokens,
-                     preferred_tokens_source=settings.preferred_output_tokens_source,
-                     errored=False,
-                 ),
-                 start_time=result.response.start_time,
-                 end_time=result.response.end_time,
-                 first_token_time=result.response.first_iter_time or -1.0,
-                 last_token_time=result.response.last_iter_time or -1.0,
-             )
-             for result in self.results.successful
-             if result.request and result.response
-         ]
-         incomplete: list[GenerativeTextErrorStats] = [
-             GenerativeTextErrorStats(
-                 error=result.response.error or "",
-                 request_id=result.request.request_id,
-                 request_type=result.request.request_type,
-                 scheduler_info=result.request_info,
-                 prompt=str(result.request.content),
-                 prompt_tokens=self._compile_tokens_count(
-                     value=str(result.request.content),
-                     requests_tokens=result.response.request_prompt_tokens,
-                     response_tokens=result.response.response_prompt_tokens,
-                     preferred_tokens_source=settings.preferred_prompt_tokens_source,
-                     errored=True,
-                 ),
-                 output=result.response.value,
-                 output_tokens=self._compile_tokens_count(
-                     value=result.response.value,
-                     requests_tokens=result.response.request_output_tokens,
-                     response_tokens=result.response.response_output_tokens,
-                     preferred_tokens_source=settings.preferred_output_tokens_source,
-                     errored=True,
-                 ),
-                 start_time=result.response.start_time,
-                 end_time=result.response.end_time,
-                 first_token_time=result.response.first_iter_time,
-                 last_token_time=result.response.last_iter_time,
-             )
-             for result in self.results.incomplete
-             if result.request and result.response
-         ]
-         error: list[GenerativeTextErrorStats] = [
-             GenerativeTextErrorStats(
-                 error=result.response.error or "",
-                 request_id=result.request.request_id,
-                 request_type=result.request.request_type,
-                 scheduler_info=result.request_info,
-                 prompt=str(result.request.content),
-                 prompt_tokens=self._compile_tokens_count(
-                     value=str(result.request.content),
-                     requests_tokens=result.response.request_prompt_tokens,
-                     response_tokens=result.response.response_prompt_tokens,
-                     preferred_tokens_source=settings.preferred_prompt_tokens_source,
-                     errored=True,
-                 ),
-                 output=result.response.value,
-                 output_tokens=self._compile_tokens_count(
-                     value=result.response.value,
-                     requests_tokens=result.response.request_output_tokens,
-                     response_tokens=result.response.response_output_tokens,
-                     preferred_tokens_source=settings.preferred_output_tokens_source,
-                     errored=True,
-                 ),
-                 start_time=result.response.start_time,
-                 end_time=result.response.end_time,
-                 first_token_time=result.response.first_iter_time,
-                 last_token_time=result.response.last_iter_time,
-             )
-             for result in self.results.errored
-             if result.request and result.response
-         ]
-
-         return successful, incomplete, error
-
-     def _compile_tokens_count(
-         self,
-         value: str,
-         requests_tokens: Optional[int],
-         response_tokens: Optional[int],
-         preferred_tokens_source: Optional[Literal["request", "response", "local"]],
-         errored: bool,
-     ) -> int:
-         if not errored and preferred_tokens_source == "response" and response_tokens:
-             return response_tokens or 0
-
-         if not errored and preferred_tokens_source == "request" and requests_tokens:
-             return requests_tokens or 0
-
-         if preferred_tokens_source in {"response", "request"} and (
-             self.processor is None or errored or response_tokens or requests_tokens
-         ):
-             # we had a preferred tokens source that isn't local and we either
-             # have the data to return something or we don't have the ability
-             # to calculate locally
-             return response_tokens or requests_tokens or 0
-
-         self.processor = check_load_processor(
-             self.processor,
-             processor_args=self.processor_args,
-             error_msg="Processor/Tokenizer is required for calculating token counts.",
-         )
-         return len(self.processor.tokenize(value))
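
For context on what the removed aggregator.py did: every SchedulerRequestResult was folded into running accumulators (counts, totals, means) rather than stored as raw samples, and a result was kept out of the final successful/incomplete/errored lists while the run was inside its warmup or cooldown window. The sketch below illustrates that pattern in isolation; it is a hypothetical reimplementation, not guidellm's actual RunningStats API:

    import time
    from dataclasses import dataclass, field

    @dataclass
    class MiniRunningStats:
        # Streaming accumulator in the spirit of the RunningStats fields above:
        # tracks count/total so mean and rate are O(1), with no raw samples kept.
        start_time: float = field(default_factory=time.time)
        count: int = 0
        total: float = 0.0

        def update(self, value: float, count: int = 1) -> None:
            self.count += count
            self.total += value

        @property
        def mean(self) -> float:
            return self.total / self.count if self.count else 0.0

        @property
        def rate(self) -> float:
            # Events per second since the accumulator was created.
            elapsed = time.time() - self.start_time
            return self.total / elapsed if elapsed > 0 else 0.0

    def in_warmup(total_completed: int, worker_start: float, run_start: float,
                  warmup_number: int | None, warmup_duration: float | None) -> bool:
        # Mirrors the warmup gate in BenchmarkAggregator.add_result: the result
        # still updates the running stats, but is excluded from the final lists.
        if warmup_number is not None and total_completed <= warmup_number:
            return True
        return (warmup_duration is not None
                and worker_start <= run_start + warmup_duration)

The cooldown gate in the original is symmetric: a result is withheld once total_completed exceeds max_number - cooldown_number, or once its worker_start passes run_start + max_duration - cooldown_duration.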