guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of guidellm has been flagged as potentially problematic; see the registry's advisory page for details.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a21.dist-info/RECORD +0 -62
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABCMeta
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from guidellm.data.preprocessors.preprocessor import (
|
|
7
|
+
DatasetPreprocessor,
|
|
8
|
+
PreprocessorRegistry,
|
|
9
|
+
)
|
|
10
|
+
from guidellm.data.schemas import GenerativeDatasetColumnType
|
|
11
|
+
from guidellm.data.utils import text_stats
|
|
12
|
+
from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"GenerativeAudioTranscriptionRequestFormatter",
|
|
16
|
+
"GenerativeAudioTranslationRequestFormatter",
|
|
17
|
+
"GenerativeChatCompletionsRequestFormatter",
|
|
18
|
+
"GenerativeTextCompletionsRequestFormatter",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta):
    """Base class for request formatters, exposing lazy media encoders.

    Each ``encode_*`` helper defers importing the optional
    ``guidellm.extras`` modules until first use, so the heavy audio/vision
    dependencies are only loaded when a formatter actually needs them.
    """

    @staticmethod
    def encode_audio(*args, **kwargs):
        # Deferred import: optional audio dependencies load on demand.
        from guidellm.extras.audio import encode_audio as _encode_audio

        return _encode_audio(*args, **kwargs)

    @staticmethod
    def encode_image(*args, **kwargs):
        # Deferred import: optional vision dependencies load on demand.
        from guidellm.extras.vision import encode_image as _encode_image

        return _encode_image(*args, **kwargs)

    @staticmethod
    def encode_video(*args, **kwargs):
        # Deferred import: optional vision dependencies load on demand.
        from guidellm.extras.vision import encode_video as _encode_video

        return _encode_video(*args, **kwargs)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@PreprocessorRegistry.register("text_completions")
class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
    """Formats mapped dataset columns into a text-completions request.

    Concatenates prefix and text columns into a single prompt, applies
    token-count columns as generation limits, and records usage metrics
    for the resulting request.
    """

    def __init__(
        self,
        model: str,
        extras: dict[str, Any] | GenerationRequestArguments | None = None,
        stream: bool = True,
        max_tokens: int | None = None,
        max_completion_tokens: int | None = None,
    ):
        """
        :param model: Model name placed into the request body.
        :param extras: Extra request arguments merged into every request; a
            plain dict is normalized to ``GenerationRequestArguments``.
        :param stream: Whether to request streamed responses.
        :param max_tokens: Default completion-token cap; takes precedence over
            the ``max_completion_tokens`` alias.
        """
        self.model: str | None = model
        # Normalize plain dicts; pass pre-built instances (or None) through.
        if extras and isinstance(extras, dict):
            self.extras = GenerationRequestArguments(**extras)
        else:
            self.extras = extras
        self.stream: bool = stream
        self.max_tokens: int | None = max_tokens or max_completion_tokens

    def __call__(
        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
    ) -> GenerationRequest:
        """Build a text-completions ``GenerationRequest`` from mapped columns."""
        arguments: GenerationRequestArguments = GenerationRequestArguments(body={})
        input_metrics = UsageMetrics()
        output_metrics = UsageMetrics()

        # Model name
        if self.model is not None:
            arguments.body["model"] = self.model

        # Streaming flags
        if self.stream:
            arguments.stream = True
            arguments.body["stream"] = True

        # Requested output-token counts (None/zero entries filtered out)
        # force an exact generation length when present.
        requested_output = sum(
            n for n in columns.get("output_tokens_count_column", []) if n
        )
        if requested_output:
            output_metrics.text_tokens = requested_output
            arguments.body["max_tokens"] = requested_output
            arguments.body["stop"] = None
            arguments.body["ignore_eos"] = True
        elif self.max_tokens is not None:
            arguments.body["max_tokens"] = self.max_tokens

        # Requested prompt-token counts are tracked for metrics only.
        requested_prompt = sum(
            n for n in columns.get("prompt_tokens_count_column", []) if n
        )
        if requested_prompt:
            input_metrics.text_tokens = requested_prompt

        # Merge any user-supplied extras into the arguments.
        if self.extras:
            arguments.model_combine(self.extras)

        # Concatenate prefix and text fragments into a single prompt string.
        prefix_part = "".join(p for p in columns.get("prefix_column", []) if p)
        text_part = "".join(t for t in columns.get("text_column", []) if t)
        if prefix_part or text_part:
            prompt = prefix_part + text_part
            arguments.body["prompt"] = prompt
            stats = text_stats(prompt)
            input_metrics.text_characters = stats.get("num_chars")
            input_metrics.text_words = stats.get("num_words")

        return GenerationRequest(
            request_type="text_completions",
            arguments=arguments,
            input_metrics=input_metrics,
            output_metrics=output_metrics,
        )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@PreprocessorRegistry.register("chat_completions")
class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
    """Formats mapped dataset columns into a chat-completions request.

    Prefix columns become system messages; text, image, video, and audio
    columns each become user messages with the matching content part.
    Media columns are encoded via the lazy ``RequestFormatter`` helpers,
    and usage metrics are accumulated per modality while the body is built.
    """

    def __init__(
        self,
        model: str,
        extras: dict[str, Any] | GenerationRequestArguments | None = None,
        stream: bool = True,
        max_tokens: int | None = None,
        max_completion_tokens: int | None = None,
        encode_kwargs: dict[str, Any] | None = None,
    ):
        """
        :param model: Model name placed into the request body.
        :param extras: Extra request arguments merged into every request; a
            plain dict is normalized to ``GenerationRequestArguments``.
        :param stream: Whether to request streamed responses (with usage).
        :param max_tokens: Default completion-token cap; takes precedence over
            the ``max_completion_tokens`` alias.
        :param encode_kwargs: Optional per-modality ("image"/"video"/"audio")
            keyword arguments forwarded to the media encoders.
        """
        self.model = model
        # Normalize plain dicts; pass pre-built instances (or None) through.
        self.extras = (
            GenerationRequestArguments(**extras)
            if extras and isinstance(extras, dict)
            else extras
        )
        self.stream = stream
        self.max_completion_tokens = max_tokens or max_completion_tokens
        self.encode_image_kwargs = (
            encode_kwargs.get("image", {}) if encode_kwargs else {}
        )
        self.encode_video_kwargs = (
            encode_kwargs.get("video", {}) if encode_kwargs else {}
        )
        self.encode_audio_kwargs = (
            encode_kwargs.get("audio", {}) if encode_kwargs else {}
        )

    def __call__(  # noqa: C901, PLR0912, PLR0915
        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
    ) -> GenerationRequest:
        """Build a chat-completions ``GenerationRequest`` from mapped columns."""
        arguments = GenerationRequestArguments(body={})
        input_metrics = UsageMetrics()
        output_metrics = UsageMetrics()

        # Add model
        if self.model is not None:
            arguments.body["model"] = self.model

        # Configure streaming; include_usage asks the server to report token
        # usage in the final streamed chunk.
        if self.stream:
            arguments.stream = True
            arguments.body.update(
                {"stream": True, "stream_options": {"include_usage": True}}
            )

        # Handle output tokens: a per-row requested count (None/zero entries
        # filtered out) forces an exact generation length.
        if output_tokens := sum(
            count for count in columns.get("output_tokens_count_column", []) if count
        ):
            output_metrics.text_tokens = output_tokens
            arguments.body.update(
                {
                    "max_completion_tokens": output_tokens,
                    "stop": None,
                    "ignore_eos": True,
                }
            )
        elif self.max_completion_tokens is not None:
            arguments.body["max_completion_tokens"] = self.max_completion_tokens

        # Handle prompt tokens (metrics only; not sent to the server)
        if prompt_tokens := sum(
            count for count in columns.get("prompt_tokens_count_column", []) if count
        ):
            input_metrics.text_tokens = prompt_tokens

        # Apply extra arguments
        if self.extras:
            arguments.model_combine(self.extras)

        # Build messages
        arguments.body["messages"] = []

        # Prefix columns become system messages; text stats are accumulated
        # across all fragments.
        for prefix in columns.get("prefix_column", []):
            if not prefix:
                continue

            stats = text_stats(prefix)
            if (num_chars := stats.get("num_chars")) is not None:
                input_metrics.text_characters = (
                    input_metrics.text_characters or 0
                ) + num_chars
            if (num_words := stats.get("num_words")) is not None:
                input_metrics.text_words = (input_metrics.text_words or 0) + num_words

            arguments.body["messages"].append({"role": "system", "content": prefix})

        # Text columns become user messages with a text content part.
        for text in columns.get("text_column", []):
            if not text:
                continue

            stats = text_stats(text)
            if (num_chars := stats.get("num_chars")) is not None:
                input_metrics.text_characters = (
                    input_metrics.text_characters or 0
                ) + num_chars
            if (num_words := stats.get("num_words")) is not None:
                input_metrics.text_words = (input_metrics.text_words or 0) + num_words

            arguments.body["messages"].append(
                {"role": "user", "content": [{"type": "text", "text": text}]}
            )

        # Image columns are encoded and attached as image_url content parts.
        for image in columns.get("image_column", []):
            if not image:
                continue

            image_dict = self.encode_image(image, **self.encode_image_kwargs)
            if (image_pixels := image_dict.get("image_pixels")) is not None:
                input_metrics.image_pixels = (
                    input_metrics.image_pixels or 0
                ) + image_pixels
            if (image_bytes := image_dict.get("image_bytes")) is not None:
                input_metrics.image_bytes = (
                    input_metrics.image_bytes or 0
                ) + image_bytes

            arguments.body["messages"].append(
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": image_dict.get("image")}
                    ],
                }
            )

        # Video columns are encoded and attached as video_url content parts.
        for video in columns.get("video_column", []):
            if not video:
                continue

            video_dict = self.encode_video(video, **self.encode_video_kwargs)
            if (video_frames := video_dict.get("video_frames")) is not None:
                input_metrics.video_frames = (
                    input_metrics.video_frames or 0
                ) + video_frames
            if (video_seconds := video_dict.get("video_seconds")) is not None:
                input_metrics.video_seconds = (
                    input_metrics.video_seconds or 0.0
                ) + video_seconds
            if (video_bytes := video_dict.get("video_bytes")) is not None:
                input_metrics.video_bytes = (
                    input_metrics.video_bytes or 0
                ) + video_bytes

            arguments.body["messages"].append(
                {
                    "role": "user",
                    "content": [
                        {"type": "video_url", "video_url": video_dict.get("video")}
                    ],
                }
            )

        # Audio columns are base64-encoded and attached as input_audio parts.
        for audio in columns.get("audio_column", []):
            if not audio:
                continue

            audio_dict = self.encode_audio(
                audio, b64encode=True, **self.encode_audio_kwargs
            )
            if (audio_samples := audio_dict.get("audio_samples")) is not None:
                input_metrics.audio_samples = (
                    input_metrics.audio_samples or 0
                ) + audio_samples
            if (audio_seconds := audio_dict.get("audio_seconds")) is not None:
                input_metrics.audio_seconds = (
                    input_metrics.audio_seconds or 0.0
                ) + audio_seconds
            if (audio_bytes := audio_dict.get("audio_bytes")) is not None:
                input_metrics.audio_bytes = (
                    input_metrics.audio_bytes or 0
                ) + audio_bytes

            arguments.body["messages"].append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_dict.get("audio"),
                                "format": audio_dict.get("format"),
                            },
                        }
                    ],
                }
            )

        return GenerationRequest(
            request_type="chat_completions",
            arguments=arguments,
            input_metrics=input_metrics,
            output_metrics=output_metrics,
        )
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
@PreprocessorRegistry.register("audio_transcriptions")
class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
    """Formats mapped dataset columns into an audio-transcription request.

    Requires exactly one audio column entry, which is encoded (not base64)
    and attached as a multipart file upload; any prefix/text columns are
    concatenated into an optional ``prompt`` body field.
    """

    def __init__(
        self,
        model: str,
        extras: dict[str, Any] | GenerationRequestArguments | None = None,
        stream: bool = True,
        encode_kwargs: dict[str, Any] | None = None,
    ):
        """
        :param model: Model name placed into the request body.
        :param extras: Extra request arguments merged into every request; a
            plain dict is normalized to ``GenerationRequestArguments``.
        :param stream: Whether to request streamed responses.
        :param encode_kwargs: Keyword arguments forwarded to the audio encoder.
        """
        self.model = model
        # Normalize plain dicts; pass pre-built instances (or None) through.
        self.extras = (
            GenerationRequestArguments(**extras)
            if extras and isinstance(extras, dict)
            else extras
        )
        self.stream = stream
        self.encode_audio_kwargs = encode_kwargs or {}

    def __call__(  # noqa: C901
        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
    ) -> GenerationRequest:
        """Build an audio-transcription ``GenerationRequest`` from columns.

        :raises ValueError: If the mapped row does not contain exactly one
            audio column entry.
        """
        arguments = GenerationRequestArguments(body={}, files={})
        input_metrics = UsageMetrics()
        output_metrics = UsageMetrics()

        # Add model
        if self.model is not None:
            arguments.body["model"] = self.model

        # Configure streaming
        if self.stream:
            arguments.stream = True
            arguments.body["stream"] = True

        # Handle output tokens (metrics only; None/zero entries filtered out)
        if output_tokens := sum(
            count for count in columns.get("output_tokens_count_column", []) if count
        ):
            output_metrics.text_tokens = output_tokens

        # Handle prompt tokens (for audio duration tracking)
        if prompt_tokens := sum(
            count for count in columns.get("prompt_tokens_count_column", []) if count
        ):
            input_metrics.text_tokens = prompt_tokens

        # Apply extra arguments
        if self.extras:
            arguments.model_combine(self.extras)

        # Build audio input — transcription accepts a single audio file.
        audio_columns = columns.get("audio_column", [])
        if len(audio_columns) != 1:
            raise ValueError(
                f"GenerativeAudioTranscriptionRequestFormatter expects exactly "
                f"one audio column, but got {len(audio_columns)}."
            )

        # Raw (non-base64) bytes for multipart upload.
        audio_dict = self.encode_audio(
            audio_columns[0], b64encode=False, **self.encode_audio_kwargs
        )
        input_metrics.audio_samples = audio_dict.get("audio_samples")
        input_metrics.audio_seconds = audio_dict.get("audio_seconds")
        input_metrics.audio_bytes = audio_dict.get("audio_bytes")

        # Multipart tuple: (filename, payload, mimetype).
        arguments.files = {
            "file": (
                audio_dict.get("file_name", "audio_input"),
                audio_dict.get("audio"),
                audio_dict.get("mimetype"),
            )
        }

        # Build prompt
        prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
        text = "".join(txt for txt in columns.get("text_column", []) if txt)
        if prefix or text:
            arguments.body["prompt"] = prefix + text
            stats = text_stats(arguments.body["prompt"])
            input_metrics.text_characters = stats.get("num_chars")
            input_metrics.text_words = stats.get("num_words")

        return GenerationRequest(
            request_type="audio_transcriptions",
            arguments=arguments,
            input_metrics=input_metrics,
            output_metrics=output_metrics,
        )
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
@PreprocessorRegistry.register("audio_translations")
class GenerativeAudioTranslationRequestFormatter(
    GenerativeAudioTranscriptionRequestFormatter
):
    """Identical to transcription formatting, re-tagged as a translation."""

    def __call__(
        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
    ) -> GenerationRequest:
        # Reuse the transcription pipeline wholesale; only the request type
        # label differs.
        request = super().__call__(columns)
        request.request_type = "audio_translations"
        return request
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Any, ClassVar, cast
|
|
5
|
+
|
|
6
|
+
from datasets import Dataset, IterableDataset
|
|
7
|
+
|
|
8
|
+
from guidellm.data.preprocessors.preprocessor import (
|
|
9
|
+
DataDependentPreprocessor,
|
|
10
|
+
PreprocessorRegistry,
|
|
11
|
+
)
|
|
12
|
+
from guidellm.data.schemas import GenerativeDatasetColumnType
|
|
13
|
+
|
|
14
|
+
__all__ = ["GenerativeColumnMapper"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@PreprocessorRegistry.register("generative_column_mapper")
class GenerativeColumnMapper(DataDependentPreprocessor):
    """Maps raw dataset columns onto canonical generative column types.

    When no explicit mapping is supplied, column names are matched against
    the ``defaults`` synonym lists (with plural and case variants). Explicit
    mappings may reference columns as ``<name>``, ``<index>.<name>``, or
    ``<dataset_name>.<name>``. ``setup_data`` must run before the mapper
    is called on rows.
    """

    # Synonym lists tried, in order, per canonical column type.
    defaults: ClassVar[dict[str, list[str]]] = {
        "prompt_tokens_count_column": ["prompt_tokens_count", "input_tokens_count"],
        "output_tokens_count_column": [
            "output_tokens_count",
            "completion_tokens_count",
        ],
        "prefix_column": [
            "system_prompt",
            "system",
            "prefix",
        ],
        "text_column": [
            "prompt",
            "instruction",
            "question",
            "input",
            "context",
            "content",
            "conversation",
            "turn",
            "text",
        ],
        "image_column": [
            "image",
            "picture",
            "photo",
            "img",
        ],
        "video_column": [
            "video",
            "clip",
            "movie",
            "footage",
            "mp4",
            "mov",
            "avi",
        ],
        "audio_column": [
            "audio",
            "sound",
            "voice",
            "speech",
            "wav",
            "mp3",
        ],
    }

    @classmethod
    def datasets_default_mappings(
        cls, datasets: list[Dataset | IterableDataset]
    ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
        """Infer (dataset_index, column_name) mappings from default synonyms.

        The first dataset providing a match for a column type wins; later
        datasets cannot override it.
        """
        mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
            defaultdict(list)
        )

        for index, dataset in enumerate(datasets):
            # IterableDataset may not expose column_names; fall back to
            # peeking at the first row's keys.
            dataset_columns = dataset.column_names or list(next(iter(dataset)).keys())

            for column_type in cls.defaults:
                if column_type in mappings:
                    continue  # already matched by an earlier dataset

                # Expand each synonym into plural and case variants.
                type_names = [
                    variant
                    for name in cls.defaults.get(column_type, [])
                    for plural in [name, f"{name}s", f"{name}es"]
                    for variant in [
                        plural,
                        plural.lower(),
                        plural.upper(),
                        plural.capitalize(),
                    ]
                ]

                for name in type_names:
                    if name in dataset_columns:
                        key = cast("GenerativeDatasetColumnType", column_type)
                        mappings[key].append((index, name))
                        break

        return mappings

    @classmethod
    def datasets_mappings(
        cls,
        datasets: list[Dataset | IterableDataset],
        input_mappings: dict[GenerativeDatasetColumnType, str | list[str]],
    ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
        """Resolve and validate user-supplied column mappings.

        :raises ValueError: If a referenced dataset or column does not exist.
        """
        mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
            defaultdict(list)
        )
        datasets_named_indices = {
            (
                dataset.info.dataset_name
                if dataset.info and dataset.info.dataset_name
                else index
            ): index
            for index, dataset in enumerate(datasets)
        }
        datasets_columns = {
            index: dataset.column_names or list(next(iter(dataset)).keys())
            for index, dataset in enumerate(datasets)
        }

        # Parse out user mappings that were passed in and validate them
        # Must be in the format of:
        # {<column_type>: [<column_names>]}
        # where <column_names> can be a single string or list of strings
        # and each string can be any of:
        # - a column name (assumes the first dataset was intended)
        # - <int>.<column_name> where <int> is the dataset index
        # - <str>.<column_name> where <str> is the dataset name
        for column_type, names in input_mappings.items():
            mappings[column_type] = []
            for name in names if isinstance(names, list) else [names]:
                if "." in name:
                    dataset, column_name = name.split(".", 1)
                    dataset_index = (
                        int(dataset)
                        if dataset.isdigit()
                        else datasets_named_indices.get(dataset)
                    )
                else:
                    # Bare column name: assume the first dataset.
                    dataset_index = 0
                    column_name = name

                if dataset_index is None or dataset_index >= len(datasets):
                    raise ValueError(
                        f"Dataset '{name}' not found in datasets: "
                        f"{datasets_named_indices}."
                    )
                if column_name not in datasets_columns[dataset_index]:
                    raise ValueError(
                        f"Column '{column_name}' not found in dataset "
                        f"'{datasets[dataset_index]}' "
                        f"columns: {datasets_columns[dataset_index]}."
                    )
                mappings[column_type].append((dataset_index, column_name))

        return mappings

    def __init__(
        self,
        column_mappings: dict[GenerativeDatasetColumnType, str | list[str]]
        | None = None,
    ):
        """
        :param column_mappings: Optional explicit column mappings; when None,
            defaults are inferred from dataset column names in setup_data.
        """
        self.input_mappings = column_mappings
        # BUG FIX: the original only *annotated* this attribute without
        # assigning it, so calling the mapper before setup_data raised
        # AttributeError instead of the intended ValueError in __call__.
        self.datasets_column_mappings: (
            dict[GenerativeDatasetColumnType, list[tuple[int, str]]] | None
        ) = None

    def __call__(
        self, row: dict[str, Any]
    ) -> dict[GenerativeDatasetColumnType, list[Any]]:
        """Map a combined row's per-dataset items to canonical column lists.

        :raises ValueError: If setup_data has not been called yet.
        """
        if self.datasets_column_mappings is None:
            raise ValueError("GenerativeColumnMapper not setup with data.")

        # "items" holds per-dataset sub-rows keyed by dataset index.
        items = cast("dict[int, dict[str, Any]]", row.pop("items"))
        mapped: dict[GenerativeDatasetColumnType, list[Any]] = defaultdict(list)

        for column_type, column_mappings in self.datasets_column_mappings.items():
            for (
                dataset_index,
                dataset_column,
            ) in column_mappings:
                mapped[column_type].append(items[dataset_index][dataset_column])

        return dict(mapped)

    def setup_data(
        self,
        datasets: list[Dataset | IterableDataset],
        data_args: list[dict[str, Any]],
    ):
        """Resolve column mappings from the datasets (defaults or explicit)."""
        _ = data_args  # Unused for this mapper
        self.datasets_column_mappings = (
            self.datasets_default_mappings(datasets)
            if self.input_mappings is None
            else self.datasets_mappings(datasets, self.input_mappings)
        )
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Protocol, Union, runtime_checkable
|
|
4
|
+
|
|
5
|
+
from datasets import Dataset, IterableDataset
|
|
6
|
+
|
|
7
|
+
from guidellm.utils import RegistryMixin
|
|
8
|
+
|
|
9
|
+
__all__ = ["DataDependentPreprocessor", "DatasetPreprocessor", "PreprocessorRegistry"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@runtime_checkable
class DatasetPreprocessor(Protocol):
    """Structural interface for row-level dataset preprocessors.

    Any callable that takes a row dict and returns a transformed dict
    satisfies this protocol; ``runtime_checkable`` permits isinstance checks.
    """

    def __call__(self, item: dict[str, Any]) -> dict[str, Any]: ...
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@runtime_checkable
class DataDependentPreprocessor(DatasetPreprocessor, Protocol):
    """A DatasetPreprocessor that must see the datasets before mapping rows.

    ``setup_data`` is invoked once with the loaded datasets and their
    load-time arguments so the preprocessor can derive per-dataset state
    before any rows are processed.
    """

    def setup_data(
        self,
        datasets: list[Dataset | IterableDataset],
        data_args: list[dict[str, Any]],
    ): ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PreprocessorRegistry(
    RegistryMixin[Union[DataDependentPreprocessor, type[DataDependentPreprocessor]]]
):
    """Name-keyed registry of preprocessor instances or classes.

    Entries are registered via the inherited ``RegistryMixin.register``
    decorator and may be either preprocessor instances or their classes.
    """

    pass
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from transformers import ( # type: ignore[import]
|
|
6
|
+
AutoTokenizer,
|
|
7
|
+
PreTrainedTokenizerBase,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__all__ = ["ProcessorFactory"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProcessorFactory:
    """Lazily resolves a tokenizer/processor reference to a concrete instance.

    Accepts either an already-instantiated ``PreTrainedTokenizerBase`` or a
    model id / path string. The string form is resolved via
    ``AutoTokenizer.from_pretrained`` on first call and cached on the
    instance, so subsequent calls return the same tokenizer object.
    """

    def __init__(
        self,
        processor: str | PreTrainedTokenizerBase,
        processor_args: dict[str, Any] | None = None,
    ) -> None:
        """
        :param processor: Tokenizer instance, or a model id/path to load.
        :param processor_args: Extra kwargs for AutoTokenizer.from_pretrained.
        """
        self.processor = processor
        # Normalize once so __call__ can splat the args directly; the
        # original re-applied `or {}` at call time, which was redundant.
        self.processor_args = processor_args or {}

    def __call__(self) -> PreTrainedTokenizerBase:
        """Return the tokenizer, loading and caching it on first use."""
        if not isinstance(self.processor, PreTrainedTokenizerBase):
            self.processor = AutoTokenizer.from_pretrained(
                self.processor,
                **self.processor_args,
            )
        return self.processor
|
guidellm/data/schemas.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
__all__ = ["GenerativeDatasetColumnType"]

# Canonical column-type names produced by column mappers and consumed by
# request formatters; each names a list of mapped column entries in a row.
GenerativeDatasetColumnType = Literal[
    "prompt_tokens_count_column",
    "output_tokens_count_column",
    "prefix_column",
    "text_column",
    "image_column",
    "video_column",
    "audio_column",
]
|