guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a169__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0

guidellm/data/preprocessors/__init__.py
@@ -0,0 +1,25 @@
+ from .formatters import (
+     GenerativeAudioTranscriptionRequestFormatter,
+     GenerativeAudioTranslationRequestFormatter,
+     GenerativeChatCompletionsRequestFormatter,
+     GenerativeTextCompletionsRequestFormatter,
+ )
+ from .mappers import GenerativeColumnMapper
+ from .preprocessor import (
+     DataDependentPreprocessor,
+     DatasetPreprocessor,
+     PreprocessorRegistry,
+ )
+
+ __all__ = [
+     "ColumnMapper",
+     "ColumnMapperRegistry",
+     "DataDependentPreprocessor",
+     "DatasetPreprocessor",
+     "GenerativeAudioTranscriptionRequestFormatter",
+     "GenerativeAudioTranslationRequestFormatter",
+     "GenerativeChatCompletionsRequestFormatter",
+     "GenerativeColumnMapper",
+     "GenerativeTextCompletionsRequestFormatter",
+     "PreprocessorRegistry",
+ ]

guidellm/data/preprocessors/formatters.py
@@ -0,0 +1,404 @@
+ from __future__ import annotations
+
+ from abc import ABCMeta
+ from typing import Any
+
+ from guidellm.data.preprocessors.preprocessor import (
+     DatasetPreprocessor,
+     PreprocessorRegistry,
+ )
+ from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
+
+ __all__ = [
+     "GenerativeAudioTranscriptionRequestFormatter",
+     "GenerativeAudioTranslationRequestFormatter",
+     "GenerativeChatCompletionsRequestFormatter",
+     "GenerativeTextCompletionsRequestFormatter",
+ ]
+
+
+ class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta):
+     @staticmethod
+     def encode_audio(*args, **kwargs):
+         from guidellm.extras.audio import encode_audio
+
+         return encode_audio(*args, **kwargs)
+
+     @staticmethod
+     def encode_image(*args, **kwargs):
+         from guidellm.extras.vision import encode_image
+
+         return encode_image(*args, **kwargs)
+
+     @staticmethod
+     def encode_video(*args, **kwargs):
+         from guidellm.extras.vision import encode_video
+
+         return encode_video(*args, **kwargs)
+
+
+ @PreprocessorRegistry.register("text_completions")
+ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+     ):
+         self.model: str | None = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream: bool = stream
+         self.max_tokens: int | None = max_tokens or max_completion_tokens
+
+     def __call__(
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         """
+         :param columns: A dict of GenerativeDatasetColumnType to Any
+         """
+         arguments: GenerationRequestArguments = GenerationRequestArguments()
+         arguments.body = {}  # The type checker works better setting this field here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body["stream"] = True
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+             arguments.body["max_tokens"] = output_tokens
+             arguments.body["stop"] = None
+             arguments.body["ignore_eos"] = True
+         elif self.max_tokens is not None:
+             arguments.body["max_tokens"] = self.max_tokens
+
+         # Handle prompt tokens
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build prompt
+         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
+         text = "".join(txt for txt in columns.get("text_column", []) if txt)
+         if prefix or text:
+             prompt = prefix + text
+             arguments.body["prompt"] = prompt
+             input_metrics.add_text_metrics(prompt)
+
+         return GenerationRequest(
+             request_type="text_completions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("chat_completions")
+ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+         encode_kwargs: dict[str, Any] | None = None,
+     ):
+         self.model = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream = stream
+         self.max_completion_tokens = max_tokens or max_completion_tokens
+         self.encode_image_kwargs = (
+             encode_kwargs.get("image", {}) if encode_kwargs else {}
+         )
+         self.encode_video_kwargs = (
+             encode_kwargs.get("video", {}) if encode_kwargs else {}
+         )
+         self.encode_audio_kwargs = (
+             encode_kwargs.get("audio", {}) if encode_kwargs else {}
+         )
+
+     def __call__(  # noqa: C901, PLR0912, PLR0915
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         """
+         :param columns: A dict of GenerativeDatasetColumnType to Any
+         """
+         arguments = GenerationRequestArguments()
+         arguments.body = {}  # The type checker works best with body assigned here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body.update(
+                 {"stream": True, "stream_options": {"include_usage": True}}
+             )
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+             arguments.body.update(
+                 {
+                     "max_completion_tokens": output_tokens,
+                     "stop": None,
+                     "ignore_eos": True,
+                 }
+             )
+         elif self.max_completion_tokens is not None:
+             arguments.body["max_completion_tokens"] = self.max_completion_tokens
+
+         # Handle prompt tokens
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build messages
+         arguments.body["messages"] = []
+
+         for prefix in columns.get("prefix_column", []):
+             if not prefix:
+                 continue
+
+             input_metrics.add_text_metrics(prefix)
+             arguments.body["messages"].append({"role": "system", "content": prefix})
+
+         for text in columns.get("text_column", []):
+             if not text:
+                 continue
+
+             input_metrics.add_text_metrics(text)
+
+             arguments.body["messages"].append(
+                 {"role": "user", "content": [{"type": "text", "text": text}]}
+             )
+
+         for image in columns.get("image_column", []):
+             if not image:
+                 continue
+
+             image_dict = self.encode_image(image, **self.encode_image_kwargs)
+             if (image_pixels := image_dict.get("image_pixels")) is not None:
+                 input_metrics.image_pixels = (
+                     input_metrics.image_pixels or 0
+                 ) + image_pixels
+             if (image_bytes := image_dict.get("image_bytes")) is not None:
+                 input_metrics.image_bytes = (
+                     input_metrics.image_bytes or 0
+                 ) + image_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "image_url", "image_url": image_dict.get("image")}
+                     ],
+                 }
+             )
+
+         for video in columns.get("video_column", []):
+             if not video:
+                 continue
+
+             video_dict = self.encode_video(video, **self.encode_video_kwargs)
+             if (video_frames := video_dict.get("video_frames")) is not None:
+                 input_metrics.video_frames = (
+                     input_metrics.video_frames or 0
+                 ) + video_frames
+             if (video_seconds := video_dict.get("video_seconds")) is not None:
+                 input_metrics.video_seconds = (
+                     input_metrics.video_seconds or 0.0
+                 ) + video_seconds
+             if (video_bytes := video_dict.get("video_bytes")) is not None:
+                 input_metrics.video_bytes = (
+                     input_metrics.video_bytes or 0
+                 ) + video_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "video_url", "video_url": video_dict.get("video")}
+                     ],
+                 }
+             )
+
+         for audio in columns.get("audio_column", []):
+             if not audio:
+                 continue
+
+             audio_dict = self.encode_audio(
+                 audio, b64encode=True, **self.encode_audio_kwargs
+             )
+             if (audio_samples := audio_dict.get("audio_samples")) is not None:
+                 input_metrics.audio_samples = (
+                     input_metrics.audio_samples or 0
+                 ) + audio_samples
+             if (audio_seconds := audio_dict.get("audio_seconds")) is not None:
+                 input_metrics.audio_seconds = (
+                     input_metrics.audio_seconds or 0.0
+                 ) + audio_seconds
+             if (audio_bytes := audio_dict.get("audio_bytes")) is not None:
+                 input_metrics.audio_bytes = (
+                     input_metrics.audio_bytes or 0
+                 ) + audio_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "input_audio",
+                             "input_audio": {
+                                 "data": audio_dict.get("audio"),
+                                 "format": audio_dict.get("format"),
+                             },
+                         }
+                     ],
+                 }
+             )
+
+         return GenerationRequest(
+             request_type="chat_completions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("audio_transcriptions")
+ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         encode_kwargs: dict[str, Any] | None = None,
+     ):
+         self.model = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream = stream
+         self.encode_audio_kwargs = encode_kwargs or {}
+
+     def __call__(  # noqa: C901
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         arguments = GenerationRequestArguments(files={})
+         arguments.body = {}  # The type checker works best with body assigned here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body["stream"] = True
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+
+         # Handle prompt tokens (for audio duration tracking)
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build audio input
+         audio_columns = columns.get("audio_column", [])
+         if len(audio_columns) != 1:
+             raise ValueError(
+                 f"GenerativeAudioTranscriptionRequestFormatter expects exactly "
+                 f"one audio column, but got {len(audio_columns)}."
+             )
+
+         audio_dict = self.encode_audio(
+             audio_columns[0], b64encode=False, **self.encode_audio_kwargs
+         )
+         input_metrics.audio_samples = audio_dict.get("audio_samples")
+         input_metrics.audio_seconds = audio_dict.get("audio_seconds")
+         input_metrics.audio_bytes = audio_dict.get("audio_bytes")
+
+         arguments.files = {
+             "file": (
+                 audio_dict.get("file_name", "audio_input"),
+                 audio_dict.get("audio"),
+                 audio_dict.get("mimetype"),
+             )
+         }
+
+         # Build prompt
+         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
+         text = "".join(txt for txt in columns.get("text_column", []) if txt)
+         if prefix or text:
+             prompt = prefix + text
+             arguments.body["prompt"] = prompt
+             input_metrics.add_text_metrics(prompt)
+
+         return GenerationRequest(
+             request_type="audio_transcriptions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("audio_translations")
+ class GenerativeAudioTranslationRequestFormatter(
+     GenerativeAudioTranscriptionRequestFormatter
+ ):
+     def __call__(
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         result = super().__call__(columns)
+         result.request_type = "audio_translations"
+         return result
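
All four formatters follow the same pattern: they take the per-row column dict produced by the column mapper and turn it into a GenerationRequest whose arguments.body mirrors an OpenAI-style payload, while accumulating UsageMetrics for the input and expected output. A minimal sketch of that flow for the text-completions formatter (the model name and column values below are illustrative, not part of this diff):

    from guidellm.data.preprocessors.formatters import (
        GenerativeTextCompletionsRequestFormatter,
    )

    # Column dict shaped like GenerativeColumnMapper's output for one row.
    columns = {
        "prefix_column": ["You are a helpful assistant. "],
        "text_column": ["Summarize the plot of Pride and Prejudice."],
        "output_tokens_count_column": [128],
    }

    formatter = GenerativeTextCompletionsRequestFormatter(
        model="example-model",  # placeholder model name
        stream=True,
    )
    request = formatter(columns)

    print(request.request_type)    # "text_completions"
    print(request.arguments.body)  # model, stream, prompt, max_tokens, stop, ignore_eos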

guidellm/data/preprocessors/mappers.py
@@ -0,0 +1,198 @@
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from typing import Any, ClassVar, cast
+
+ from datasets import Dataset, IterableDataset
+
+ from guidellm.data.preprocessors.preprocessor import (
+     DataDependentPreprocessor,
+     PreprocessorRegistry,
+ )
+ from guidellm.data.schemas import GenerativeDatasetColumnType
+
+ __all__ = ["GenerativeColumnMapper"]
+
+
+ @PreprocessorRegistry.register("generative_column_mapper")
+ class GenerativeColumnMapper(DataDependentPreprocessor):
+     defaults: ClassVar[dict[str, list[str]]] = {
+         "prompt_tokens_count_column": ["prompt_tokens_count", "input_tokens_count"],
+         "output_tokens_count_column": [
+             "output_tokens_count",
+             "completion_tokens_count",
+         ],
+         "prefix_column": [
+             "system_prompt",
+             "system",
+             "prefix",
+         ],
+         "text_column": [
+             "prompt",
+             "instruction",
+             "question",
+             "input",
+             "context",
+             "content",
+             "conversation",
+             "turn",
+             "text",
+         ],
+         "image_column": [
+             "image",
+             "picture",
+             "photo",
+             "img",
+         ],
+         "video_column": [
+             "video",
+             "clip",
+             "movie",
+             "footage",
+             "mp4",
+             "mov",
+             "avi",
+         ],
+         "audio_column": [
+             "audio",
+             "sound",
+             "voice",
+             "speech",
+             "wav",
+             "mp3",
+         ],
+     }
+
+     @classmethod
+     def datasets_default_mappings(
+         cls, datasets: list[Dataset | IterableDataset]
+     ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+         mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+             defaultdict(list)
+         )
+
+         for index, dataset in enumerate(datasets):
+             dataset_columns = dataset.column_names or list(next(iter(dataset)).keys())
+
+             for column_type in cls.defaults:
+                 if column_type in mappings:
+                     continue
+
+                 type_names = [
+                     variant
+                     for name in cls.defaults.get(column_type, [])
+                     for plural in [name, f"{name}s", f"{name}es"]
+                     for variant in [
+                         plural,
+                         plural.lower(),
+                         plural.upper(),
+                         plural.capitalize(),
+                     ]
+                 ]
+
+                 for name in type_names:
+                     if name in dataset_columns:
+                         key = cast("GenerativeDatasetColumnType", column_type)
+                         mappings[key].append((index, name))
+                         break
+
+         return mappings
+
+     @classmethod
+     def datasets_mappings(
+         cls,
+         datasets: list[Dataset | IterableDataset],
+         input_mappings: dict[GenerativeDatasetColumnType, str | list[str]],
+     ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+         mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+             defaultdict(list)
+         )
+         datasets_named_indices = {
+             (
+                 dataset.info.dataset_name
+                 if dataset.info and dataset.info.dataset_name
+                 else index
+             ): index
+             for index, dataset in enumerate(datasets)
+         }
+         datasets_columns = {
+             index: dataset.column_names or list(next(iter(dataset)).keys())
+             for index, dataset in enumerate(datasets)
+         }
+
+         # Parse out user mappings that were passed in and validate them
+         # Must be in the format of:
+         # {<column_type>: [<column_names>]}
+         # where <column_names> can be a single string or list of strings
+         # and each string can be any of:
+         # - a column name (assumes the first dataset was intended)
+         # - <int>.<column_name> where <int> is the dataset index
+         # - <str>.<column_name> where <str> is the dataset name
+         for column_type, names in input_mappings.items():
+             mappings[column_type] = []
+             for name in names if isinstance(names, list) else [names]:
+                 if "." in name:
+                     dataset, column_name = name.split(".", 1)
+                     dataset_index = (
+                         int(dataset)
+                         if dataset.isdigit()
+                         else datasets_named_indices.get(dataset)
+                     )
+                 else:
+                     dataset_index = 0
+                     column_name = name
+
+                 if dataset_index is None or dataset_index >= len(datasets):
+                     raise ValueError(
+                         f"Dataset '{name}' not found in datasets: "
+                         f"{datasets_named_indices}."
+                     )
+                 if column_name not in datasets_columns[dataset_index]:
+                     raise ValueError(
+                         f"Column '{column_name}' not found in dataset "
+                         f"'{datasets[dataset_index]}' "
+                         f"columns: {datasets_columns[dataset_index]}."
+                     )
+                 mappings[column_type].append((dataset_index, column_name))
+
+         return mappings
+
+     def __init__(
+         self,
+         column_mappings: dict[GenerativeDatasetColumnType, str | list[str]]
+         | None = None,
+     ):
+         self.input_mappings = column_mappings
+         self.datasets_column_mappings: (
+             dict[GenerativeDatasetColumnType, list[tuple[int, str]]] | None
+         )
+
+     def __call__(
+         self, row: dict[str, Any]
+     ) -> dict[str, list[Any]]:
+         if self.datasets_column_mappings is None:
+             raise ValueError("DefaultGenerativeColumnMapper not setup with data.")
+
+         items = cast("dict[int, dict[str, Any]]", row.pop("items"))
+         mapped: dict[str, Any] = defaultdict(list)
+
+         for column_type, column_mappings in self.datasets_column_mappings.items():
+             for (
+                 dataset_index,
+                 dataset_column,
+             ) in column_mappings:
+                 mapped[column_type].append(items[dataset_index][dataset_column])
+
+         return dict(mapped)
+
+     def setup_data(
+         self,
+         datasets: list[Dataset | IterableDataset],
+         data_args: list[dict[str, Any]],
+     ):
+         _ = data_args  # Unused for this mapper
+         self.datasets_column_mappings = (
+             self.datasets_default_mappings(datasets)
+             if self.input_mappings is None
+             else self.datasets_mappings(datasets, self.input_mappings)
+         )
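
GenerativeColumnMapper resolves which dataset columns feed each request field, either from the defaults table above or from an explicit column_mappings argument using the <dataset>.<column> syntax described in the inline comment. A rough sketch of the default path, assuming a single in-memory Hugging Face dataset (the sample data is invented for illustration):

    from datasets import Dataset

    from guidellm.data.preprocessors.mappers import GenerativeColumnMapper

    dataset = Dataset.from_dict(
        {"prompt": ["hello", "world"], "output_tokens_count": [16, 32]}
    )

    mapper = GenerativeColumnMapper()  # no explicit mappings, so defaults apply
    mapper.setup_data([dataset], data_args=[{}])

    # "prompt" matches text_column and "output_tokens_count" matches
    # output_tokens_count_column, each stored as (dataset_index, column_name).
    print(mapper.datasets_column_mappings)

    # __call__ expects the collated shape {"items": {dataset_index: row_dict}}.
    row = {"items": {0: {"prompt": "hello", "output_tokens_count": 16}}}
    print(mapper(row))  # text_column ["hello"], output_tokens_count_column [16]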

guidellm/data/preprocessors/preprocessor.py
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+
+ from typing import Any, Protocol, runtime_checkable
+
+ from datasets import Dataset, IterableDataset
+
+ from guidellm.schemas import GenerationRequest
+ from guidellm.utils import RegistryMixin
+
+ __all__ = ["DataDependentPreprocessor", "DatasetPreprocessor", "PreprocessorRegistry"]
+
+
+ @runtime_checkable
+ class DatasetPreprocessor(Protocol):
+     def __call__(self, item: dict[str, Any]) -> (
+         GenerationRequest | dict[str, Any]): ...
+
+
+ @runtime_checkable
+ class DataDependentPreprocessor(DatasetPreprocessor, Protocol):
+     def setup_data(
+         self,
+         datasets: list[Dataset | IterableDataset],
+         data_args: list[dict[str, Any]],
+     ): ...
+
+
+ class PreprocessorRegistry(
+     RegistryMixin[DataDependentPreprocessor | type[DataDependentPreprocessor]]
+ ):
+     pass
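
These protocols plus PreprocessorRegistry are what let the mapper and formatters above self-register under string keys such as "generative_column_mapper" and "text_completions". A custom step only needs to satisfy the protocol and register itself; a hedged sketch (the "strip_html" key and the class below are invented for illustration):

    from typing import Any

    from guidellm.data.preprocessors.preprocessor import PreprocessorRegistry


    @PreprocessorRegistry.register("strip_html")  # hypothetical key, not in this diff
    class StripHtmlPreprocessor:
        """Callable row transform matching the DatasetPreprocessor protocol."""

        def setup_data(self, datasets, data_args) -> None:
            pass  # no dataset-dependent state needed for this example

        def __call__(self, item: dict[str, Any]) -> dict[str, Any]:
            # Strip simple paragraph tags from every text column entry.
            item["text_column"] = [
                text.replace("<p>", "").replace("</p>", "")
                for text in item.get("text_column", [])
            ]
            return item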

guidellm/data/processor.py
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ from transformers import (  # type: ignore[import]
+     AutoTokenizer,
+     PreTrainedTokenizerBase,
+ )
+
+ __all__ = ["ProcessorFactory"]
+
+
+ class ProcessorFactory:
+     def __init__(
+         self,
+         processor: str | PreTrainedTokenizerBase,
+         processor_args: dict[str, Any] | None = None,
+     ) -> None:
+         self.processor = processor
+         self.processor_args = processor_args or {}
+
+     def __call__(self) -> PreTrainedTokenizerBase:
+         if isinstance(self.processor, PreTrainedTokenizerBase):
+             return self.processor
+         else:
+             from_pretrained = AutoTokenizer.from_pretrained(
+                 self.processor,
+                 **(self.processor_args or {}),
+             )
+             self.processor = from_pretrained
+             return from_pretrained
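
ProcessorFactory is a small lazy-loading wrapper: a tokenizer identifier is resolved through AutoTokenizer.from_pretrained only on the first call, and the loaded instance is cached on the factory for later calls. A brief usage sketch, assuming a tokenizer id that is available locally or on the Hugging Face Hub ("gpt2" is just a stand-in here):

    from guidellm.data.processor import ProcessorFactory

    factory = ProcessorFactory("gpt2", processor_args={"use_fast": True})

    tokenizer = factory()        # first call loads via AutoTokenizer.from_pretrained
    same_tokenizer = factory()   # later calls return the cached instance
    assert tokenizer is same_tokenizer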

guidellm/data/schemas.py
@@ -0,0 +1,13 @@
+ from typing import Literal
+
+ __all__ = ["GenerativeDatasetColumnType"]
+
+ GenerativeDatasetColumnType = Literal[
+     "prompt_tokens_count_column",
+     "output_tokens_count_column",
+     "prefix_column",
+     "text_column",
+     "image_column",
+     "video_column",
+     "audio_column",
+ ]