guidellm-0.4.0a18-py3-none-any.whl → guidellm-0.4.0a155-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of guidellm was flagged as possibly problematic.
Files changed (116)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +451 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +148 -317
  9. guidellm/benchmark/entrypoints.py +466 -128
  10. guidellm/benchmark/output.py +517 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2085 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +109 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +192 -0
  24. guidellm/data/deserializers/synthetic.py +346 -0
  25. guidellm/data/loaders.py +145 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +412 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +29 -0
  30. guidellm/data/processor.py +30 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +10 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/data/utils/functions.py +18 -0
  35. guidellm/extras/__init__.py +4 -0
  36. guidellm/extras/audio.py +215 -0
  37. guidellm/extras/vision.py +242 -0
  38. guidellm/logger.py +2 -2
  39. guidellm/mock_server/__init__.py +8 -0
  40. guidellm/mock_server/config.py +84 -0
  41. guidellm/mock_server/handlers/__init__.py +17 -0
  42. guidellm/mock_server/handlers/chat_completions.py +280 -0
  43. guidellm/mock_server/handlers/completions.py +280 -0
  44. guidellm/mock_server/handlers/tokenizer.py +142 -0
  45. guidellm/mock_server/models.py +510 -0
  46. guidellm/mock_server/server.py +168 -0
  47. guidellm/mock_server/utils.py +302 -0
  48. guidellm/preprocess/dataset.py +23 -26
  49. guidellm/presentation/builder.py +2 -2
  50. guidellm/presentation/data_models.py +25 -21
  51. guidellm/presentation/injector.py +2 -3
  52. guidellm/scheduler/__init__.py +65 -26
  53. guidellm/scheduler/constraints.py +1035 -0
  54. guidellm/scheduler/environments.py +252 -0
  55. guidellm/scheduler/scheduler.py +140 -368
  56. guidellm/scheduler/schemas.py +272 -0
  57. guidellm/scheduler/strategies.py +519 -0
  58. guidellm/scheduler/worker.py +391 -420
  59. guidellm/scheduler/worker_group.py +707 -0
  60. guidellm/schemas/__init__.py +31 -0
  61. guidellm/schemas/info.py +159 -0
  62. guidellm/schemas/request.py +216 -0
  63. guidellm/schemas/response.py +119 -0
  64. guidellm/schemas/stats.py +228 -0
  65. guidellm/{config.py → settings.py} +32 -21
  66. guidellm/utils/__init__.py +95 -8
  67. guidellm/utils/auto_importer.py +98 -0
  68. guidellm/utils/cli.py +46 -2
  69. guidellm/utils/console.py +183 -0
  70. guidellm/utils/encoding.py +778 -0
  71. guidellm/utils/functions.py +134 -0
  72. guidellm/utils/hf_datasets.py +1 -2
  73. guidellm/utils/hf_transformers.py +4 -4
  74. guidellm/utils/imports.py +9 -0
  75. guidellm/utils/messaging.py +1118 -0
  76. guidellm/utils/mixins.py +115 -0
  77. guidellm/utils/pydantic_utils.py +411 -0
  78. guidellm/utils/random.py +3 -4
  79. guidellm/utils/registry.py +220 -0
  80. guidellm/utils/singleton.py +133 -0
  81. guidellm/{objects → utils}/statistics.py +341 -247
  82. guidellm/utils/synchronous.py +159 -0
  83. guidellm/utils/text.py +163 -50
  84. guidellm/utils/typing.py +41 -0
  85. guidellm/version.py +1 -1
  86. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
  87. guidellm-0.4.0a155.dist-info/RECORD +96 -0
  88. guidellm/backend/__init__.py +0 -23
  89. guidellm/backend/backend.py +0 -259
  90. guidellm/backend/openai.py +0 -705
  91. guidellm/backend/response.py +0 -136
  92. guidellm/benchmark/aggregator.py +0 -760
  93. guidellm/benchmark/benchmark.py +0 -837
  94. guidellm/benchmark/scenario.py +0 -104
  95. guidellm/data/prideandprejudice.txt.gz +0 -0
  96. guidellm/dataset/__init__.py +0 -22
  97. guidellm/dataset/creator.py +0 -213
  98. guidellm/dataset/entrypoints.py +0 -42
  99. guidellm/dataset/file.py +0 -92
  100. guidellm/dataset/hf_datasets.py +0 -62
  101. guidellm/dataset/in_memory.py +0 -132
  102. guidellm/dataset/synthetic.py +0 -287
  103. guidellm/objects/__init__.py +0 -18
  104. guidellm/objects/pydantic.py +0 -89
  105. guidellm/request/__init__.py +0 -18
  106. guidellm/request/loader.py +0 -284
  107. guidellm/request/request.py +0 -79
  108. guidellm/request/types.py +0 -10
  109. guidellm/scheduler/queues.py +0 -25
  110. guidellm/scheduler/result.py +0 -155
  111. guidellm/scheduler/strategy.py +0 -495
  112. guidellm-0.4.0a18.dist-info/RECORD +0 -62
  113. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
  114. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
  115. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
  116. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
guidellm/data/preprocessors/formatters.py
@@ -0,0 +1,412 @@
+from __future__ import annotations
+
+from abc import ABCMeta
+from typing import Any
+
+from guidellm.data.preprocessors.preprocessor import (
+    DatasetPreprocessor,
+    PreprocessorRegistry,
+)
+from guidellm.data.schemas import GenerativeDatasetColumnType
+from guidellm.data.utils import text_stats
+from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
+
+__all__ = [
+    "GenerativeAudioTranscriptionRequestFormatter",
+    "GenerativeAudioTranslationRequestFormatter",
+    "GenerativeChatCompletionsRequestFormatter",
+    "GenerativeTextCompletionsRequestFormatter",
+]
+
+
+class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta):
+    @staticmethod
+    def encode_audio(*args, **kwargs):
+        from guidellm.extras.audio import encode_audio
+
+        return encode_audio(*args, **kwargs)
+
+    @staticmethod
+    def encode_image(*args, **kwargs):
+        from guidellm.extras.vision import encode_image
+
+        return encode_image(*args, **kwargs)
+
+    @staticmethod
+    def encode_video(*args, **kwargs):
+        from guidellm.extras.vision import encode_video
+
+        return encode_video(*args, **kwargs)
+
+
+@PreprocessorRegistry.register("text_completions")
+class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
+    def __init__(
+        self,
+        model: str,
+        extras: dict[str, Any] | GenerationRequestArguments | None = None,
+        stream: bool = True,
+        max_tokens: int | None = None,
+        max_completion_tokens: int | None = None,
+    ):
+        self.model: str | None = model
+        self.extras = (
+            GenerationRequestArguments(**extras)
+            if extras and isinstance(extras, dict)
+            else extras
+        )
+        self.stream: bool = stream
+        self.max_tokens: int | None = max_tokens or max_completion_tokens
+
+    def __call__(
+        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+    ) -> GenerationRequest:
+        arguments: GenerationRequestArguments = GenerationRequestArguments(body={})
+        input_metrics = UsageMetrics()
+        output_metrics = UsageMetrics()
+
+        # Add model
+        if self.model is not None:
+            arguments.body["model"] = self.model
+
+        # Configure streaming
+        if self.stream:
+            arguments.stream = True
+            arguments.body["stream"] = True
+
+        # Handle output tokens
+        if output_tokens := sum(
+            count for count in columns.get("output_tokens_count_column", []) if count
+        ):
+            output_metrics.text_tokens = output_tokens
+            arguments.body["max_tokens"] = output_tokens
+            arguments.body["stop"] = None
+            arguments.body["ignore_eos"] = True
+        elif self.max_tokens is not None:
+            arguments.body["max_tokens"] = self.max_tokens
+
+        # Handle prompt tokens
+        if prompt_tokens := sum(
+            count for count in columns.get("prompt_tokens_count_column", []) if count
+        ):
+            input_metrics.text_tokens = prompt_tokens
+
+        # Apply extra arguments
+        if self.extras:
+            arguments.model_combine(self.extras)
+
+        # Build prompt
+        prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
+        text = "".join(txt for txt in columns.get("text_column", []) if txt)
+        if prefix or text:
+            arguments.body["prompt"] = prefix + text
+            stats = text_stats(arguments.body["prompt"])
+            input_metrics.text_characters = stats.get("num_chars")
+            input_metrics.text_words = stats.get("num_words")
+
+        return GenerationRequest(
+            request_type="text_completions",
+            arguments=arguments,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+
+@PreprocessorRegistry.register("chat_completions")
+class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
+    def __init__(
+        self,
+        model: str,
+        extras: dict[str, Any] | GenerationRequestArguments | None = None,
+        stream: bool = True,
+        max_tokens: int | None = None,
+        max_completion_tokens: int | None = None,
+        encode_kwargs: dict[str, Any] | None = None,
+    ):
+        self.model = model
+        self.extras = (
+            GenerationRequestArguments(**extras)
+            if extras and isinstance(extras, dict)
+            else extras
+        )
+        self.stream = stream
+        self.max_completion_tokens = max_tokens or max_completion_tokens
+        self.encode_image_kwargs = (
+            encode_kwargs.get("image", {}) if encode_kwargs else {}
+        )
+        self.encode_video_kwargs = (
+            encode_kwargs.get("video", {}) if encode_kwargs else {}
+        )
+        self.encode_audio_kwargs = (
+            encode_kwargs.get("audio", {}) if encode_kwargs else {}
+        )
+
+    def __call__(  # noqa: C901, PLR0912, PLR0915
+        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+    ) -> GenerationRequest:
+        arguments = GenerationRequestArguments(body={})
+        input_metrics = UsageMetrics()
+        output_metrics = UsageMetrics()
+
+        # Add model
+        if self.model is not None:
+            arguments.body["model"] = self.model
+
+        # Configure streaming
+        if self.stream:
+            arguments.stream = True
+            arguments.body.update(
+                {"stream": True, "stream_options": {"include_usage": True}}
+            )
+
+        # Handle output tokens
+        if output_tokens := sum(
+            count for count in columns.get("output_tokens_count_column", []) if count
+        ):
+            output_metrics.text_tokens = output_tokens
+            arguments.body.update(
+                {
+                    "max_completion_tokens": output_tokens,
+                    "stop": None,
+                    "ignore_eos": True,
+                }
+            )
+        elif self.max_completion_tokens is not None:
+            arguments.body["max_completion_tokens"] = self.max_completion_tokens
+
+        # Handle prompt tokens
+        if prompt_tokens := sum(
+            count for count in columns.get("prompt_tokens_count_column", []) if count
+        ):
+            input_metrics.text_tokens = prompt_tokens
+
+        # Apply extra arguments
+        if self.extras:
+            arguments.model_combine(self.extras)
+
+        # Build messages
+        arguments.body["messages"] = []
+
+        for prefix in columns.get("prefix_column", []):
+            if not prefix:
+                continue
+
+            stats = text_stats(prefix)
+            if (num_chars := stats.get("num_chars")) is not None:
+                input_metrics.text_characters = (
+                    input_metrics.text_characters or 0
+                ) + num_chars
+            if (num_words := stats.get("num_words")) is not None:
+                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+
+            arguments.body["messages"].append({"role": "system", "content": prefix})
+
+        for text in columns.get("text_column", []):
+            if not text:
+                continue
+
+            stats = text_stats(text)
+            if (num_chars := stats.get("num_chars")) is not None:
+                input_metrics.text_characters = (
+                    input_metrics.text_characters or 0
+                ) + num_chars
+            if (num_words := stats.get("num_words")) is not None:
+                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+
+            arguments.body["messages"].append(
+                {"role": "user", "content": [{"type": "text", "text": text}]}
+            )
+
+        for image in columns.get("image_column", []):
+            if not image:
+                continue
+
+            image_dict = self.encode_image(image, **self.encode_image_kwargs)
+            if (image_pixels := image_dict.get("image_pixels")) is not None:
+                input_metrics.image_pixels = (
+                    input_metrics.image_pixels or 0
+                ) + image_pixels
+            if (image_bytes := image_dict.get("image_bytes")) is not None:
+                input_metrics.image_bytes = (
+                    input_metrics.image_bytes or 0
+                ) + image_bytes
+
+            arguments.body["messages"].append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": image_dict.get("image")}
+                    ],
+                }
+            )
+
+        for video in columns.get("video_column", []):
+            if not video:
+                continue
+
+            video_dict = self.encode_video(video, **self.encode_video_kwargs)
+            if (video_frames := video_dict.get("video_frames")) is not None:
+                input_metrics.video_frames = (
+                    input_metrics.video_frames or 0
+                ) + video_frames
+            if (video_seconds := video_dict.get("video_seconds")) is not None:
+                input_metrics.video_seconds = (
+                    input_metrics.video_seconds or 0.0
+                ) + video_seconds
+            if (video_bytes := video_dict.get("video_bytes")) is not None:
+                input_metrics.video_bytes = (
+                    input_metrics.video_bytes or 0
+                ) + video_bytes
+
+            arguments.body["messages"].append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video_url", "video_url": video_dict.get("video")}
+                    ],
+                }
+            )
+
+        for audio in columns.get("audio_column", []):
+            if not audio:
+                continue
+
+            audio_dict = self.encode_audio(
+                audio, b64encode=True, **self.encode_audio_kwargs
+            )
+            if (audio_samples := audio_dict.get("audio_samples")) is not None:
+                input_metrics.audio_samples = (
+                    input_metrics.audio_samples or 0
+                ) + audio_samples
+            if (audio_seconds := audio_dict.get("audio_seconds")) is not None:
+                input_metrics.audio_seconds = (
+                    input_metrics.audio_seconds or 0.0
+                ) + audio_seconds
+            if (audio_bytes := audio_dict.get("audio_bytes")) is not None:
+                input_metrics.audio_bytes = (
+                    input_metrics.audio_bytes or 0
+                ) + audio_bytes
+
+            arguments.body["messages"].append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_audio",
+                            "input_audio": {
+                                "data": audio_dict.get("audio"),
+                                "format": audio_dict.get("format"),
+                            },
+                        }
+                    ],
+                }
+            )
+
+        return GenerationRequest(
+            request_type="chat_completions",
+            arguments=arguments,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+
+@PreprocessorRegistry.register("audio_transcriptions")
+class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
+    def __init__(
+        self,
+        model: str,
+        extras: dict[str, Any] | GenerationRequestArguments | None = None,
+        stream: bool = True,
+        encode_kwargs: dict[str, Any] | None = None,
+    ):
+        self.model = model
+        self.extras = (
+            GenerationRequestArguments(**extras)
+            if extras and isinstance(extras, dict)
+            else extras
+        )
+        self.stream = stream
+        self.encode_audio_kwargs = encode_kwargs or {}
+
+    def __call__(  # noqa: C901
+        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+    ) -> GenerationRequest:
+        arguments = GenerationRequestArguments(body={}, files={})
+        input_metrics = UsageMetrics()
+        output_metrics = UsageMetrics()
+
+        # Add model
+        if self.model is not None:
+            arguments.body["model"] = self.model
+
+        # Configure streaming
+        if self.stream:
+            arguments.stream = True
+            arguments.body["stream"] = True
+
+        # Handle output tokens
+        if output_tokens := sum(
+            count for count in columns.get("output_tokens_count_column", []) if count
+        ):
+            output_metrics.text_tokens = output_tokens
+
+        # Handle prompt tokens (for audio duration tracking)
+        if prompt_tokens := sum(
+            count for count in columns.get("prompt_tokens_count_column", []) if count
+        ):
+            input_metrics.text_tokens = prompt_tokens
+
+        # Apply extra arguments
+        if self.extras:
+            arguments.model_combine(self.extras)
+
+        # Build audio input
+        audio_columns = columns.get("audio_column", [])
+        if len(audio_columns) != 1:
+            raise ValueError(
+                f"GenerativeAudioTranscriptionRequestFormatter expects exactly "
+                f"one audio column, but got {len(audio_columns)}."
+            )
+
+        audio_dict = self.encode_audio(
+            audio_columns[0], b64encode=False, **self.encode_audio_kwargs
+        )
+        input_metrics.audio_samples = audio_dict.get("audio_samples")
+        input_metrics.audio_seconds = audio_dict.get("audio_seconds")
+        input_metrics.audio_bytes = audio_dict.get("audio_bytes")
+
+        arguments.files = {
+            "file": (
+                audio_dict.get("file_name", "audio_input"),
+                audio_dict.get("audio"),
+                audio_dict.get("mimetype"),
+            )
+        }
+
+        # Build prompt
+        prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
+        text = "".join(txt for txt in columns.get("text_column", []) if txt)
+        if prefix or text:
+            arguments.body["prompt"] = prefix + text
+            stats = text_stats(arguments.body["prompt"])
+            input_metrics.text_characters = stats.get("num_chars")
+            input_metrics.text_words = stats.get("num_words")
+
+        return GenerationRequest(
+            request_type="audio_transcriptions",
+            arguments=arguments,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+
+@PreprocessorRegistry.register("audio_translations")
+class GenerativeAudioTranslationRequestFormatter(
+    GenerativeAudioTranscriptionRequestFormatter
+):
+    def __call__(
+        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+    ) -> GenerationRequest:
+        result = super().__call__(columns)
+        result.request_type = "audio_translations"
+        return result
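
Editor's note: the formatters above turn mapped dataset columns into backend request payloads. Below is a minimal sketch of driving the text-completions formatter directly, assuming only the classes and column keys shown in this file; the attribute access on the returned GenerationRequest assumes the usual pydantic-style schema behavior, and the model name is hypothetical.

from guidellm.data.preprocessors.formatters import (
    GenerativeTextCompletionsRequestFormatter,
)

# Build a formatter the way the "text_completions" registry entry would be
# constructed, then feed it a columns dict keyed by GenerativeDatasetColumnType.
formatter = GenerativeTextCompletionsRequestFormatter(
    model="my-model",  # hypothetical model id
    stream=True,
    max_tokens=64,
)
request = formatter(
    {
        "prefix_column": ["You are a helpful assistant. "],
        "text_column": ["Summarize GuideLLM in one sentence."],
        "output_tokens_count_column": [128],
    }
)
# Per the code above, request.arguments.body should now carry "model", "stream",
# the concatenated "prompt", "max_tokens" = 128, "stop" = None, and "ignore_eos" = True.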
guidellm/data/preprocessors/mappers.py
@@ -0,0 +1,198 @@
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any, ClassVar, cast
+
+from datasets import Dataset, IterableDataset
+
+from guidellm.data.preprocessors.preprocessor import (
+    DataDependentPreprocessor,
+    PreprocessorRegistry,
+)
+from guidellm.data.schemas import GenerativeDatasetColumnType
+
+__all__ = ["GenerativeColumnMapper"]
+
+
+@PreprocessorRegistry.register("generative_column_mapper")
+class GenerativeColumnMapper(DataDependentPreprocessor):
+    defaults: ClassVar[dict[str, list[str]]] = {
+        "prompt_tokens_count_column": ["prompt_tokens_count", "input_tokens_count"],
+        "output_tokens_count_column": [
+            "output_tokens_count",
+            "completion_tokens_count",
+        ],
+        "prefix_column": [
+            "system_prompt",
+            "system",
+            "prefix",
+        ],
+        "text_column": [
+            "prompt",
+            "instruction",
+            "question",
+            "input",
+            "context",
+            "content",
+            "conversation",
+            "turn",
+            "text",
+        ],
+        "image_column": [
+            "image",
+            "picture",
+            "photo",
+            "img",
+        ],
+        "video_column": [
+            "video",
+            "clip",
+            "movie",
+            "footage",
+            "mp4",
+            "mov",
+            "avi",
+        ],
+        "audio_column": [
+            "audio",
+            "sound",
+            "voice",
+            "speech",
+            "wav",
+            "mp3",
+        ],
+    }
+
+    @classmethod
+    def datasets_default_mappings(
+        cls, datasets: list[Dataset | IterableDataset]
+    ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+        mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+            defaultdict(list)
+        )
+
+        for index, dataset in enumerate(datasets):
+            dataset_columns = dataset.column_names or list(next(iter(dataset)).keys())
+
+            for column_type in cls.defaults:
+                if column_type in mappings:
+                    continue
+
+                type_names = [
+                    variant
+                    for name in cls.defaults.get(column_type, [])
+                    for plural in [name, f"{name}s", f"{name}es"]
+                    for variant in [
+                        plural,
+                        plural.lower(),
+                        plural.upper(),
+                        plural.capitalize(),
+                    ]
+                ]
+
+                for name in type_names:
+                    if name in dataset_columns:
+                        key = cast("GenerativeDatasetColumnType", column_type)
+                        mappings[key].append((index, name))
+                        break
+
+        return mappings
+
+    @classmethod
+    def datasets_mappings(
+        cls,
+        datasets: list[Dataset | IterableDataset],
+        input_mappings: dict[GenerativeDatasetColumnType, str | list[str]],
+    ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+        mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+            defaultdict(list)
+        )
+        datasets_named_indices = {
+            (
+                dataset.info.dataset_name
+                if dataset.info and dataset.info.dataset_name
+                else index
+            ): index
+            for index, dataset in enumerate(datasets)
+        }
+        datasets_columns = {
+            index: dataset.column_names or list(next(iter(dataset)).keys())
+            for index, dataset in enumerate(datasets)
+        }
+
+        # Parse out user mappings that were passed in and validate them
+        # Must be in the format of:
+        # {<column_type>: [<column_names>]}
+        # where <column_names> can be a single string or list of strings
+        # and each string can be any of:
+        # - a column name (assumes the first dataset was intended)
+        # - <int>.<column_name> where <int> is the dataset index
+        # - <str>.<column_name> where <str> is the dataset name
+        for column_type, names in input_mappings.items():
+            mappings[column_type] = []
+            for name in names if isinstance(names, list) else [names]:
+                if "." in name:
+                    dataset, column_name = name.split(".", 1)
+                    dataset_index = (
+                        int(dataset)
+                        if dataset.isdigit()
+                        else datasets_named_indices.get(dataset)
+                    )
+                else:
+                    dataset_index = 0
+                    column_name = name
+
+                if dataset_index is None or dataset_index >= len(datasets):
+                    raise ValueError(
+                        f"Dataset '{name}' not found in datasets: "
+                        f"{datasets_named_indices}."
+                    )
+                if column_name not in datasets_columns[dataset_index]:
+                    raise ValueError(
+                        f"Column '{column_name}' not found in dataset "
+                        f"'{datasets[dataset_index]}' "
+                        f"columns: {datasets_columns[dataset_index]}."
+                    )
+                mappings[column_type].append((dataset_index, column_name))
+
+        return mappings
+
+    def __init__(
+        self,
+        column_mappings: dict[GenerativeDatasetColumnType, str | list[str]]
+        | None = None,
+    ):
+        self.input_mappings = column_mappings
+        self.datasets_column_mappings: (
+            dict[GenerativeDatasetColumnType, list[tuple[int, str]]] | None
+        )
+
+    def __call__(
+        self, row: dict[str, Any]
+    ) -> dict[GenerativeDatasetColumnType, list[Any]]:
+        if self.datasets_column_mappings is None:
+            raise ValueError("DefaultGenerativeColumnMapper not setup with data.")
+
+        items = cast("dict[int, dict[str, Any]]", row.pop("items"))
+        mapped: dict[GenerativeDatasetColumnType, list[Any]] = defaultdict(list)
+
+        for column_type, column_mappings in self.datasets_column_mappings.items():
+            for (
+                dataset_index,
+                dataset_column,
+            ) in column_mappings:
+                mapped[column_type].append(items[dataset_index][dataset_column])
+
+        return dict(mapped)
+
+    def setup_data(
+        self,
+        datasets: list[Dataset | IterableDataset],
+        data_args: list[dict[str, Any]],
+    ):
+        _ = data_args  # Unused for this mapper
+        self.datasets_column_mappings = (
+            self.datasets_default_mappings(datasets)
+            if self.input_mappings is None
+            else self.datasets_mappings(datasets, self.input_mappings)
+        )
guidellm/data/preprocessors/preprocessor.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from typing import Any, Protocol, Union, runtime_checkable
+
+from datasets import Dataset, IterableDataset
+
+from guidellm.utils import RegistryMixin
+
+__all__ = ["DataDependentPreprocessor", "DatasetPreprocessor", "PreprocessorRegistry"]
+
+
+@runtime_checkable
+class DatasetPreprocessor(Protocol):
+    def __call__(self, item: dict[str, Any]) -> dict[str, Any]: ...
+
+
+@runtime_checkable
+class DataDependentPreprocessor(DatasetPreprocessor, Protocol):
+    def setup_data(
+        self,
+        datasets: list[Dataset | IterableDataset],
+        data_args: list[dict[str, Any]],
+    ): ...
+
+
+class PreprocessorRegistry(
+    RegistryMixin[Union[DataDependentPreprocessor, type[DataDependentPreprocessor]]]
+):
+    pass
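
Editor's note: these protocols define the minimal preprocessor interface, and PreprocessorRegistry (via RegistryMixin) is how the formatters and mappers above register themselves. A small sketch of plugging in a custom preprocessor; the registry key and class here are hypothetical, and the decorator usage mirrors the files above.

from typing import Any

from guidellm.data.preprocessors.preprocessor import (
    DatasetPreprocessor,
    PreprocessorRegistry,
)


@PreprocessorRegistry.register("uppercase_text")  # hypothetical registry key
class UppercaseText:
    def __call__(self, item: dict[str, Any]) -> dict[str, Any]:
        # Upper-case every string field, leave everything else untouched.
        return {
            key: value.upper() if isinstance(value, str) else value
            for key, value in item.items()
        }


# Both protocols are runtime_checkable, so structural checks work without subclassing.
assert isinstance(UppercaseText(), DatasetPreprocessor)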
guidellm/data/processor.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from typing import Any
+
+from transformers import (  # type: ignore[import]
+    AutoTokenizer,
+    PreTrainedTokenizerBase,
+)
+
+__all__ = ["ProcessorFactory"]
+
+
+class ProcessorFactory:
+    def __init__(
+        self,
+        processor: str | PreTrainedTokenizerBase,
+        processor_args: dict[str, Any] | None = None,
+    ) -> None:
+        self.processor = processor
+        self.processor_args = processor_args or {}
+
+    def __call__(self) -> PreTrainedTokenizerBase:
+        if isinstance(self.processor, PreTrainedTokenizerBase):
+            return self.processor
+        else:
+            self.processor = AutoTokenizer.from_pretrained(
+                self.processor,
+                **(self.processor_args or {}),
+            )
+            return self.processor
guidellm/data/schemas.py
@@ -0,0 +1,13 @@
+from typing import Literal
+
+__all__ = ["GenerativeDatasetColumnType"]
+
+GenerativeDatasetColumnType = Literal[
+    "prompt_tokens_count_column",
+    "output_tokens_count_column",
+    "prefix_column",
+    "text_column",
+    "image_column",
+    "video_column",
+    "audio_column",
+]
guidellm/data/utils/__init__.py
@@ -0,0 +1,10 @@
+from .dataset import DEFAULT_SPLITS, resolve_dataset_split
+from .functions import (
+    text_stats,
+)
+
+__all__ = [
+    "DEFAULT_SPLITS",
+    "resolve_dataset_split",
+    "text_stats",
+]