guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/data/deserializers/__init__.py
@@ -0,0 +1,49 @@
+ from .deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+ from .file import (
+     ArrowFileDatasetDeserializer,
+     CSVFileDatasetDeserializer,
+     DBFileDatasetDeserializer,
+     HDF5FileDatasetDeserializer,
+     JSONFileDatasetDeserializer,
+     ParquetFileDatasetDeserializer,
+     TarFileDatasetDeserializer,
+     TextFileDatasetDeserializer,
+ )
+ from .huggingface import HuggingFaceDatasetDeserializer
+ from .memory import (
+     InMemoryCsvDatasetDeserializer,
+     InMemoryDictDatasetDeserializer,
+     InMemoryDictListDatasetDeserializer,
+     InMemoryItemListDatasetDeserializer,
+     InMemoryJsonStrDatasetDeserializer,
+ )
+ from .synthetic import (
+     SyntheticTextDataset,
+     SyntheticTextDatasetDeserializer,
+ )
+
+ __all__ = [
+     "ArrowFileDatasetDeserializer",
+     "CSVFileDatasetDeserializer",
+     "DBFileDatasetDeserializer",
+     "DataNotSupportedError",
+     "DatasetDeserializer",
+     "DatasetDeserializerFactory",
+     "HDF5FileDatasetDeserializer",
+     "HuggingFaceDatasetDeserializer",
+     "InMemoryCsvDatasetDeserializer",
+     "InMemoryDictDatasetDeserializer",
+     "InMemoryDictListDatasetDeserializer",
+     "InMemoryItemListDatasetDeserializer",
+     "InMemoryJsonStrDatasetDeserializer",
+     "JSONFileDatasetDeserializer",
+     "ParquetFileDatasetDeserializer",
+     "SyntheticTextDataset",
+     "SyntheticTextDatasetDeserializer",
+     "TarFileDatasetDeserializer",
+     "TextFileDatasetDeserializer",
+ ]
guidellm/data/deserializers/deserializer.py
@@ -0,0 +1,141 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from typing import Any, Protocol, Union, runtime_checkable
+
+ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.schemas import DataNotSupportedError
+ from guidellm.data.utils import resolve_dataset_split
+ from guidellm.utils import RegistryMixin
+
+ __all__ = [
+     "DatasetDeserializer",
+     "DatasetDeserializerFactory",
+ ]
+
+
+ @runtime_checkable
+ class DatasetDeserializer(Protocol):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
+
+
+ class DatasetDeserializerFactory(
+     RegistryMixin[Union["type[DatasetDeserializer]", DatasetDeserializer]],
+ ):
+     @classmethod
+     def deserialize(
+         cls,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         type_: str | None = None,
+         resolve_split: bool = True,
+         select_columns: list[str] | None = None,
+         remove_columns: list[str] | None = None,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset:
+         dataset: Dataset
+
+         if type_ is None:
+             dataset = cls._deserialize_with_registered_deserializers(
+                 data, processor_factory, random_seed, **data_kwargs
+             )
+
+         else:
+             dataset = cls._deserialize_with_specified_deserializer(
+                 data, type_, processor_factory, random_seed, **data_kwargs
+             )
+
+         if resolve_split:
+             dataset = resolve_dataset_split(dataset)
+
+         if select_columns is not None or remove_columns is not None:
+             column_names = dataset.column_names or list(next(iter(dataset)).keys())
+             if select_columns is not None:
+                 remove_columns = [
+                     col for col in column_names if col not in select_columns
+                 ]
+
+             dataset = dataset.remove_columns(remove_columns)
+
+         return dataset
+
+     @classmethod
+     def _deserialize_with_registered_deserializers(
+         cls,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if cls.registry is None:
+             raise RuntimeError("registry is None; cannot deserialize dataset")
+         dataset: Dataset | None = None
+
+         errors: dict[str, Exception] = {}
+         # Note: There is no priority order for the deserializers, so all deserializers
+         # must be mutually exclusive to ensure deterministic behavior.
+         for _name, deserializer in cls.registry.items():
+             deserializer_fn: DatasetDeserializer = (
+                 deserializer() if isinstance(deserializer, type) else deserializer
+             )
+
+             try:
+                 dataset = deserializer_fn(
+                     data=data,
+                     processor_factory=processor_factory,
+                     random_seed=random_seed,
+                     **data_kwargs,
+                 )
+             except Exception as e:  # noqa: BLE001 # The exceptions are saved.
+                 errors[_name] = e
+
+         if dataset is not None:
+             return dataset  # Success
+
+         if len(errors) > 0:
+             err_msgs = ""
+
+             def sort_key(item):
+                 return (isinstance(item[1], DataNotSupportedError), item[0])
+
+             for key, err in sorted(errors.items(), key=sort_key):
+                 err_msgs += f"\n - Deserializer '{key}': ({type(err).__name__}) {err}"
+             raise ValueError(
+                 "Data deserialization failed, likely because the input doesn't "
+                 f"match any of the input formats. See the {len(errors)} error(s) that "
+                 f"occurred while attempting to deserialize the data {data}:{err_msgs}"
+             )
+         return dataset
+
+     @classmethod
+     def _deserialize_with_specified_deserializer(
+         cls,
+         data: Any,
+         type_: str,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         deserializer_from_type = cls.get_registered_object(type_)
+         if deserializer_from_type is None:
+             raise ValueError(f"Deserializer type '{type_}' is not registered.")
+         if isinstance(deserializer_from_type, type):
+             deserializer_fn = deserializer_from_type()
+         else:
+             deserializer_fn = deserializer_from_type
+
+         return deserializer_fn(
+             data=data,
+             processor_factory=processor_factory,
+             random_seed=random_seed,
+             **data_kwargs,
+         )
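
For orientation, here is a minimal sketch of how the factory above is used. The "lines_file" name, the ".lines" suffix, and the sample path are illustrative and not part of the package; the register decorator and the deserialize signature come from the diff above.

from pathlib import Path

from datasets import Dataset

from guidellm.data.deserializers import (
    DataNotSupportedError,
    DatasetDeserializerFactory,
)


@DatasetDeserializerFactory.register("lines_file")  # hypothetical format name
class LinesFileDatasetDeserializer:
    def __call__(self, data, processor_factory, random_seed, **data_kwargs):
        # Raising DataNotSupportedError lets the factory fall through to the
        # other registered deserializers when probing without an explicit type_.
        if not isinstance(data, (str, Path)) or not str(data).endswith(".lines"):
            raise DataNotSupportedError(f"expected a .lines file, got {data}")
        return Dataset.from_dict({"text": Path(data).read_text().splitlines()})


# With type_ set, only the named deserializer runs; without it, every
# registered deserializer is tried (they are expected to be mutually exclusive).
dataset = DatasetDeserializerFactory.deserialize(
    "samples.lines",                 # hypothetical local file
    processor_factory=lambda: None,  # tokenizer is unused by this deserializer
    type_="lines_file",
)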
guidellm/data/deserializers/file.py
@@ -0,0 +1,223 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+ from datasets import Dataset, load_dataset
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = [
+     "ArrowFileDatasetDeserializer",
+     "CSVFileDatasetDeserializer",
+     "DBFileDatasetDeserializer",
+     "HDF5FileDatasetDeserializer",
+     "JSONFileDatasetDeserializer",
+     "ParquetFileDatasetDeserializer",
+     "TarFileDatasetDeserializer",
+     "TextFileDatasetDeserializer",
+ ]
+
+
+ @DatasetDeserializerFactory.register("text_file")
+ class TextFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".txt", ".text"}
+         ):
+             raise DataNotSupportedError(
+                 "Unsupported data for TextFileDatasetDeserializer, "
+                 f"expected str or Path to a local .txt or .text file, got {data}"
+             )
+
+         with path.open() as file:
+             lines = file.readlines()
+
+         return Dataset.from_dict({"text": lines}, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("csv_file")
+ class CSVFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".csv"
+         ):
+             raise DataNotSupportedError(
+                 "Unsupported data for CSVFileDatasetDeserializer, "
+                 f"expected str or Path to a valid local .csv file, got {data}"
+             )
+
+         return load_dataset("csv", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("json_file")
+ class JSONFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".json", ".jsonl"}
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for JSONFileDatasetDeserializer, "
+                 f"expected str or Path to a local .json or .jsonl file, got {data}"
+             )
+
+         return load_dataset("json", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("parquet_file")
+ class ParquetFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".parquet"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for ParquetFileDatasetDeserializer, "
+                 f"expected str or Path to a local .parquet file, got {data}"
+             )
+
+         return load_dataset("parquet", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("arrow_file")
+ class ArrowFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".arrow"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for ArrowFileDatasetDeserializer, "
+                 f"expected str or Path to a local .arrow file, got {data}"
+             )
+
+         return load_dataset("arrow", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("hdf5_file")
+ class HDF5FileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".hdf5", ".h5"}
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for HDF5FileDatasetDeserializer, "
+                 f"expected str or Path to a local .hdf5 or .h5 file, got {data}"
+             )
+
+         return Dataset.from_pandas(pd.read_hdf(str(path)), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("db_file")
+ class DBFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> dict[str, list]:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".db"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for DBFileDatasetDeserializer, "
+                 f"expected str or Path to a local .db file, got {data}"
+             )
+
+         return Dataset.from_sql(con=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("tar_file")
+ class TarFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> dict[str, list]:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".tar"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for TarFileDatasetDeserializer, "
+                 f"expected str or Path to a local .tar file, got {data}"
+             )
+
+         return load_dataset("webdataset", data_files=str(path), **data_kwargs)
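
Each file deserializer keys off the path suffix, so the factory's probing resolves local files without an explicit type_. A minimal usage sketch, assuming a hypothetical prompts.csv on disk:

from guidellm.data.deserializers import DatasetDeserializerFactory

# Probing: every registered deserializer is tried; only csv_file accepts a
# .csv path, the rest raise DataNotSupportedError.
dataset = DatasetDeserializerFactory.deserialize(
    "prompts.csv",                   # hypothetical local file
    processor_factory=lambda: None,  # unused by the file deserializers
)

# Equivalent, skipping the probe entirely:
dataset = DatasetDeserializerFactory.deserialize(
    "prompts.csv",
    processor_factory=lambda: None,
    type_="csv_file",
)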
guidellm/data/deserializers/huggingface.py
@@ -0,0 +1,94 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Any
+
+ from datasets import (
+     Dataset,
+     DatasetDict,
+     IterableDataset,
+     IterableDatasetDict,
+     load_dataset,
+     load_from_disk,
+ )
+ from datasets.exceptions import (
+     DataFilesNotFoundError,
+     DatasetNotFoundError,
+     FileNotFoundDatasetsError,
+ )
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = ["HuggingFaceDatasetDeserializer"]
+
+
+ @DatasetDeserializerFactory.register("huggingface")
+ class HuggingFaceDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
+         _ = (processor_factory, random_seed)
+
+         if isinstance(
+             data, Dataset | IterableDataset | DatasetDict | IterableDatasetDict
+         ):
+             return data
+
+         load_error = None
+
+         if (
+             isinstance(data, str | Path)
+             and (path := Path(data)).exists()
+             and ((path.is_file() and path.suffix == ".py") or path.is_dir())
+         ):
+             # Handle python script or nested python script in a directory
+             try:
+                 return load_dataset(str(data), **data_kwargs)
+             except (
+                 FileNotFoundDatasetsError,
+                 DatasetNotFoundError,
+                 DataFilesNotFoundError,
+             ) as err:
+                 load_error = err
+             except Exception:  # noqa: BLE001
+                 # Try loading as a local dataset directory next
+                 try:
+                     return load_from_disk(str(data), **data_kwargs)
+                 except (
+                     FileNotFoundDatasetsError,
+                     DatasetNotFoundError,
+                     DataFilesNotFoundError,
+                 ) as err2:
+                     load_error = err2
+
+         try:
+             # Handle dataset identifier from the Hugging Face Hub
+             return load_dataset(str(data), **data_kwargs)
+         except (
+             FileNotFoundDatasetsError,
+             DatasetNotFoundError,
+             DataFilesNotFoundError,
+         ) as err:
+             load_error = err
+
+         not_supported = DataNotSupportedError(
+             "Unsupported data for HuggingFaceDatasetDeserializer, "
+             "expected Dataset, IterableDataset, DatasetDict, IterableDatasetDict, "
+             "str or Path to a local dataset directory or a local .py dataset script, "
+             f"got {data} and HF load error: {load_error}"
+         )
+
+         if load_error is not None:
+             raise not_supported from load_error
+         else:
+             raise not_supported
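
The resolution order above is: pass through objects that are already datasets, then try a local .py script or dataset directory (load_dataset with a load_from_disk fallback), then treat the input as a Hub identifier. A sketch under those assumptions; the dataset id is illustrative:

from guidellm.data.deserializers import HuggingFaceDatasetDeserializer

hf = HuggingFaceDatasetDeserializer()

# Already-loaded datasets pass straight through; strings and paths fall through
# the local-script, local-directory, and Hub-identifier attempts in that order.
dataset = hf(
    "username/my-dataset",           # illustrative Hub identifier
    processor_factory=lambda: None,  # unused by this deserializer
    random_seed=42,
)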
guidellm/data/deserializers/memory.py
@@ -0,0 +1,194 @@
+ from __future__ import annotations
+
+ import contextlib
+ import csv
+ import json
+ from collections.abc import Callable
+ from io import StringIO
+ from typing import Any, cast
+
+ from datasets import Dataset
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = [
+     "InMemoryCsvDatasetDeserializer",
+     "InMemoryDictDatasetDeserializer",
+     "InMemoryDictListDatasetDeserializer",
+     "InMemoryItemListDatasetDeserializer",
+     "InMemoryJsonStrDatasetDeserializer",
+ ]
+
+
+ @DatasetDeserializerFactory.register("in_memory_dict")
+ class InMemoryDictDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not data
+             or not isinstance(data, dict)
+             or not all(
+                 isinstance(key, str) and isinstance(val, list)
+                 for key, val in data.items()
+             )
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryDictDatasetDeserializer, "
+                 f"expected dict[str, list], got {data}"
+             )
+
+         rows = len(list(data.values())[0])
+         if not all(len(val) == rows for val in data.values()):
+             raise DataNotSupportedError(
+                 "All lists in the data dictionary must have the same length, "
+                 f"expected {rows} for all keys {list(data.keys())}"
+             )
+
+         return Dataset.from_dict(data, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_dict_list")
+ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not data
+             or not isinstance(data, list)
+             or not all(isinstance(item, dict) for item in data)
+             or not all(isinstance(key, str) for item in data for key in item)
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryDictListDatasetDeserializer, "
+                 f"expected list of dicts, got {data}"
+             )
+
+         typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+         first_keys = set(typed_data[0].keys())
+         for index, item in enumerate(typed_data):
+             if set(item.keys()) != first_keys:
+                 raise DataNotSupportedError(
+                     f"All dictionaries must have the same keys. "
+                     f"Expected keys: {first_keys}, "
+                     f"got keys at index {index}: {set(item.keys())}"
+                 )
+
+         # Convert list of dicts to dict of lists
+         result_dict: dict = {key: [] for key in first_keys}
+         for item in typed_data:
+             for key, value in item.items():
+                 result_dict[key].append(value)
+
+         return Dataset.from_dict(result_dict, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_item_list")
+ class InMemoryItemListDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         primitive_types = (str, int, float, bool, type(None))
+         if (
+             not data
+             or not isinstance(data, list)
+             or not all(isinstance(item, primitive_types) for item in data)
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryItemListDatasetDeserializer, "
+                 f"expected list of primitive items, got {data}"
+             )
+
+         column_name = data_kwargs.pop("column_name", "data")
+
+         return Dataset.from_dict({column_name: data}, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_json_str")
+ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if (
+             isinstance(data, str)
+             and (json_str := data.strip())
+             and (
+                 (json_str.startswith("{") and json_str.endswith("}"))
+                 or (json_str.startswith("[") and json_str.endswith("]"))
+             )
+         ):
+             with contextlib.suppress(Exception):
+                 parsed_data = json.loads(data)
+
+                 deserializers = [
+                     InMemoryDictDatasetDeserializer(),
+                     InMemoryDictListDatasetDeserializer(),
+                     InMemoryItemListDatasetDeserializer(),
+                 ]
+
+                 for deserializer in deserializers:
+                     with contextlib.suppress(DataNotSupportedError):
+                         return deserializer(
+                             parsed_data, processor_factory, random_seed, **data_kwargs
+                         )
+
+         raise DataNotSupportedError(
+             f"Unsupported data for InMemoryJsonStrDatasetDeserializer, "
+             f"expected JSON string with a list or dict of items, got {data}"
+         )
+
+
+ @DatasetDeserializerFactory.register("in_memory_csv_str")
+ class InMemoryCsvDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if (
+             isinstance(data, str)
+             and (csv_str := data.strip())
+             and len(csv_str.split("\n")) > 0
+         ):
+             with contextlib.suppress(Exception):
+                 csv_buffer = StringIO(data)
+                 reader = csv.DictReader(csv_buffer)
+                 rows = list(reader)
+
+                 return InMemoryDictListDatasetDeserializer()(
+                     rows, processor_factory, random_seed, **data_kwargs
+                 )
+
+         raise DataNotSupportedError(
+             f"Unsupported data for InMemoryCsvDatasetDeserializer, "
+             f"expected CSV string, got {type(data)}"
+         )
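
For reference, a minimal sketch of the two dict-shaped inputs the deserializers above accept; the column names and values are illustrative:

from guidellm.data.deserializers import (
    InMemoryDictDatasetDeserializer,
    InMemoryDictListDatasetDeserializer,
)

processor = lambda: None  # processor_factory is unused by these deserializers

# Column-oriented: dict[str, list] with equal-length value lists.
columns = InMemoryDictDatasetDeserializer()(
    {"prompt": ["a", "b"], "label": [0, 1]}, processor, random_seed=42
)

# Row-oriented: list[dict] where every row shares the same keys; internally
# converted to the column form before Dataset.from_dict.
rows = InMemoryDictListDatasetDeserializer()(
    [{"prompt": "a", "label": 0}, {"prompt": "b", "label": 1}],
    processor,
    random_seed=42,
)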