guidellm-0.3.1-py3-none-any.whl → guidellm-0.6.0a5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/dataset/creator.py DELETED
@@ -1,213 +0,0 @@
- from abc import ABC, abstractmethod
- from pathlib import Path
- from typing import Any, Literal, Optional, Union
-
- from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
- from transformers import PreTrainedTokenizerBase  # type: ignore[import]
-
- __all__ = ["ColumnInputTypes", "DatasetCreator"]
-
- ColumnInputTypes = Literal[
-     "prompt_column",
-     "text_column",
-     "prompt_tokens_count_column",
-     "output_tokens_count_column",
- ]
-
-
- class DatasetCreator(ABC):
-     DEFAULT_SPLITS_TRAIN = [
-         "train",
-         "training",
-         "train_set",
-         "training_set",
-         "train_dataset",
-         "training_dataset",
-         "train_data",
-         "training_data",
-         "pretrain",
-         "pretrain_set",
-         "pretrain_dataset",
-         "pretrain_data",
-         "pretraining",
-     ]
-     DEFAULT_SPLITS_CALIB = [
-         "calibration",
-         "calib",
-         "cal",
-         "calibration_set",
-         "calib_set",
-         "cal_set",
-         "calibration_dataset",
-         "calib_dataset",
-         "cal_set",
-         "calibration_data",
-         "calib_data",
-         "cal_data",
-     ]
-     DEFAULT_SPLITS_VAL = [
-         "validation",
-         "val",
-         "valid",
-         "validation_set",
-         "val_set",
-         "validation_dataset",
-         "val_dataset",
-         "validation_data",
-         "val_data",
-         "dev",
-         "dev_set",
-         "dev_dataset",
-         "dev_data",
-     ]
-     DEFAULT_SPLITS_TEST = [
-         "test",
-         "testing",
-         "test_set",
-         "testing_set",
-         "test_dataset",
-         "testing_dataset",
-         "test_data",
-         "testing_data",
-         "eval",
-         "eval_set",
-         "eval_dataset",
-         "eval_data",
-     ]
-     DEFAULT_SPLITS_DATASET: dict[str, str] = {}
-
-     @classmethod
-     def create(
-         cls,
-         data: Any,
-         data_args: Optional[dict[str, Any]],
-         processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
-         processor_args: Optional[dict[str, Any]],
-         random_seed: int = 42,
-         split_pref_order: Optional[list[str]] = None,
-     ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
-         if not cls.is_supported(data, data_args):
-             raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
-
-         split = cls.extract_args_split(data_args)
-         column_mappings = cls.extract_args_column_mappings(data_args)
-         dataset = cls.handle_create(
-             data, data_args, processor, processor_args, random_seed
-         )
-
-         if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
-             dataset = cls.extract_dataset_split(dataset, split, split_pref_order)
-
-         if not isinstance(dataset, (Dataset, IterableDataset)):
-             raise ValueError(
-                 f"Unsupported data type: {type(dataset)} given for {dataset}."
-             )
-
-         return dataset, column_mappings
-
-     @classmethod
-     def extract_args_split(cls, data_args: Optional[dict[str, Any]]) -> str:
-         split = "auto"
-
-         if data_args and "split" in data_args:
-             split = data_args["split"]
-             del data_args["split"]
-
-         return split
-
-     @classmethod
-     def extract_args_column_mappings(
-         cls,
-         data_args: Optional[dict[str, Any]],
-     ) -> dict[ColumnInputTypes, str]:
-         columns: dict[ColumnInputTypes, str] = {}
-
-         if data_args:
-             if "prompt_column" in data_args:
-                 columns["prompt_column"] = data_args["prompt_column"]
-                 del data_args["prompt_column"]
-
-             if "prompt_tokens_count_column" in data_args:
-                 columns["prompt_tokens_count_column"] = data_args[
-                     "prompt_tokens_count_column"
-                 ]
-                 del data_args["prompt_tokens_count_column"]
-
-             if "output_tokens_count_column" in data_args:
-                 columns["output_tokens_count_column"] = data_args[
-                     "output_tokens_count_column"
-                 ]
-                 del data_args["output_tokens_count_column"]
-
-         return columns
-
-     @classmethod
-     def extract_dataset_name(
-         cls, dataset: Union[Dataset, IterableDataset, DatasetDict, IterableDatasetDict]
-     ) -> Optional[str]:
-         if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
-             dataset = dataset[list(dataset.keys())[0]]
-
-         if isinstance(dataset, (Dataset, IterableDataset)):
-             if not hasattr(dataset, "info") or not hasattr(
-                 dataset.info, "dataset_name"
-             ):
-                 return None
-
-             return dataset.info.dataset_name
-
-         raise ValueError(f"Unsupported data type: {type(dataset)} given for {dataset}.")
-
-     @classmethod
-     def extract_dataset_split(
-         cls,
-         dataset: Union[DatasetDict, IterableDatasetDict],
-         specified_split: Union[Literal["auto"], str] = "auto",
-         split_pref_order: Optional[Union[Literal["auto"], list[str]]] = "auto",
-     ) -> Union[Dataset, IterableDataset]:
-         if not isinstance(dataset, (DatasetDict, IterableDatasetDict)):
-             raise ValueError(
-                 f"Unsupported data type: {type(dataset)} given for {dataset}."
-             )
-
-         if specified_split != "auto":
-             if specified_split not in dataset:
-                 raise ValueError(
-                     f"Split {specified_split} not found in dataset {dataset}."
-                 )
-
-             return dataset[specified_split]
-
-         dataset_name = cls.extract_dataset_name(dataset)
-
-         if dataset_name and dataset_name in cls.DEFAULT_SPLITS_DATASET:
-             return dataset[cls.DEFAULT_SPLITS_DATASET[dataset_name]]
-
-         if split_pref_order == "auto":
-             split_pref_order = [
-                 *cls.DEFAULT_SPLITS_TEST,
-                 *cls.DEFAULT_SPLITS_VAL,
-                 *cls.DEFAULT_SPLITS_CALIB,
-                 *cls.DEFAULT_SPLITS_TRAIN,
-             ]
-
-         for test_split in split_pref_order or []:
-             if test_split in dataset:
-                 return dataset[test_split]
-
-         return dataset[list(dataset.keys())[0]]
-
-     @classmethod
-     @abstractmethod
-     def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool: ...
-
-     @classmethod
-     @abstractmethod
-     def handle_create(
-         cls,
-         data: Any,
-         data_args: Optional[dict[str, Any]],
-         processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
-         processor_args: Optional[dict[str, Any]],
-         random_seed: int,
-     ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: ...
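Note: the removed DatasetCreator was an abstract base; is_supported and handle_create were the only hooks a subclass had to provide, with create() handling validation, split selection, and column-mapping extraction on top. A minimal sketch of how that API fit together, assuming guidellm 0.3.1 and its datasets/transformers dependencies are installed (ListDatasetCreator is a hypothetical name, not part of either release):

from pathlib import Path
from typing import Any, Optional, Union

from datasets import Dataset
from transformers import PreTrainedTokenizerBase

from guidellm.dataset.creator import DatasetCreator


class ListDatasetCreator(DatasetCreator):  # hypothetical example subclass
    @classmethod
    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:
        # only accept plain lists of strings
        return isinstance(data, list) and all(isinstance(item, str) for item in data)

    @classmethod
    def handle_create(
        cls,
        data: Any,
        data_args: Optional[dict[str, Any]],
        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
        processor_args: Optional[dict[str, Any]],
        random_seed: int,
    ) -> Dataset:
        # wrap the list in a single-column datasets.Dataset
        return Dataset.from_dict({"text": list(data)}, **(data_args or {}))


# create() validates the input, builds the dataset, and returns column mappings
dataset, columns = ListDatasetCreator.create(
    ["hello", "world"], None, None, None, random_seed=42
)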
guidellm/dataset/entrypoints.py DELETED
@@ -1,42 +0,0 @@
- from pathlib import Path
- from typing import Any, Optional, Union
-
- from datasets import Dataset, IterableDataset
- from transformers import PreTrainedTokenizerBase  # type: ignore[import]
-
- from guidellm.dataset.creator import ColumnInputTypes
- from guidellm.dataset.file import FileDatasetCreator
- from guidellm.dataset.hf_datasets import HFDatasetsCreator
- from guidellm.dataset.in_memory import InMemoryDatasetCreator
- from guidellm.dataset.synthetic import SyntheticDatasetCreator
-
- __all__ = ["load_dataset"]
-
-
- def load_dataset(
-     data: Any,
-     data_args: Optional[dict[str, Any]],
-     processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
-     processor_args: Optional[dict[str, Any]],
-     random_seed: int = 42,
-     split_pref_order: Optional[list[str]] = None,
- ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
-     creators = [
-         InMemoryDatasetCreator,
-         SyntheticDatasetCreator,
-         FileDatasetCreator,
-         HFDatasetsCreator,
-     ]
-
-     for creator in creators:
-         if creator.is_supported(data, data_args):
-             return creator.create(
-                 data,
-                 data_args,
-                 processor,
-                 processor_args,
-                 random_seed,
-                 split_pref_order,
-             )
-
-     raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
guidellm/dataset/file.py DELETED
@@ -1,92 +0,0 @@
- from pathlib import Path
- from typing import Any, Optional, Union
-
- import pandas as pd  # type: ignore[import]
- from datasets import (
-     Dataset,
-     DatasetDict,
-     IterableDataset,
-     IterableDatasetDict,
-     load_dataset,
- )
- from transformers import PreTrainedTokenizerBase  # type: ignore[import]
-
- from guidellm.dataset.creator import DatasetCreator
-
- __all__ = ["FileDatasetCreator"]
-
-
- class FileDatasetCreator(DatasetCreator):
-     SUPPORTED_TYPES = {
-         ".txt",
-         ".text",
-         ".csv",
-         ".json",
-         ".jsonl",
-         ".parquet",
-         ".arrow",
-         ".hdf5",
-         ".tar",
-     }
-
-     @classmethod
-     def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
-         if isinstance(data, (str, Path)) and (path := Path(data)).exists():
-             # local folder or py file, assume supported
-             return path.suffix.lower() in cls.SUPPORTED_TYPES
-
-         return False
-
-     @classmethod
-     def handle_create(
-         cls,
-         data: Any,
-         data_args: Optional[dict[str, Any]],
-         processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
-         processor_args: Optional[dict[str, Any]],  # noqa: ARG003
-         random_seed: int,  # noqa: ARG003
-     ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
-         if not isinstance(data, (str, Path)):
-             raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
-
-         path = Path(data)
-         if not path.exists():
-             raise FileNotFoundError(f"File not found: {path}")
-
-         if not path.is_file():
-             raise ValueError(f"Unsupported data type: {path} given for {path}. ")
-
-         if path.suffix.lower() not in cls.SUPPORTED_TYPES:
-             raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
-
-         return cls.load_dataset(path, data_args)
-
-     @classmethod
-     def load_dataset(
-         cls, path: Path, data_args: Optional[dict[str, Any]]
-     ) -> Union[Dataset, IterableDataset]:
-         if path.suffix.lower() in {".txt", ".text"}:
-             with path.open("r") as file:
-                 items = file.readlines()
-
-             dataset = Dataset.from_dict({"text": items}, **(data_args or {}))
-         elif path.suffix.lower() == ".csv":
-             dataset = load_dataset("csv", data_files=str(path), **(data_args or {}))
-         elif path.suffix.lower() in {".json", ".jsonl"}:
-             dataset = load_dataset("json", data_files=str(path), **(data_args or {}))
-         elif path.suffix.lower() == ".parquet":
-             dataset = load_dataset("parquet", data_files=str(path), **(data_args or {}))
-         elif path.suffix.lower() == ".arrow":
-             dataset = load_dataset("arrow", data_files=str(path), **(data_args or {}))
-         elif path.suffix.lower() == ".hdf5":
-             dataset = Dataset.from_pandas(pd.read_hdf(str(path)), **(data_args or {}))
-         elif path.suffix.lower() == ".db":
-             dataset = Dataset.from_sql(con=str(path), **(data_args or {}))
-         elif path.suffix.lower() == ".tar":
-             dataset = load_dataset(
-                 "webdataset", data_files=str(path), **(data_args or {})
-             )
-         else:
-             raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
-
-         return dataset
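For reference, a short usage sketch of the removed file-based path (assumes guidellm 0.3.1 is installed; the JSONL file written here is purely illustrative). The file suffix decides which branch of load_dataset above runs:

import json
from pathlib import Path

from guidellm.dataset.file import FileDatasetCreator

# write a tiny JSONL file; the ".jsonl" suffix selects the HF "json" loader branch
path = Path("prompts.jsonl")
path.write_text(
    "\n".join(json.dumps({"prompt": p}) for p in ["Hello", "World"]) + "\n"
)

if FileDatasetCreator.is_supported(path, None):  # suffix checked against SUPPORTED_TYPES
    dataset, column_mappings = FileDatasetCreator.create(
        path, {"prompt_column": "prompt"}, None, None, random_seed=42
    )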
guidellm/dataset/hf_datasets.py DELETED
@@ -1,62 +0,0 @@
- from pathlib import Path
- from typing import Any, Optional, Union
-
- from datasets import (
-     Dataset,
-     DatasetDict,
-     IterableDataset,
-     IterableDatasetDict,
-     get_dataset_config_info,
-     load_dataset,
- )
- from transformers import PreTrainedTokenizerBase  # type: ignore[import]
-
- from guidellm.dataset.creator import DatasetCreator
-
- __all__ = ["HFDatasetsCreator"]
-
-
- class HFDatasetsCreator(DatasetCreator):
-     @classmethod
-     def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
-         if isinstance(
-             data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
-         ):
-             # base type is supported
-             return True
-
-         if isinstance(data, (str, Path)) and (path := Path(data)).exists():
-             # local folder or py file, assume supported
-             return path.is_dir() or path.suffix == ".py"
-
-         if isinstance(data, (str, Path)):
-             try:
-                 # try to load dataset
-                 return get_dataset_config_info(data) is not None
-             except Exception:  # noqa: BLE001, S110
-                 pass
-
-         return False
-
-     @classmethod
-     def handle_create(
-         cls,
-         data: Any,
-         data_args: Optional[dict[str, Any]],
-         processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
-         processor_args: Optional[dict[str, Any]],  # noqa: ARG003
-         random_seed: int,  # noqa: ARG003
-     ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
-         if isinstance(data, (str, Path)):
-             data = load_dataset(data, **(data_args or {}))
-         elif data_args:
-             raise ValueError(
-                 f"data_args should not be provided when data is a {type(data)}"
-             )
-
-         if isinstance(
-             data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
-         ):
-             return data
-
-         raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
guidellm/dataset/in_memory.py DELETED
@@ -1,132 +0,0 @@
- from collections.abc import Iterable
- from pathlib import Path
- from typing import Any, Optional, Union
-
- from datasets import (
-     Dataset,
-     DatasetDict,
-     IterableDataset,
-     IterableDatasetDict,
- )
- from transformers import PreTrainedTokenizerBase  # type: ignore[import]
-
- from guidellm.dataset.creator import DatasetCreator
-
- __all__ = ["InMemoryDatasetCreator"]
-
-
- class InMemoryDatasetCreator(DatasetCreator):
-     @classmethod
-     def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
-         return isinstance(data, Iterable) and not isinstance(data, str)
-
-     @classmethod
-     def handle_create(
-         cls,
-         data: Any,
-         data_args: Optional[dict[str, Any]],
-         processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
-         processor_args: Optional[dict[str, Any]],  # noqa: ARG003
-         random_seed: int,  # noqa: ARG003
-     ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
-         if not isinstance(data, Iterable):
-             raise TypeError(
-                 f"Unsupported data format. Expected Iterable[Any], got {type(data)}"
-             )
-
-         if not data:
-             raise ValueError("Data is empty")
-
-         if isinstance(data, dict):
-             # assume data is a dictionary of columns and values: {"c1": ["i1", "i2"]}
-             data_dict = cls.format_data_dict(data)
-         elif isinstance(data[0], dict):  # type: ignore[index]
-             # assume data is a list of dictionaries: [{"c1": "i1"}, {"c1": "i2"}]
-             data_dict = cls.format_data_iterable_dicts(data)
-         else:
-             # assume data is a list of items with no columns: ["i1", "i2"]
-             data_dict = cls.format_data_iterable_values(data)
-
-         return Dataset.from_dict(data_dict, **(data_args or {}))
-
-     @classmethod
-     def format_data_dict(cls, data: dict[Any, Any]) -> dict[str, Any]:
-         if not isinstance(data, dict):
-             raise TypeError(
-                 f"Unsupported data format. Expected Dict[str, Iterable[Any]], "
-                 f"got {type(data)}"
-             )
-
-         if not all(
-             isinstance(key, str) and isinstance(val, Iterable)
-             for key, val in data.items()
-         ):
-             raise TypeError(
-                 "Unsupported data format. Expected Dict[str, Iterable[Any]], "
-                 f"got {type(data)}"
-             )
-
-         samples = len(list(data.values())[0])
-         if not all(len(val) == samples for val in data.values()):
-             raise ValueError(
-                 "Unsupported data format. Not all columns have the same number samples "
-                 f"for {data}"
-             )
-
-         return data
-
-     @classmethod
-     def format_data_iterable_dicts(
-         cls, data: Iterable[dict[Any, Any]]
-     ) -> dict[str, Any]:
-         if not isinstance(data, Iterable):
-             raise TypeError(
-                 f"Unsupported data format. Expected Iterable[Dict[str, Any]], "
-                 f"got {type(data)}"
-             )
-
-         if not all(isinstance(item, dict) for item in data):
-             raise TypeError(
-                 f"Unsupported data format. Expected Iterable[Dict[str, Any]], "
-                 f"got {type(data)}"
-             )
-
-         if not all(isinstance(key, str) for key in data[0]):  # type: ignore[index]
-             raise TypeError(
-                 "Unsupported data format. Expected Dict[str, Any], "
-                 f"but one of the items had a non string column for {data}"
-             )
-
-         columns = list(data[0].keys())  # type: ignore[index]
-         if not all(
-             len(item) == len(columns) and all(key in item for key in columns)
-             for item in data
-         ):
-             raise ValueError(
-                 "Unsupported data format. Not all items have the same columns "
-                 f"for {data}"
-             )
-
-         data_dict: dict[str, Any] = {key: [] for key in columns}
-         for item in data:
-             for key, value in item.items():
-                 data_dict[key].append(value)
-
-         return data_dict
-
-     @classmethod
-     def format_data_iterable_values(cls, data: Iterable[Any]) -> dict[str, Any]:
-         if not isinstance(data, Iterable):
-             raise TypeError(
-                 f"Unsupported data format. Expected Iterable[Iterable[Any]], "
-                 f"got {type(data)}"
-             )
-
-         first_item = next(iter(data), None)
-         first_type = type(first_item)
-         if not all(isinstance(item, first_type) for item in data):
-             raise TypeError(
-                 f"Unsupported data format. Not all types are the same for {data}"
-             )
-
-         return {"data": list(data)}