guidellm-0.1.0-py3-none-any.whl → guidellm-0.2.0rc20250418-py3-none-any.whl
- guidellm/__init__.py +38 -6
- guidellm/__main__.py +294 -0
- guidellm/backend/__init__.py +19 -6
- guidellm/backend/backend.py +238 -0
- guidellm/backend/openai.py +532 -122
- guidellm/backend/response.py +132 -0
- guidellm/benchmark/__init__.py +73 -0
- guidellm/benchmark/aggregator.py +760 -0
- guidellm/benchmark/benchmark.py +838 -0
- guidellm/benchmark/benchmarker.py +334 -0
- guidellm/benchmark/entrypoints.py +141 -0
- guidellm/benchmark/output.py +946 -0
- guidellm/benchmark/profile.py +409 -0
- guidellm/benchmark/progress.py +720 -0
- guidellm/config.py +34 -56
- guidellm/data/__init__.py +4 -0
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +22 -0
- guidellm/dataset/creator.py +213 -0
- guidellm/dataset/entrypoints.py +42 -0
- guidellm/dataset/file.py +90 -0
- guidellm/dataset/hf_datasets.py +62 -0
- guidellm/dataset/in_memory.py +132 -0
- guidellm/dataset/synthetic.py +262 -0
- guidellm/objects/__init__.py +18 -0
- guidellm/objects/pydantic.py +60 -0
- guidellm/objects/statistics.py +947 -0
- guidellm/request/__init__.py +12 -10
- guidellm/request/loader.py +281 -0
- guidellm/request/request.py +79 -0
- guidellm/scheduler/__init__.py +51 -3
- guidellm/scheduler/result.py +137 -0
- guidellm/scheduler/scheduler.py +382 -0
- guidellm/scheduler/strategy.py +493 -0
- guidellm/scheduler/types.py +7 -0
- guidellm/scheduler/worker.py +511 -0
- guidellm/utils/__init__.py +16 -29
- guidellm/utils/colors.py +8 -0
- guidellm/utils/hf_transformers.py +35 -0
- guidellm/utils/random.py +43 -0
- guidellm/utils/text.py +118 -357
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/METADATA +96 -79
- guidellm-0.2.0rc20250418.dist-info/RECORD +48 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/WHEEL +1 -1
- guidellm-0.2.0rc20250418.dist-info/entry_points.txt +2 -0
- guidellm/backend/base.py +0 -320
- guidellm/core/__init__.py +0 -24
- guidellm/core/distribution.py +0 -190
- guidellm/core/report.py +0 -321
- guidellm/core/request.py +0 -44
- guidellm/core/result.py +0 -545
- guidellm/core/serializable.py +0 -169
- guidellm/executor/__init__.py +0 -10
- guidellm/executor/base.py +0 -213
- guidellm/executor/profile_generator.py +0 -343
- guidellm/main.py +0 -336
- guidellm/request/base.py +0 -194
- guidellm/request/emulated.py +0 -391
- guidellm/request/file.py +0 -76
- guidellm/request/transformers.py +0 -100
- guidellm/scheduler/base.py +0 -374
- guidellm/scheduler/load_generator.py +0 -196
- guidellm/utils/injector.py +0 -70
- guidellm/utils/progress.py +0 -196
- guidellm/utils/transformers.py +0 -151
- guidellm-0.1.0.dist-info/RECORD +0 -35
- guidellm-0.1.0.dist-info/entry_points.txt +0 -3
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info/licenses}/LICENSE +0 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/top_level.txt +0 -0
guidellm/config.py
CHANGED
```diff
@@ -1,18 +1,17 @@
 import json
+from collections.abc import Sequence
 from enum import Enum
-from typing import
+from typing import Literal, Optional
 
 from pydantic import BaseModel, Field, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 __all__ = [
     "DatasetSettings",
-    "EmulatedDataSettings",
     "Environment",
     "LoggingSettings",
     "OpenAISettings",
     "print_config",
-    "ReportGenerationSettings",
     "Settings",
     "reload_settings",
     "settings",
@@ -55,7 +54,7 @@ class DatasetSettings(BaseModel):
     Dataset settings for the application
     """
 
-    preferred_data_columns:
+    preferred_data_columns: list[str] = Field(
         default_factory=lambda: [
             "prompt",
             "instruction",
@@ -69,53 +68,23 @@ class DatasetSettings(BaseModel):
             "data",
         ]
     )
-    preferred_data_splits:
+    preferred_data_splits: list[str] = Field(
         default_factory=lambda: ["test", "tst", "validation", "val", "train"]
     )
 
 
-class EmulatedDataSettings(BaseModel):
-    """
-    Emulated data settings for the application to use
-    """
-
-    source: str = "https://www.gutenberg.org/files/1342/1342-0.txt"
-    filter_start: str = "It is a truth universally acknowledged, that a"
-    filter_end: str = "CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO."
-    clean_text_args: Dict[str, bool] = Field(
-        default_factory=lambda: {
-            "fix_encoding": True,
-            "clean_whitespace": True,
-            "remove_empty_lines": True,
-            "force_new_line_punctuation": True,
-        }
-    )
-
-
 class OpenAISettings(BaseModel):
     """
     OpenAI settings for the application to connect to the API
     for OpenAI server based pathways
     """
 
-
-
-
-
-
-
-
-    max_gen_tokens: int = 4096
-
-
-class ReportGenerationSettings(BaseModel):
-    """
-    Report generation settings for the application
-    """
-
-    source: str = ""
-    report_html_match: str = "window.report_data = {};"
-    report_html_placeholder: str = "{}"
+    api_key: Optional[str] = None
+    bearer_token: Optional[str] = None
+    organization: Optional[str] = None
+    project: Optional[str] = None
+    base_url: str = "http://localhost:8000"
+    max_output_tokens: int = 16384
 
 
 class Settings(BaseSettings):
@@ -141,27 +110,40 @@ class Settings(BaseSettings):
 
     # general settings
     env: Environment = Environment.PROD
-
-    max_concurrency: int = 512
-    num_sweep_profiles: int = 9
+    default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
+    default_sweep_number: int = 10
+
+    # HTTP settings
+    request_timeout: int = 60 * 5  # 5 minutes
+    request_http2: bool = True
+
+    # Scheduler settings
+    max_concurrency: int = 512
+    max_worker_processes: int = 10
+    max_add_requests_per_loop: int = 20
 
     # Data settings
     dataset: DatasetSettings = DatasetSettings()
-    emulated_data: EmulatedDataSettings = EmulatedDataSettings()
 
-    # Request settings
+    # Request/stats settings
+    preferred_prompt_tokens_source: Optional[
+        Literal["request", "response", "local"]
+    ] = "response"
+    preferred_output_tokens_source: Optional[
+        Literal["request", "response", "local"]
+    ] = "response"
+    preferred_backend: Literal["openai"] = "openai"
    openai: OpenAISettings = OpenAISettings()
 
-    #
-
+    # Output settings
+    table_border_char: str = "="
+    table_headers_border_char: str = "-"
+    table_column_separator_char: str = "|"
 
     @model_validator(mode="after")
     @classmethod
     def set_default_source(cls, values):
-        if not values.report_generation.source:
-            values.report_generation.source = ENV_REPORT_MAPPING.get(values.env)
-
         return values
 
     def generate_env_file(self) -> str:
@@ -197,7 +179,7 @@ class Settings(BaseSettings):
             if isinstance(sub_value, Sequence) and not isinstance(sub_value, str):
                 value_str = ",".join(f'"{item}"' for item in sub_value)
                 env_file += f"{tag}=[{value_str}]\n"
-            elif isinstance(sub_value,
+            elif isinstance(sub_value, dict):
                 value_str = json.dumps(sub_value)
                 env_file += f"{tag}={value_str}\n"
             elif not sub_value:
@@ -228,7 +210,3 @@ def print_config():
     Print the current configuration settings
     """
     print(f"Settings: \n{settings.generate_env_file()}")  # noqa: T201
-
-
-if __name__ == "__main__":
-    print_config()
```
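The settings rework above drops the report-generation and emulated-data groups in favor of HTTP, scheduler, request/stats, and table-output groups. Since `Settings` extends pydantic-settings' `BaseSettings`, these fields are normally overridable through environment variables; the following is a minimal, self-contained sketch of that mechanism, where the `GUIDELLM__` prefix and `__` nesting delimiter are illustrative assumptions rather than values confirmed by this diff:

```python
import os

from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict


class OpenAISketch(BaseModel):
    # Mirrors two of the new OpenAISettings fields shown in the hunk above.
    base_url: str = "http://localhost:8000"
    max_output_tokens: int = 16384


class SettingsSketch(BaseSettings):
    # env_prefix and env_nested_delimiter are assumptions for illustration.
    model_config = SettingsConfigDict(
        env_prefix="GUIDELLM__", env_nested_delimiter="__"
    )

    max_concurrency: int = 512
    openai: OpenAISketch = OpenAISketch()


# Nested fields resolve from prefixed, delimiter-joined variable names.
os.environ["GUIDELLM__OPENAI__BASE_URL"] = "http://my-host:9000"
print(SettingsSketch().openai.base_url)  # http://my-host:9000
```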
guidellm/data/prideandprejudice.txt.gz
ADDED
Binary file
guidellm/dataset/__init__.py
ADDED
```diff
@@ -0,0 +1,22 @@
+from .creator import ColumnInputTypes, DatasetCreator
+from .entrypoints import load_dataset
+from .file import FileDatasetCreator
+from .hf_datasets import HFDatasetsCreator
+from .in_memory import InMemoryDatasetCreator
+from .synthetic import (
+    SyntheticDatasetConfig,
+    SyntheticDatasetCreator,
+    SyntheticTextItemsGenerator,
+)
+
+__all__ = [
+    "DatasetCreator",
+    "ColumnInputTypes",
+    "HFDatasetsCreator",
+    "load_dataset",
+    "FileDatasetCreator",
+    "InMemoryDatasetCreator",
+    "SyntheticDatasetCreator",
+    "SyntheticDatasetConfig",
+    "SyntheticTextItemsGenerator",
+]
```
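The new subpackage re-exports all of its creators at the package level, so downstream code can import them from `guidellm.dataset` directly:

```python
# Both names come from the __all__ list shown in the diff above.
from guidellm.dataset import DatasetCreator, load_dataset
```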
guidellm/dataset/creator.py
ADDED
```diff
@@ -0,0 +1,213 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Literal, Optional, Union
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+__all__ = ["DatasetCreator", "ColumnInputTypes"]
+
+ColumnInputTypes = Literal[
+    "prompt_column",
+    "text_column",
+    "prompt_tokens_count_column",
+    "output_tokens_count_column",
+]
+
+
+class DatasetCreator(ABC):
+    DEFAULT_SPLITS_TRAIN = [
+        "train",
+        "training",
+        "train_set",
+        "training_set",
+        "train_dataset",
+        "training_dataset",
+        "train_data",
+        "training_data",
+        "pretrain",
+        "pretrain_set",
+        "pretrain_dataset",
+        "pretrain_data",
+        "pretraining",
+    ]
+    DEFAULT_SPLITS_CALIB = [
+        "calibration",
+        "calib",
+        "cal",
+        "calibration_set",
+        "calib_set",
+        "cal_set",
+        "calibration_dataset",
+        "calib_dataset",
+        "cal_set",
+        "calibration_data",
+        "calib_data",
+        "cal_data",
+    ]
+    DEFAULT_SPLITS_VAL = [
+        "validation",
+        "val",
+        "valid",
+        "validation_set",
+        "val_set",
+        "validation_dataset",
+        "val_dataset",
+        "validation_data",
+        "val_data",
+        "dev",
+        "dev_set",
+        "dev_dataset",
+        "dev_data",
+    ]
+    DEFAULT_SPLITS_TEST = [
+        "test",
+        "testing",
+        "test_set",
+        "testing_set",
+        "test_dataset",
+        "testing_dataset",
+        "test_data",
+        "testing_data",
+        "eval",
+        "eval_set",
+        "eval_dataset",
+        "eval_data",
+    ]
+    DEFAULT_SPLITS_DATASET: dict[str, str] = {}
+
+    @classmethod
+    def create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+        processor_args: Optional[dict[str, Any]],
+        random_seed: int = 42,
+        split_pref_order: Optional[list[str]] = None,
+    ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
+        if not cls.is_supported(data, data_args):
+            raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
+
+        split = cls.extract_args_split(data_args)
+        column_mappings = cls.extract_args_column_mappings(data_args)
+        dataset = cls.handle_create(
+            data, data_args, processor, processor_args, random_seed
+        )
+
+        if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            dataset = cls.extract_dataset_split(dataset, split, split_pref_order)
+
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            raise ValueError(
+                f"Unsupported data type: {type(dataset)} given for {dataset}."
+            )
+
+        return dataset, column_mappings
+
+    @classmethod
+    def extract_args_split(cls, data_args: Optional[dict[str, Any]]) -> str:
+        split = "auto"
+
+        if data_args and "split" in data_args:
+            split = data_args["split"]
+            del data_args["split"]
+
+        return split
+
+    @classmethod
+    def extract_args_column_mappings(
+        cls,
+        data_args: Optional[dict[str, Any]],
+    ) -> dict[ColumnInputTypes, str]:
+        columns: dict[ColumnInputTypes, str] = {}
+
+        if data_args:
+            if "prompt_column" in data_args:
+                columns["prompt_column"] = data_args["prompt_column"]
+                del data_args["prompt_column"]
+
+            if "prompt_tokens_count_column" in data_args:
+                columns["prompt_tokens_count_column"] = data_args[
+                    "prompt_tokens_count_column"
+                ]
+                del data_args["prompt_tokens_count_column"]
+
+            if "output_tokens_count_column" in data_args:
+                columns["output_tokens_count_column"] = data_args[
+                    "output_tokens_count_column"
+                ]
+                del data_args["output_tokens_count_column"]
+
+        return columns
+
+    @classmethod
+    def extract_dataset_name(
+        cls, dataset: Union[Dataset, IterableDataset, DatasetDict, IterableDatasetDict]
+    ) -> Optional[str]:
+        if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            dataset = dataset[list(dataset.keys())[0]]
+
+        if isinstance(dataset, (Dataset, IterableDataset)):
+            if not hasattr(dataset, "info") or not hasattr(
+                dataset.info, "dataset_name"
+            ):
+                return None
+
+            return dataset.info.dataset_name
+
+        raise ValueError(f"Unsupported data type: {type(dataset)} given for {dataset}.")
+
+    @classmethod
+    def extract_dataset_split(
+        cls,
+        dataset: Union[DatasetDict, IterableDatasetDict],
+        specified_split: Union[Literal["auto"], str] = "auto",
+        split_pref_order: Optional[Union[Literal["auto"], list[str]]] = "auto",
+    ) -> Union[Dataset, IterableDataset]:
+        if not isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            raise ValueError(
+                f"Unsupported data type: {type(dataset)} given for {dataset}."
+            )
+
+        if specified_split != "auto":
+            if specified_split not in dataset:
+                raise ValueError(
+                    f"Split {specified_split} not found in dataset {dataset}."
+                )
+
+            return dataset[specified_split]
+
+        dataset_name = cls.extract_dataset_name(dataset)
+
+        if dataset_name and dataset_name in cls.DEFAULT_SPLITS_DATASET:
+            return dataset[cls.DEFAULT_SPLITS_DATASET[dataset_name]]
+
+        if split_pref_order == "auto":
+            split_pref_order = [
+                *cls.DEFAULT_SPLITS_TEST,
+                *cls.DEFAULT_SPLITS_VAL,
+                *cls.DEFAULT_SPLITS_CALIB,
+                *cls.DEFAULT_SPLITS_TRAIN,
+            ]
+
+        for test_split in split_pref_order or []:
+            if test_split in dataset:
+                return dataset[test_split]
+
+        return dataset[list(dataset.keys())[0]]
+
+    @classmethod
+    @abstractmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool: ...
+
+    @classmethod
+    @abstractmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+        processor_args: Optional[dict[str, Any]],
+        random_seed: int,
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: ...
```
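`DatasetCreator` is a template-method base class: `create()` validates support, pops the `split` and column-mapping keys out of `data_args`, delegates to `handle_create`, and then resolves a concrete split from any returned `DatasetDict`. A hypothetical subclass (not part of guidellm) showing the two abstract hooks:

```python
from pathlib import Path
from typing import Any, Optional, Union

from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
from transformers import PreTrainedTokenizerBase  # type: ignore[import]

from guidellm.dataset.creator import DatasetCreator


class ListDatasetCreator(DatasetCreator):
    """Illustrative creator that wraps a plain list of prompt strings."""

    @classmethod
    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:
        return isinstance(data, list) and all(isinstance(item, str) for item in data)

    @classmethod
    def handle_create(
        cls,
        data: Any,
        data_args: Optional[dict[str, Any]],
        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
        processor_args: Optional[dict[str, Any]],
        random_seed: int,
    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
        # Not a DatasetDict, so create() returns it without split resolution.
        return Dataset.from_dict({"text": data})


# create() runs the shared pipeline: support check, split/column-mapping
# extraction from data_args, then the subclass hook above.
dataset, columns = ListDatasetCreator.create(
    ["hello", "world"], data_args=None, processor=None, processor_args=None
)
```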
guidellm/dataset/entrypoints.py
ADDED
```diff
@@ -0,0 +1,42 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from datasets import Dataset, IterableDataset
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import ColumnInputTypes
+from guidellm.dataset.file import FileDatasetCreator
+from guidellm.dataset.hf_datasets import HFDatasetsCreator
+from guidellm.dataset.in_memory import InMemoryDatasetCreator
+from guidellm.dataset.synthetic import SyntheticDatasetCreator
+
+__all__ = ["load_dataset"]
+
+
+def load_dataset(
+    data: Any,
+    data_args: Optional[dict[str, Any]],
+    processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+    processor_args: Optional[dict[str, Any]],
+    random_seed: int = 42,
+    split_pref_order: Optional[list[str]] = None,
+) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
+    creators = [
+        InMemoryDatasetCreator,
+        SyntheticDatasetCreator,
+        FileDatasetCreator,
+        HFDatasetsCreator,
+    ]
+
+    for creator in creators:
+        if creator.is_supported(data, data_args):
+            return creator.create(
+                data,
+                data_args,
+                processor,
+                processor_args,
+                random_seed,
+                split_pref_order,
+            )
+
+    raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
```
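`load_dataset` probes the creators in a fixed order (in-memory, synthetic, file, Hugging Face) and dispatches to the first match. A hedged usage sketch; the file name and column mapping below are illustrative values, and the file must exist for `FileDatasetCreator.is_supported` to claim it:

```python
from guidellm.dataset import load_dataset

# The "prompt_column" key is popped into the returned mapping before the
# remaining data_args are forwarded to the matching creator.
dataset, column_mappings = load_dataset(
    data="prompts.jsonl",
    data_args={"prompt_column": "prompt"},
    processor=None,
    processor_args=None,
)
print(column_mappings)  # {'prompt_column': 'prompt'}
```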
guidellm/dataset/file.py
ADDED
```diff
@@ -0,0 +1,90 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import pandas as pd  # type: ignore[import]
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    load_dataset,
+)
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import DatasetCreator
+
+__all__ = ["FileDatasetCreator"]
+
+
+class FileDatasetCreator(DatasetCreator):
+    SUPPORTED_TYPES = {
+        ".txt",
+        ".text",
+        ".csv",
+        ".json",
+        ".jsonl",
+        ".parquet",
+        ".arrow",
+        ".hdf5",
+        ".tar",
+    }
+
+    @classmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
+        if isinstance(data, (str, Path)) and (path := Path(data)).exists():
+            # local folder or py file, assume supported
+            return path.suffix.lower() in cls.SUPPORTED_TYPES
+
+        return False
+
+    @classmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
+        processor_args: Optional[dict[str, Any]],  # noqa: ARG003
+        random_seed: int,  # noqa: ARG003
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
+        if not isinstance(data, (str, Path)):
+            raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
+
+        path = Path(data)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not path.is_file():
+            raise ValueError(f"Unsupported data type: {path} given for {path}. ")
+
+        if path.suffix.lower() not in cls.SUPPORTED_TYPES:
+            raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
+
+        return cls.load_dataset(path, data_args)
+
+    @classmethod
+    def load_dataset(
+        cls, path: Path, data_args: Optional[dict[str, Any]]
+    ) -> Union[Dataset, IterableDataset]:
+        if path.suffix.lower() in {".txt", ".text"}:
+            with path.open("r") as file:
+                items = file.readlines()
+
+            dataset = Dataset.from_dict({"text": items}, **(data_args or {}))
+        elif path.suffix.lower() == ".csv":
+            dataset = load_dataset("csv", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() in {".json", ".jsonl"}:
+            dataset = load_dataset("json", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".parquet":
+            dataset = load_dataset("parquet", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".arrow":
+            dataset = load_dataset("arrow", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".hdf5":
+            dataset = Dataset.from_pandas(pd.read_hdf(path), **(data_args or {}))
+        elif path.suffix.lower() == ".db":
+            dataset = Dataset.from_sql(con=path, **(data_args or {}))
+        elif path.suffix.lower() == ".tar":
+            dataset = load_dataset("webdataset", data_files=path, **(data_args or {}))
+        else:
+            raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
+
+        return dataset
```
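`FileDatasetCreator` dispatches on file extension; note that the `.db` branch in its `load_dataset` is unreachable through `handle_create`, since `.db` is absent from `SUPPORTED_TYPES`. A small illustration of the plain-text path, using a hypothetical local file:

```python
from pathlib import Path

from guidellm.dataset import FileDatasetCreator

# Illustrative file for the .txt branch above.
path = Path("prompts.txt")
path.write_text("first prompt\nsecond prompt\n")

dataset = FileDatasetCreator.load_dataset(path, data_args=None)
print(dataset.column_names)  # ['text']
print(dataset[0])  # {'text': 'first prompt\n'} -- readlines() keeps newlines
```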
guidellm/dataset/hf_datasets.py
ADDED
```diff
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    get_dataset_config_info,
+    load_dataset,
+)
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import DatasetCreator
+
+__all__ = ["HFDatasetsCreator"]
+
+
+class HFDatasetsCreator(DatasetCreator):
+    @classmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
+        if isinstance(
+            data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
+        ):
+            # base type is supported
+            return True
+
+        if isinstance(data, (str, Path)) and (path := Path(data)).exists():
+            # local folder or py file, assume supported
+            return path.is_dir() or path.suffix == ".py"
+
+        if isinstance(data, (str, Path)):
+            try:
+                # try to load dataset
+                return get_dataset_config_info(data) is not None
+            except Exception:  # noqa: BLE001, S110
+                pass
+
+        return False
+
+    @classmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
+        processor_args: Optional[dict[str, Any]],  # noqa: ARG003
+        random_seed: int,  # noqa: ARG003
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
+        if isinstance(data, (str, Path)):
+            data = load_dataset(data, **(data_args or {}))
+        elif data_args:
+            raise ValueError(
+                f"data_args should not be provided when data is a {type(data)}"
+            )
+
+        if isinstance(
+            data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
+        ):
+            return data
+
+        raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
```
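`HFDatasetsCreator.is_supported` falls through three checks: in-memory `datasets` objects pass immediately, existing local directories or `.py` scripts pass by path inspection, and any other string is probed against the Hub via `get_dataset_config_info`. A quick sketch (the second call needs network access and uses a deliberately invalid id):

```python
from datasets import Dataset

from guidellm.dataset import HFDatasetsCreator

# An in-memory datasets object is accepted immediately...
print(HFDatasetsCreator.is_supported(Dataset.from_dict({"text": ["hi"]}), None))  # True

# ...while an arbitrary string is probed against the Hub and rejected
# when get_dataset_config_info raises.
print(HFDatasetsCreator.is_supported("not/a-real-dataset-id", None))  # False
```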