guidellm 0.1.0__py3-none-any.whl → 0.2.0rc20250418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry notes this version of guidellm might be problematic; see the registry listing for details.

Files changed (69)
  1. guidellm/__init__.py +38 -6
  2. guidellm/__main__.py +294 -0
  3. guidellm/backend/__init__.py +19 -6
  4. guidellm/backend/backend.py +238 -0
  5. guidellm/backend/openai.py +532 -122
  6. guidellm/backend/response.py +132 -0
  7. guidellm/benchmark/__init__.py +73 -0
  8. guidellm/benchmark/aggregator.py +760 -0
  9. guidellm/benchmark/benchmark.py +838 -0
  10. guidellm/benchmark/benchmarker.py +334 -0
  11. guidellm/benchmark/entrypoints.py +141 -0
  12. guidellm/benchmark/output.py +946 -0
  13. guidellm/benchmark/profile.py +409 -0
  14. guidellm/benchmark/progress.py +720 -0
  15. guidellm/config.py +34 -56
  16. guidellm/data/__init__.py +4 -0
  17. guidellm/data/prideandprejudice.txt.gz +0 -0
  18. guidellm/dataset/__init__.py +22 -0
  19. guidellm/dataset/creator.py +213 -0
  20. guidellm/dataset/entrypoints.py +42 -0
  21. guidellm/dataset/file.py +90 -0
  22. guidellm/dataset/hf_datasets.py +62 -0
  23. guidellm/dataset/in_memory.py +132 -0
  24. guidellm/dataset/synthetic.py +262 -0
  25. guidellm/objects/__init__.py +18 -0
  26. guidellm/objects/pydantic.py +60 -0
  27. guidellm/objects/statistics.py +947 -0
  28. guidellm/request/__init__.py +12 -10
  29. guidellm/request/loader.py +281 -0
  30. guidellm/request/request.py +79 -0
  31. guidellm/scheduler/__init__.py +51 -3
  32. guidellm/scheduler/result.py +137 -0
  33. guidellm/scheduler/scheduler.py +382 -0
  34. guidellm/scheduler/strategy.py +493 -0
  35. guidellm/scheduler/types.py +7 -0
  36. guidellm/scheduler/worker.py +511 -0
  37. guidellm/utils/__init__.py +16 -29
  38. guidellm/utils/colors.py +8 -0
  39. guidellm/utils/hf_transformers.py +35 -0
  40. guidellm/utils/random.py +43 -0
  41. guidellm/utils/text.py +118 -357
  42. {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/METADATA +96 -79
  43. guidellm-0.2.0rc20250418.dist-info/RECORD +48 -0
  44. {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/WHEEL +1 -1
  45. guidellm-0.2.0rc20250418.dist-info/entry_points.txt +2 -0
  46. guidellm/backend/base.py +0 -320
  47. guidellm/core/__init__.py +0 -24
  48. guidellm/core/distribution.py +0 -190
  49. guidellm/core/report.py +0 -321
  50. guidellm/core/request.py +0 -44
  51. guidellm/core/result.py +0 -545
  52. guidellm/core/serializable.py +0 -169
  53. guidellm/executor/__init__.py +0 -10
  54. guidellm/executor/base.py +0 -213
  55. guidellm/executor/profile_generator.py +0 -343
  56. guidellm/main.py +0 -336
  57. guidellm/request/base.py +0 -194
  58. guidellm/request/emulated.py +0 -391
  59. guidellm/request/file.py +0 -76
  60. guidellm/request/transformers.py +0 -100
  61. guidellm/scheduler/base.py +0 -374
  62. guidellm/scheduler/load_generator.py +0 -196
  63. guidellm/utils/injector.py +0 -70
  64. guidellm/utils/progress.py +0 -196
  65. guidellm/utils/transformers.py +0 -151
  66. guidellm-0.1.0.dist-info/RECORD +0 -35
  67. guidellm-0.1.0.dist-info/entry_points.txt +0 -3
  68. {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info/licenses}/LICENSE +0 -0
  69. {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/top_level.txt +0 -0
guidellm/config.py CHANGED
@@ -1,18 +1,17 @@
 import json
+from collections.abc import Sequence
 from enum import Enum
-from typing import Dict, List, Optional, Sequence
+from typing import Literal, Optional
 
 from pydantic import BaseModel, Field, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 __all__ = [
     "DatasetSettings",
-    "EmulatedDataSettings",
     "Environment",
     "LoggingSettings",
     "OpenAISettings",
     "print_config",
-    "ReportGenerationSettings",
     "Settings",
     "reload_settings",
     "settings",
@@ -55,7 +54,7 @@ class DatasetSettings(BaseModel):
     Dataset settings for the application
     """
 
-    preferred_data_columns: List[str] = Field(
+    preferred_data_columns: list[str] = Field(
         default_factory=lambda: [
             "prompt",
             "instruction",
@@ -69,53 +68,23 @@ class DatasetSettings(BaseModel):
             "data",
         ]
     )
-    preferred_data_splits: List[str] = Field(
+    preferred_data_splits: list[str] = Field(
         default_factory=lambda: ["test", "tst", "validation", "val", "train"]
     )
 
 
-class EmulatedDataSettings(BaseModel):
-    """
-    Emulated data settings for the application to use
-    """
-
-    source: str = "https://www.gutenberg.org/files/1342/1342-0.txt"
-    filter_start: str = "It is a truth universally acknowledged, that a"
-    filter_end: str = "CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO."
-    clean_text_args: Dict[str, bool] = Field(
-        default_factory=lambda: {
-            "fix_encoding": True,
-            "clean_whitespace": True,
-            "remove_empty_lines": True,
-            "force_new_line_punctuation": True,
-        }
-    )
-
-
 class OpenAISettings(BaseModel):
     """
     OpenAI settings for the application to connect to the API
     for OpenAI server based pathways
     """
 
-    # OpenAI API key.
-    api_key: str = "invalid_token"
-
-    # OpenAI-compatible server URL
-    # NOTE: The default value is default address of llama.cpp web server
-    base_url: str = "http://localhost:8000/v1"
-
-    max_gen_tokens: int = 4096
-
-
-class ReportGenerationSettings(BaseModel):
-    """
-    Report generation settings for the application
-    """
-
-    source: str = ""
-    report_html_match: str = "window.report_data = {};"
-    report_html_placeholder: str = "{}"
+    api_key: Optional[str] = None
+    bearer_token: Optional[str] = None
+    organization: Optional[str] = None
+    project: Optional[str] = None
+    base_url: str = "http://localhost:8000"
+    max_output_tokens: int = 16384
 
 
 class Settings(BaseSettings):
@@ -141,27 +110,40 @@ class Settings(BaseSettings):
 
     # general settings
    env: Environment = Environment.PROD
-    request_timeout: int = 30
-    max_concurrency: int = 512
-    num_sweep_profiles: int = 9
+    default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
+    default_sweep_number: int = 10
+
+    # HTTP settings
+    request_timeout: int = 60 * 5  # 5 minutes
+    request_http2: bool = True
+
+    # Scheduler settings
+    max_concurrency: int = 512
+    max_worker_processes: int = 10
+    max_add_requests_per_loop: int = 20
 
     # Data settings
     dataset: DatasetSettings = DatasetSettings()
-    emulated_data: EmulatedDataSettings = EmulatedDataSettings()
 
-    # Request settings
+    # Request/stats settings
+    preferred_prompt_tokens_source: Optional[
+        Literal["request", "response", "local"]
+    ] = "response"
+    preferred_output_tokens_source: Optional[
+        Literal["request", "response", "local"]
+    ] = "response"
+    preferred_backend: Literal["openai"] = "openai"
     openai: OpenAISettings = OpenAISettings()
 
-    # Report settings
-    report_generation: ReportGenerationSettings = ReportGenerationSettings()
+    # Output settings
+    table_border_char: str = "="
+    table_headers_border_char: str = "-"
+    table_column_separator_char: str = "|"
 
     @model_validator(mode="after")
     @classmethod
     def set_default_source(cls, values):
-        if not values.report_generation.source:
-            values.report_generation.source = ENV_REPORT_MAPPING.get(values.env)
-
         return values
 
     def generate_env_file(self) -> str:
@@ -197,7 +179,7 @@ class Settings(BaseSettings):
            if isinstance(sub_value, Sequence) and not isinstance(sub_value, str):
                value_str = ",".join(f'"{item}"' for item in sub_value)
                env_file += f"{tag}=[{value_str}]\n"
-            elif isinstance(sub_value, Dict):
+            elif isinstance(sub_value, dict):
                value_str = json.dumps(sub_value)
                env_file += f"{tag}={value_str}\n"
            elif not sub_value:
@@ -228,7 +210,3 @@ def print_config():
     Print the current configuration settings
     """
     print(f"Settings: \n{settings.generate_env_file()}")  # noqa: T201
-
-
-if __name__ == "__main__":
-    print_config()
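
Note: a minimal usage sketch of the reworked settings surface, using only names visible in the hunks above (settings, print_config, generate_env_file, and the new OpenAISettings fields). The environment-variable prefix and nesting delimiter come from SettingsConfigDict, which is outside this diff, so env-var overrides are not shown.

# Sketch only: field names are taken from the diff above; nothing about
# env-var naming is assumed here since SettingsConfigDict is not shown.
from guidellm.config import print_config, settings

# New OpenAI connection fields introduced in 0.2.0
print(settings.openai.base_url)           # "http://localhost:8000" by default
print(settings.openai.max_output_tokens)  # 16384 by default

# Dump the full configuration in env-file form
print(settings.generate_env_file())
print_config()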
guidellm/data/__init__.py ADDED
@@ -0,0 +1,4 @@
+"""
+Required for python < 3.12
+https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
+"""
guidellm/data/prideandprejudice.txt.gz ADDED
Binary file
guidellm/dataset/__init__.py ADDED
@@ -0,0 +1,22 @@
+from .creator import ColumnInputTypes, DatasetCreator
+from .entrypoints import load_dataset
+from .file import FileDatasetCreator
+from .hf_datasets import HFDatasetsCreator
+from .in_memory import InMemoryDatasetCreator
+from .synthetic import (
+    SyntheticDatasetConfig,
+    SyntheticDatasetCreator,
+    SyntheticTextItemsGenerator,
+)
+
+__all__ = [
+    "DatasetCreator",
+    "ColumnInputTypes",
+    "HFDatasetsCreator",
+    "load_dataset",
+    "FileDatasetCreator",
+    "InMemoryDatasetCreator",
+    "SyntheticDatasetCreator",
+    "SyntheticDatasetConfig",
+    "SyntheticTextItemsGenerator",
+]
guidellm/dataset/creator.py ADDED
@@ -0,0 +1,213 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Literal, Optional, Union
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+__all__ = ["DatasetCreator", "ColumnInputTypes"]
+
+ColumnInputTypes = Literal[
+    "prompt_column",
+    "text_column",
+    "prompt_tokens_count_column",
+    "output_tokens_count_column",
+]
+
+
+class DatasetCreator(ABC):
+    DEFAULT_SPLITS_TRAIN = [
+        "train",
+        "training",
+        "train_set",
+        "training_set",
+        "train_dataset",
+        "training_dataset",
+        "train_data",
+        "training_data",
+        "pretrain",
+        "pretrain_set",
+        "pretrain_dataset",
+        "pretrain_data",
+        "pretraining",
+    ]
+    DEFAULT_SPLITS_CALIB = [
+        "calibration",
+        "calib",
+        "cal",
+        "calibration_set",
+        "calib_set",
+        "cal_set",
+        "calibration_dataset",
+        "calib_dataset",
+        "cal_set",
+        "calibration_data",
+        "calib_data",
+        "cal_data",
+    ]
+    DEFAULT_SPLITS_VAL = [
+        "validation",
+        "val",
+        "valid",
+        "validation_set",
+        "val_set",
+        "validation_dataset",
+        "val_dataset",
+        "validation_data",
+        "val_data",
+        "dev",
+        "dev_set",
+        "dev_dataset",
+        "dev_data",
+    ]
+    DEFAULT_SPLITS_TEST = [
+        "test",
+        "testing",
+        "test_set",
+        "testing_set",
+        "test_dataset",
+        "testing_dataset",
+        "test_data",
+        "testing_data",
+        "eval",
+        "eval_set",
+        "eval_dataset",
+        "eval_data",
+    ]
+    DEFAULT_SPLITS_DATASET: dict[str, str] = {}
+
+    @classmethod
+    def create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+        processor_args: Optional[dict[str, Any]],
+        random_seed: int = 42,
+        split_pref_order: Optional[list[str]] = None,
+    ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
+        if not cls.is_supported(data, data_args):
+            raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
+
+        split = cls.extract_args_split(data_args)
+        column_mappings = cls.extract_args_column_mappings(data_args)
+        dataset = cls.handle_create(
+            data, data_args, processor, processor_args, random_seed
+        )
+
+        if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            dataset = cls.extract_dataset_split(dataset, split, split_pref_order)
+
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            raise ValueError(
+                f"Unsupported data type: {type(dataset)} given for {dataset}."
+            )
+
+        return dataset, column_mappings
+
+    @classmethod
+    def extract_args_split(cls, data_args: Optional[dict[str, Any]]) -> str:
+        split = "auto"
+
+        if data_args and "split" in data_args:
+            split = data_args["split"]
+            del data_args["split"]
+
+        return split
+
+    @classmethod
+    def extract_args_column_mappings(
+        cls,
+        data_args: Optional[dict[str, Any]],
+    ) -> dict[ColumnInputTypes, str]:
+        columns: dict[ColumnInputTypes, str] = {}
+
+        if data_args:
+            if "prompt_column" in data_args:
+                columns["prompt_column"] = data_args["prompt_column"]
+                del data_args["prompt_column"]
+
+            if "prompt_tokens_count_column" in data_args:
+                columns["prompt_tokens_count_column"] = data_args[
+                    "prompt_tokens_count_column"
+                ]
+                del data_args["prompt_tokens_count_column"]
+
+            if "output_tokens_count_column" in data_args:
+                columns["output_tokens_count_column"] = data_args[
+                    "output_tokens_count_column"
+                ]
+                del data_args["output_tokens_count_column"]
+
+        return columns
+
+    @classmethod
+    def extract_dataset_name(
+        cls, dataset: Union[Dataset, IterableDataset, DatasetDict, IterableDatasetDict]
+    ) -> Optional[str]:
+        if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            dataset = dataset[list(dataset.keys())[0]]
+
+        if isinstance(dataset, (Dataset, IterableDataset)):
+            if not hasattr(dataset, "info") or not hasattr(
+                dataset.info, "dataset_name"
+            ):
+                return None
+
+            return dataset.info.dataset_name
+
+        raise ValueError(f"Unsupported data type: {type(dataset)} given for {dataset}.")
+
+    @classmethod
+    def extract_dataset_split(
+        cls,
+        dataset: Union[DatasetDict, IterableDatasetDict],
+        specified_split: Union[Literal["auto"], str] = "auto",
+        split_pref_order: Optional[Union[Literal["auto"], list[str]]] = "auto",
+    ) -> Union[Dataset, IterableDataset]:
+        if not isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+            raise ValueError(
+                f"Unsupported data type: {type(dataset)} given for {dataset}."
+            )
+
+        if specified_split != "auto":
+            if specified_split not in dataset:
+                raise ValueError(
+                    f"Split {specified_split} not found in dataset {dataset}."
+                )
+
+            return dataset[specified_split]
+
+        dataset_name = cls.extract_dataset_name(dataset)
+
+        if dataset_name and dataset_name in cls.DEFAULT_SPLITS_DATASET:
+            return dataset[cls.DEFAULT_SPLITS_DATASET[dataset_name]]
+
+        if split_pref_order == "auto":
+            split_pref_order = [
+                *cls.DEFAULT_SPLITS_TEST,
+                *cls.DEFAULT_SPLITS_VAL,
+                *cls.DEFAULT_SPLITS_CALIB,
+                *cls.DEFAULT_SPLITS_TRAIN,
+            ]
+
+        for test_split in split_pref_order or []:
+            if test_split in dataset:
+                return dataset[test_split]
+
+        return dataset[list(dataset.keys())[0]]
+
+    @classmethod
+    @abstractmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool: ...
+
+    @classmethod
+    @abstractmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+        processor_args: Optional[dict[str, Any]],
+        random_seed: int,
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: ...
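
Note: to illustrate the contract DatasetCreator defines above, here is a hypothetical minimal subclass. The class name and its acceptance rule are invented for illustration and are not part of the package; only the two abstract classmethod signatures come from the diff.

# Hypothetical example subclass (not in guidellm): wraps a plain list of
# prompt strings in a Hugging Face Dataset.
from pathlib import Path
from typing import Any, Optional, Union

from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
from transformers import PreTrainedTokenizerBase

from guidellm.dataset.creator import DatasetCreator


class ListOfStringsCreator(DatasetCreator):
    """Accepts a plain list of prompt strings (illustrative only)."""

    @classmethod
    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:
        return isinstance(data, list) and all(isinstance(item, str) for item in data)

    @classmethod
    def handle_create(
        cls,
        data: Any,
        data_args: Optional[dict[str, Any]],
        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
        processor_args: Optional[dict[str, Any]],
        random_seed: int,
    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
        # Expose the raw strings under a "prompt" column
        return Dataset.from_dict({"prompt": list(data)})

Callers would go through ListOfStringsCreator.create(...), which layers split selection and column-mapping extraction on top of handle_create and returns the (dataset, column_mappings) tuple.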
guidellm/dataset/entrypoints.py ADDED
@@ -0,0 +1,42 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from datasets import Dataset, IterableDataset
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import ColumnInputTypes
+from guidellm.dataset.file import FileDatasetCreator
+from guidellm.dataset.hf_datasets import HFDatasetsCreator
+from guidellm.dataset.in_memory import InMemoryDatasetCreator
+from guidellm.dataset.synthetic import SyntheticDatasetCreator
+
+__all__ = ["load_dataset"]
+
+
+def load_dataset(
+    data: Any,
+    data_args: Optional[dict[str, Any]],
+    processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],
+    processor_args: Optional[dict[str, Any]],
+    random_seed: int = 42,
+    split_pref_order: Optional[list[str]] = None,
+) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]:
+    creators = [
+        InMemoryDatasetCreator,
+        SyntheticDatasetCreator,
+        FileDatasetCreator,
+        HFDatasetsCreator,
+    ]
+
+    for creator in creators:
+        if creator.is_supported(data, data_args):
+            return creator.create(
+                data,
+                data_args,
+                processor,
+                processor_args,
+                random_seed,
+                split_pref_order,
+            )
+
+    raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
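
Note: a usage sketch of the new entrypoint. The file path and tokenizer name are placeholders, and keyword usage simply mirrors the signature shown above; the entrypoint dispatches to the first creator whose is_supported() accepts the input.

# Sketch: "prompts.jsonl" and "gpt2" are placeholder values, not guidellm defaults.
from guidellm.dataset import load_dataset

dataset, column_mappings = load_dataset(
    data="prompts.jsonl",                   # a local data file (see FileDatasetCreator)
    data_args={"prompt_column": "prompt"},  # consumed into column_mappings
    processor="gpt2",                       # tokenizer name or path, if needed
    processor_args=None,
    random_seed=42,
)
print(type(dataset), column_mappings)       # e.g. Dataset, {"prompt_column": "prompt"}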
guidellm/dataset/file.py ADDED
@@ -0,0 +1,90 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import pandas as pd  # type: ignore[import]
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    load_dataset,
+)
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import DatasetCreator
+
+__all__ = ["FileDatasetCreator"]
+
+
+class FileDatasetCreator(DatasetCreator):
+    SUPPORTED_TYPES = {
+        ".txt",
+        ".text",
+        ".csv",
+        ".json",
+        ".jsonl",
+        ".parquet",
+        ".arrow",
+        ".hdf5",
+        ".tar",
+    }
+
+    @classmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
+        if isinstance(data, (str, Path)) and (path := Path(data)).exists():
+            # local folder or py file, assume supported
+            return path.suffix.lower() in cls.SUPPORTED_TYPES
+
+        return False
+
+    @classmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
+        processor_args: Optional[dict[str, Any]],  # noqa: ARG003
+        random_seed: int,  # noqa: ARG003
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
+        if not isinstance(data, (str, Path)):
+            raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")
+
+        path = Path(data)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not path.is_file():
+            raise ValueError(f"Unsupported data type: {path} given for {path}. ")
+
+        if path.suffix.lower() not in cls.SUPPORTED_TYPES:
+            raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
+
+        return cls.load_dataset(path, data_args)
+
+    @classmethod
+    def load_dataset(
+        cls, path: Path, data_args: Optional[dict[str, Any]]
+    ) -> Union[Dataset, IterableDataset]:
+        if path.suffix.lower() in {".txt", ".text"}:
+            with path.open("r") as file:
+                items = file.readlines()
+
+            dataset = Dataset.from_dict({"text": items}, **(data_args or {}))
+        elif path.suffix.lower() == ".csv":
+            dataset = load_dataset("csv", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() in {".json", ".jsonl"}:
+            dataset = load_dataset("json", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".parquet":
+            dataset = load_dataset("parquet", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".arrow":
+            dataset = load_dataset("arrow", data_files=path, **(data_args or {}))
+        elif path.suffix.lower() == ".hdf5":
+            dataset = Dataset.from_pandas(pd.read_hdf(path), **(data_args or {}))
+        elif path.suffix.lower() == ".db":
+            dataset = Dataset.from_sql(con=path, **(data_args or {}))
+        elif path.suffix.lower() == ".tar":
+            dataset = load_dataset("webdataset", data_files=path, **(data_args or {}))
+        else:
+            raise ValueError(f"Unsupported file type: {path.suffix} given for {path}. ")
+
+        return dataset
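
Note: a short sketch of the plain-text branch above; the file name and its contents are placeholders.

# Sketch: a .txt file is read line-by-line and exposed as a "text" column.
from pathlib import Path

from guidellm.dataset.file import FileDatasetCreator

path = Path("prompts.txt")  # placeholder path
path.write_text("first prompt\nsecond prompt\n")

dataset = FileDatasetCreator.load_dataset(path, data_args=None)
print(dataset.column_names)  # ["text"]
print(dataset[0]["text"])    # "first prompt\n" (readlines keeps the newline)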
guidellm/dataset/hf_datasets.py ADDED
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    get_dataset_config_info,
+    load_dataset,
+)
+from transformers import PreTrainedTokenizerBase  # type: ignore[import]
+
+from guidellm.dataset.creator import DatasetCreator
+
+__all__ = ["HFDatasetsCreator"]
+
+
+class HFDatasetsCreator(DatasetCreator):
+    @classmethod
+    def is_supported(cls, data: Any, data_args: Optional[dict[str, Any]]) -> bool:  # noqa: ARG003
+        if isinstance(
+            data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
+        ):
+            # base type is supported
+            return True
+
+        if isinstance(data, (str, Path)) and (path := Path(data)).exists():
+            # local folder or py file, assume supported
+            return path.is_dir() or path.suffix == ".py"
+
+        if isinstance(data, (str, Path)):
+            try:
+                # try to load dataset
+                return get_dataset_config_info(data) is not None
+            except Exception:  # noqa: BLE001, S110
+                pass
+
+        return False
+
+    @classmethod
+    def handle_create(
+        cls,
+        data: Any,
+        data_args: Optional[dict[str, Any]],
+        processor: Optional[Union[str, Path, PreTrainedTokenizerBase]],  # noqa: ARG003
+        processor_args: Optional[dict[str, Any]],  # noqa: ARG003
+        random_seed: int,  # noqa: ARG003
+    ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
+        if isinstance(data, (str, Path)):
+            data = load_dataset(data, **(data_args or {}))
+        elif data_args:
+            raise ValueError(
+                f"data_args should not be provided when data is a {type(data)}"
+            )
+
+        if isinstance(
+            data, (Dataset, DatasetDict, IterableDataset, IterableDatasetDict)
+        ):
+            return data
+
+        raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ")