guidellm 0.4.0a155__tar.gz → 0.4.0a173__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (105)
  1. {guidellm-0.4.0a155/src/guidellm.egg-info → guidellm-0.4.0a173}/PKG-INFO +1 -1
  2. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/__main__.py +4 -3
  3. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/benchmarker.py +2 -0
  4. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/entrypoints.py +1 -0
  5. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/output.py +3 -1
  6. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/schemas.py +2 -1
  7. guidellm-0.4.0a173/src/guidellm/data/deserializers/deserializer.py +144 -0
  8. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/file.py +14 -14
  9. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/huggingface.py +1 -1
  10. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/memory.py +20 -18
  11. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/synthetic.py +18 -16
  12. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/loaders.py +7 -3
  13. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/formatters.py +24 -32
  14. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/mappers.py +2 -2
  15. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/preprocessor.py +5 -3
  16. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/processor.py +3 -2
  17. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/utils/__init__.py +0 -4
  18. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/utils/dataset.py +2 -2
  19. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/constraints.py +1 -3
  20. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/environments.py +2 -2
  21. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/scheduler.py +1 -1
  22. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/strategies.py +31 -4
  23. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/worker.py +56 -30
  24. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/worker_group.py +33 -31
  25. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/request.py +10 -0
  26. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/cli.py +26 -1
  27. guidellm-0.4.0a173/src/guidellm/version.py +6 -0
  28. {guidellm-0.4.0a155 → guidellm-0.4.0a173/src/guidellm.egg-info}/PKG-INFO +1 -1
  29. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/SOURCES.txt +0 -1
  30. guidellm-0.4.0a155/src/guidellm/data/deserializers/deserializer.py +0 -109
  31. guidellm-0.4.0a155/src/guidellm/data/utils/functions.py +0 -18
  32. guidellm-0.4.0a155/src/guidellm/version.py +0 -6
  33. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/LICENSE +0 -0
  34. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/MANIFEST.in +0 -0
  35. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/README.md +0 -0
  36. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/pyproject.toml +0 -0
  37. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/setup.cfg +0 -0
  38. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/setup.py +0 -0
  39. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/__init__.py +0 -0
  40. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/__init__.py +0 -0
  41. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/backend.py +0 -0
  42. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/openai.py +0 -0
  43. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/response_handlers.py +0 -0
  44. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/__init__.py +0 -0
  45. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/profile.py +0 -0
  46. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/progress.py +0 -0
  47. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/__init__.py +0 -0
  48. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/chat.json +0 -0
  49. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/rag.json +0 -0
  50. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/__init__.py +0 -0
  51. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/collators.py +0 -0
  52. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/__init__.py +0 -0
  53. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/__init__.py +0 -0
  54. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/schemas.py +0 -0
  55. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/__init__.py +0 -0
  56. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/audio.py +0 -0
  57. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/vision.py +0 -0
  58. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/logger.py +0 -0
  59. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/__init__.py +0 -0
  60. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/config.py +0 -0
  61. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/__init__.py +0 -0
  62. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/chat_completions.py +0 -0
  63. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/completions.py +0 -0
  64. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/tokenizer.py +0 -0
  65. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/models.py +0 -0
  66. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/server.py +0 -0
  67. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/utils.py +0 -0
  68. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/preprocess/__init__.py +0 -0
  69. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/preprocess/dataset.py +0 -0
  70. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/__init__.py +0 -0
  71. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/builder.py +0 -0
  72. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/data_models.py +0 -0
  73. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/injector.py +0 -0
  74. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/__init__.py +0 -0
  75. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/schemas.py +0 -0
  76. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/__init__.py +0 -0
  77. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/info.py +0 -0
  78. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/response.py +0 -0
  79. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/stats.py +0 -0
  80. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/settings.py +0 -0
  81. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/__init__.py +0 -0
  82. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/auto_importer.py +0 -0
  83. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/colors.py +0 -0
  84. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/console.py +0 -0
  85. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/default_group.py +0 -0
  86. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/dict.py +0 -0
  87. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/encoding.py +0 -0
  88. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/functions.py +0 -0
  89. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/hf_datasets.py +0 -0
  90. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/hf_transformers.py +0 -0
  91. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/imports.py +0 -0
  92. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/messaging.py +0 -0
  93. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/mixins.py +0 -0
  94. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/pydantic_utils.py +0 -0
  95. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/random.py +0 -0
  96. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/registry.py +0 -0
  97. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/singleton.py +0 -0
  98. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/statistics.py +0 -0
  99. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/synchronous.py +0 -0
  100. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/text.py +0 -0
  101. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/typing.py +0 -0
  102. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/dependency_links.txt +0 -0
  103. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/entry_points.txt +0 -0
  104. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/requires.txt +0 -0
  105. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: guidellm
-Version: 0.4.0a155
+Version: 0.4.0a173
 Summary: Guidance platform for deploying and managing large language models.
 Author: Red Hat
 License: Apache-2.0
src/guidellm/__main__.py
@@ -156,8 +156,9 @@ def benchmark():
 )
 @click.option(
     "--rate",
-    type=float,
-    multiple=True,
+    type=str,
+    callback=cli_tools.parse_list_floats,
+    multiple=False,
     default=BenchmarkGenerativeTextArgs.get_default("rate"),
     help=(
         "Benchmark rate(s) to test. Meaning depends on profile: "
@@ -383,7 +384,7 @@ def run(**kwargs):
        kwargs.get("data_args"), default=[], simplify_single=False
    )
    kwargs["rate"] = cli_tools.format_list_arg(
-        kwargs.get("rate"), default=None, simplify_single=True
+        kwargs.get("rate"), default=None, simplify_single=False
    )

    disable_console_outputs = kwargs.pop("disable_console_outputs", False)
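The `--rate` option now takes a single comma-separated string instead of a repeated float flag, with parsing delegated to a callback. The `cli_tools.parse_list_floats` implementation lives in src/guidellm/utils/cli.py (the +26 -1 change above) and is not shown in this diff; a minimal sketch of what such a Click callback plausibly does, with hypothetical details:

# Hypothetical sketch of a callback like cli_tools.parse_list_floats;
# the real implementation in src/guidellm/utils/cli.py is not in this diff.
import click


def parse_list_floats(
    ctx: click.Context, param: click.Parameter, value: str | None
) -> list[float] | None:
    """Parse a comma-separated string such as '1.0,2.5,10' into floats."""
    if value is None:
        return None
    try:
        return [float(part) for part in value.split(",") if part.strip()]
    except ValueError as err:
        raise click.BadParameter(f"invalid rate list: {value!r}") from err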
src/guidellm/benchmark/benchmarker.py
@@ -57,6 +57,7 @@ class Benchmarker(
         backend: BackendInterface[RequestT, ResponseT],
         profile: Profile,
         environment: Environment,
+        data: list[Any],
         progress: BenchmarkerProgress[BenchmarkT] | None = None,
         sample_requests: int | None = 20,
         warmup: float | None = None,
@@ -149,6 +150,7 @@ class Benchmarker(
                 environment=environment,
                 strategy=strategy,
                 constraints=constraints,
+                data=data,
             )
             if progress:
                 await progress.on_benchmark_complete(benchmark)
src/guidellm/benchmark/entrypoints.py
@@ -436,6 +436,7 @@ async def benchmark_generative_text(
         backend=backend,
         profile=profile,
         environment=NonDistributedEnvironment(),
+        data=args.data,
         progress=progress,
         sample_requests=args.sample_requests,
         warmup=args.warmup,
src/guidellm/benchmark/output.py
@@ -649,6 +649,8 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         status_dist_summary: StatusDistributionSummary = getattr(
             benchmark.metrics, metric
         )
+        if not hasattr(status_dist_summary, status):
+            return [], []
         dist_summary: DistributionSummary = getattr(status_dist_summary, status)

         headers = [
@@ -688,7 +690,7 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         values: list[str] = [
             benchmark.benchmarker.profile.model_dump_json(),
             json.dumps(benchmark.benchmarker.backend),
-            json.dumps(benchmark.benchmarker.requests["attributes"]["data"]),
+            json.dumps(benchmark.benchmarker.requests["data"]),
         ]

         if len(headers) != len(values):
src/guidellm/benchmark/schemas.py
@@ -1674,6 +1674,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
         environment: Environment,
         strategy: SchedulingStrategy,
         constraints: dict[str, dict[str, Any]],
+        data: list[Any],
     ) -> GenerativeBenchmark:
         """
         Compile final generative benchmark from accumulated state.
@@ -1702,7 +1703,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
             ),
             benchmarker=BenchmarkerDict(
                 profile=profile,
-                requests=InfoMixin.extract_from_obj(requests),
+                requests={"data": data},
                 backend=backend.info,
                 environment=environment.info,
             ),
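Taken together, the four hunks above thread the user-supplied `data` argument from the CLI entrypoint through `Benchmarker` into the compiled benchmark, so `benchmark.benchmarker.requests` becomes a plain mapping instead of `InfoMixin`-extracted metadata; the CSV writer's `requests["data"]` lookup depends on this shape. A minimal sketch of the resulting payload, with illustrative values:

# Shape of BenchmarkerDict.requests after this change; values are illustrative.
import json
from typing import Any

data: list[Any] = ["prompt_tokens=256,output_tokens=128"]  # e.g. the --data argument
requests: dict[str, Any] = {"data": data}
# GenerativeBenchmarkerCSV can now serialize it directly:
assert json.dumps(requests["data"]) == '["prompt_tokens=256,output_tokens=128"]'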
guidellm-0.4.0a173/src/guidellm/data/deserializers/deserializer.py (new file)
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any, Protocol, Union, runtime_checkable
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.utils import resolve_dataset_split
+from guidellm.utils import RegistryMixin
+
+__all__ = [
+    "DataNotSupportedError",
+    "DatasetDeserializer",
+    "DatasetDeserializerFactory",
+]
+
+
+class DataNotSupportedError(Exception):
+    """Exception raised when data format is not supported by deserializer."""
+
+
+@runtime_checkable
+class DatasetDeserializer(Protocol):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
+
+
+class DatasetDeserializerFactory(
+    RegistryMixin[Union["type[DatasetDeserializer]", DatasetDeserializer]],
+):
+    @classmethod
+    def deserialize(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        type_: str | None = None,
+        resolve_split: bool = True,
+        select_columns: list[str] | None = None,
+        remove_columns: list[str] | None = None,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset | IterableDataset:
+        dataset: Dataset
+
+        if type_ is None:
+            dataset = cls._deserialize_with_registered_deserializers(
+                data, processor_factory, random_seed, **data_kwargs
+            )
+
+        else:
+            dataset = cls._deserialize_with_specified_deserializer(
+                data, type_, processor_factory, random_seed, **data_kwargs
+            )
+
+        if resolve_split:
+            dataset = resolve_dataset_split(dataset)
+
+        if select_columns is not None or remove_columns is not None:
+            column_names = dataset.column_names or list(next(iter(dataset)).keys())
+            if select_columns is not None:
+                remove_columns = [
+                    col for col in column_names if col not in select_columns
+                ]
+
+            dataset = dataset.remove_columns(remove_columns)
+
+        return dataset
+
+    @classmethod
+    def _deserialize_with_registered_deserializers(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        if cls.registry is None:
+            raise RuntimeError("registry is None; cannot deserialize dataset")
+        dataset: Dataset | None = None
+
+        errors: dict[str, Exception] = {}
+        # Note: There is no priority order for the deserializers, so all deserializers
+        # must be mutually exclusive to ensure deterministic behavior.
+        for _name, deserializer in cls.registry.items():
+            deserializer_fn: DatasetDeserializer = (
+                deserializer() if isinstance(deserializer, type) else deserializer
+            )
+
+            try:
+                dataset = deserializer_fn(
+                    data=data,
+                    processor_factory=processor_factory,
+                    random_seed=random_seed,
+                    **data_kwargs,
+                )
+            except Exception as e:  # noqa: BLE001  # The exceptions are saved.
+                errors[_name] = e
+
+            if dataset is not None:
+                return dataset  # Success
+
+        if len(errors) > 0:
+            err_msgs = ""
+            def sort_key(item):
+                return (isinstance(item[1], DataNotSupportedError), item[0])
+            for key, err in sorted(errors.items(), key=sort_key):
+                err_msgs += f"\n - Deserializer '{key}': ({type(err).__name__}) {err}"
+            raise ValueError(
+                "Data deserialization failed, likely because the input doesn't "
+                f"match any of the input formats. See the {len(errors)} error(s) that "
+                f"occurred while attempting to deserialize the data {data}:{err_msgs}"
+            )
+        return dataset
+
+    @classmethod
+    def _deserialize_with_specified_deserializer(
+        cls,
+        data: Any,
+        type_: str,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        deserializer_from_type = cls.get_registered_object(type_)
+        if deserializer_from_type is None:
+            raise ValueError(f"Deserializer type '{type_}' is not registered.")
+        if isinstance(deserializer_from_type, type):
+            deserializer_fn = deserializer_from_type()
+        else:
+            deserializer_fn = deserializer_from_type
+
+        return deserializer_fn(
+            data=data,
+            processor_factory=processor_factory,
+            random_seed=random_seed,
+            **data_kwargs,
+        )
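The rewritten factory tries every registered deserializer, returns on the first success, and keeps per-deserializer errors for the failure message (with `DataNotSupportedError` sorted last, since those are expected misses). A minimal usage sketch; the file name and tokenizer below are illustrative assumptions, not taken from this diff:

# Illustrative call into the new factory; data path and tokenizer are assumptions.
from transformers import AutoTokenizer

from guidellm.data.deserializers.deserializer import DatasetDeserializerFactory

dataset = DatasetDeserializerFactory.deserialize(
    data="data.jsonl",                                   # routed to a file deserializer
    processor_factory=lambda: AutoTokenizer.from_pretrained("gpt2"),
    random_seed=42,
    type_=None,                # None -> try every registered deserializer
    resolve_split=True,        # collapse a DatasetDict down to one split
    select_columns=["prompt"],  # drop every other column
)
print(dataset.column_names)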
src/guidellm/data/deserializers/file.py
@@ -34,11 +34,11 @@ class TextFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".txt", ".text"}
@@ -62,10 +62,10 @@ class CSVFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".csv"
@@ -86,10 +86,10 @@ class JSONFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".json", ".jsonl"}
@@ -110,10 +110,10 @@ class ParquetFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".parquet"
@@ -134,10 +134,10 @@ class ArrowFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".arrow"
@@ -158,10 +158,10 @@ class HDF5FileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".hdf5", ".h5"}
@@ -185,7 +185,7 @@ class DBFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".db"
@@ -209,7 +209,7 @@ class TarFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".tar"
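The repeated `isinstance(data, (str, Path))` to `isinstance(data, str | Path)` change adopts PEP 604 union syntax, which `isinstance` accepts at runtime on Python 3.10+. A quick check of the equivalence:

# PEP 604 unions are valid as the second argument to isinstance on Python 3.10+.
from pathlib import Path

data = "dataset.csv"
assert isinstance(data, (str, Path)) == isinstance(data, str | Path)  # both True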
src/guidellm/data/deserializers/huggingface.py
@@ -36,7 +36,7 @@ class HuggingFaceDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
         _ = (processor_factory, random_seed)

         if isinstance(
src/guidellm/data/deserializers/memory.py
@@ -33,7 +33,7 @@ class InMemoryDictDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
@@ -67,7 +67,7 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
@@ -81,9 +81,9 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 f"expected list of dicts, got {data}"
             )

-        data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
-        first_keys = set(data[0].keys())
-        for index, item in enumerate(data):
+        typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+        first_keys = set(typed_data[0].keys())
+        for index, item in enumerate(typed_data):
             if set(item.keys()) != first_keys:
                 raise DataNotSupportedError(
                     f"All dictionaries must have the same keys. "
@@ -92,8 +92,8 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 )

         # Convert list of dicts to dict of lists
-        result_dict = {key: [] for key in first_keys}
-        for item in data:
+        result_dict: dict = {key: [] for key in first_keys}
+        for item in typed_data:
             for key, value in item.items():
                 result_dict[key].append(value)

@@ -108,7 +108,7 @@ class InMemoryItemListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         primitive_types = (str, int, float, bool, type(None))
@@ -135,7 +135,7 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (json_str := data.strip())
@@ -145,16 +145,18 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
             )
         ):
             with contextlib.suppress(Exception):
-                parsed = json.loads(data)
+                parsed_data = json.loads(data)

-                for deserializer in [
-                    InMemoryDictDatasetDeserializer,
-                    InMemoryDictListDatasetDeserializer,
-                    InMemoryItemListDatasetDeserializer,
-                ]:
+                deserializers = [
+                    InMemoryDictDatasetDeserializer(),
+                    InMemoryDictListDatasetDeserializer(),
+                    InMemoryItemListDatasetDeserializer(),
+                ]
+
+                for deserializer in deserializers:
                     with contextlib.suppress(DataNotSupportedError):
-                        return deserializer()(
-                            parsed, data_kwargs, processor_factory, random_seed
+                        return deserializer(
+                            parsed_data, processor_factory, random_seed, **data_kwargs
                         )

         raise DataNotSupportedError(
@@ -171,7 +173,7 @@ class InMemoryCsvDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (csv_str := data.strip())
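The JSON-string path previously passed `data_kwargs` positionally into the `processor_factory` slot; the rewrite instantiates each deserializer once and forwards keyword arguments, matching the `DatasetDeserializer` protocol from the new deserializer.py. Calling a memory deserializer the way the fixed code does (payload and tokenizer here are illustrative assumptions):

# Sketch of the corrected call signature; payload and tokenizer are assumptions.
from transformers import AutoTokenizer

from guidellm.data.deserializers.memory import InMemoryDictDatasetDeserializer

deserializer = InMemoryDictDatasetDeserializer()
dataset = deserializer(
    data={"prompt": ["hello", "world"]},  # in-memory dict of column -> values
    processor_factory=lambda: AutoTokenizer.from_pretrained("gpt2"),
    random_seed=42,
)
print(len(dataset))  # 2 rows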
src/guidellm/data/deserializers/synthetic.py
@@ -99,21 +99,23 @@ class SyntheticTextDatasetConfig(StandardBaseModel):

     @model_validator(mode="after")
     def check_prefix_options(self) -> SyntheticTextDatasetConfig:
-        prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
-        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
-        if prefix_count is not None or prefix_tokens is not None:
-            if self.prefix_buckets:
-                raise ValueError(
-                    "prefix_buckets is mutually exclusive"
-                    " with prefix_count and prefix_tokens"
-                )
+        if self.__pydantic_extra__ is not None:
+            prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+            prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
+
+            if prefix_count is not None or prefix_tokens is not None:
+                if self.prefix_buckets:
+                    raise ValueError(
+                        "prefix_buckets is mutually exclusive"
+                        " with prefix_count and prefix_tokens"
+                    )

-        self.prefix_buckets = [
-            SyntheticTextPrefixBucketConfig(
-                prefix_count=prefix_count or 1,
-                prefix_tokens=prefix_tokens or 0,
-            )
-        ]
+                self.prefix_buckets = [
+                    SyntheticTextPrefixBucketConfig(
+                        prefix_count=prefix_count or 1,
+                        prefix_tokens=prefix_tokens or 0,
+                    )
+                ]

         return self

@@ -174,14 +176,14 @@ class SyntheticTextGenerator:
     def _create_prompt(
         self, prompt_tokens_count: int, faker: Faker, unique: str = ""
     ) -> str:
-        prompt_token_ids = []
+        prompt_token_ids: list[int] = []
         avg_chars_per_token = 5
         margin_of_safety = 1.5
         attempts = 0

         while len(prompt_token_ids) < prompt_tokens_count:
             attempts += 1
-            num_chars = (
+            num_chars = int(
                 prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
             )
             text = unique + faker.text(max_nb_chars=num_chars)
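The validator now guards against `__pydantic_extra__` being `None` before promoting the legacy `prefix_count`/`prefix_tokens` extras into `prefix_buckets`, and the promotion only happens when one of the extras is set. A sketch of the intended effect; the `prompt_tokens`/`output_tokens` fields are assumed from the surrounding config, not shown in this diff:

# Sketch of the validator's effect; prompt_tokens/output_tokens are assumed fields.
from guidellm.data.deserializers.synthetic import SyntheticTextDatasetConfig

config = SyntheticTextDatasetConfig(
    prompt_tokens=256,   # assumed existing field
    output_tokens=128,   # assumed existing field
    prefix_count=2,      # extra key, captured by __pydantic_extra__
    prefix_tokens=32,    # extra key, captured by __pydantic_extra__
)
# check_prefix_options promoted the extras into a single bucket:
assert config.prefix_buckets is not None
assert config.prefix_buckets[0].prefix_count == 2
assert config.prefix_buckets[0].prefix_tokens == 32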
src/guidellm/data/loaders.py
@@ -17,6 +17,7 @@ from guidellm.logger import logger
 __all__ = ["DataLoader", "DatasetsIterator"]


+
 class DatasetsIterator(TorchIterableDataset):
     def __init__(
         self,
@@ -85,7 +86,7 @@ class DatasetsIterator(TorchIterableDataset):

         while max_items is None or gen_count < max_items:
             try:
-                row = {
+                row: dict[str, Any] = {
                     "items": [next(dataset_iter) for dataset_iter in dataset_iters]
                 }
                 gen_count += 1
@@ -98,9 +99,12 @@ class DatasetsIterator(TorchIterableDataset):
                     continue

                 for preprocessor in self.preprocessors:
-                    row = preprocessor(row)
+                    # This can assign a GenerationRequest, which would then be
+                    # passed into the preprocessor, which is a type violation.
+                    # This should be fixed at some point.
+                    row = preprocessor(row)  # type: ignore[assignment]
                 yield row
-            except Exception as err:
+            except Exception as err:  # noqa: BLE001  # Exception logged
                 logger.error(f"Skipping data row due to error: {err}")
                 gen_count -= 1
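The new `# type: ignore[assignment]` flags a real looseness the comment describes: preprocessors are chained, and a formatter at the end of the chain returns a `GenerationRequest` rather than a row dict. A sketch of the chaining the iterator performs, with hypothetical stand-in preprocessors:

# Sketch of preprocessor chaining in DatasetsIterator; both steps are hypothetical.
from typing import Any, Callable

def add_prefix(row: dict[str, Any]) -> dict[str, Any]:
    row["prefix_column"] = ["You are a helpful assistant. "]
    return row

def to_request(row: dict[str, Any]) -> str:  # stands in for a RequestFormatter
    return "".join(row.get("prefix_column", [])) + "".join(row.get("text_column", []))

preprocessors: list[Callable[[Any], Any]] = [add_prefix, to_request]
row: Any = {"text_column": ["Hello"]}
for preprocessor in preprocessors:
    row = preprocessor(row)  # final step changes the type, as the comment notes
print(row)  # "You are a helpful assistant. Hello"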
src/guidellm/data/preprocessors/formatters.py
@@ -7,8 +7,6 @@ from guidellm.data.preprocessors.preprocessor import (
     DatasetPreprocessor,
     PreprocessorRegistry,
 )
-from guidellm.data.schemas import GenerativeDatasetColumnType
-from guidellm.data.utils import text_stats
 from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics

 __all__ = [
@@ -59,9 +57,13 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         self.max_tokens: int | None = max_tokens or max_completion_tokens

     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments: GenerationRequestArguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments: GenerationRequestArguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works better setting this field here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -99,10 +101,9 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)

         return GenerationRequest(
             request_type="text_completions",
@@ -142,9 +143,13 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
         )

     def __call__(  # noqa: C901, PLR0912, PLR0915
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -191,27 +196,14 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
             if not prefix:
                 continue

-            stats = text_stats(prefix)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
-
+            input_metrics.add_text_metrics(prefix)
             arguments.body["messages"].append({"role": "system", "content": prefix})

         for text in columns.get("text_column", []):
             if not text:
                 continue

-            stats = text_stats(text)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+            input_metrics.add_text_metrics(text)

             arguments.body["messages"].append(
                 {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -329,9 +321,10 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         self.encode_audio_kwargs = encode_kwargs or {}

     def __call__(  # noqa: C901
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={}, files={})
+        arguments = GenerationRequestArguments(files={})
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -387,10 +380,9 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)

         return GenerationRequest(
             request_type="audio_transcriptions",
@@ -405,7 +397,7 @@ class GenerativeAudioTranslationRequestFormatter(
     GenerativeAudioTranscriptionRequestFormatter
 ):
     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
         result = super().__call__(columns)
         result.request_type = "audio_translations"
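Every formatter now delegates character and word counting to `UsageMetrics.add_text_metrics`, replacing the repeated `text_stats` bookkeeping. That method lives in the guidellm schemas and is not part of this diff; a plausible sketch of what the replaced code implies it does:

# Hypothetical sketch of UsageMetrics.add_text_metrics, inferred from the
# text_stats bookkeeping it replaces; the real method is not in this diff.
from pydantic import BaseModel


class UsageMetrics(BaseModel):
    text_characters: int | None = None
    text_words: int | None = None

    def add_text_metrics(self, text: str) -> None:
        """Accumulate character and word counts for a piece of input text."""
        self.text_characters = (self.text_characters or 0) + len(text)
        self.text_words = (self.text_words or 0) + len(text.split())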
src/guidellm/data/preprocessors/mappers.py
@@ -169,12 +169,12 @@ class GenerativeColumnMapper(DataDependentPreprocessor):

     def __call__(
         self, row: dict[str, Any]
-    ) -> dict[GenerativeDatasetColumnType, list[Any]]:
+    ) -> dict[str, list[Any]]:
         if self.datasets_column_mappings is None:
             raise ValueError("DefaultGenerativeColumnMapper not setup with data.")

         items = cast("dict[int, dict[str, Any]]", row.pop("items"))
-        mapped: dict[GenerativeDatasetColumnType, list[Any]] = defaultdict(list)
+        mapped: dict[str, Any] = defaultdict(list)

         for column_type, column_mappings in self.datasets_column_mappings.items():
             for (