guidellm 0.4.0a155__tar.gz → 0.4.0a173__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (105)
  1. {guidellm-0.4.0a155/src/guidellm.egg-info → guidellm-0.4.0a173}/PKG-INFO +1 -1
  2. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/__main__.py +4 -3
  3. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/benchmarker.py +2 -0
  4. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/entrypoints.py +1 -0
  5. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/output.py +3 -1
  6. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/schemas.py +2 -1
  7. guidellm-0.4.0a173/src/guidellm/data/deserializers/deserializer.py +144 -0
  8. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/file.py +14 -14
  9. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/huggingface.py +1 -1
  10. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/memory.py +20 -18
  11. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/synthetic.py +18 -16
  12. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/loaders.py +7 -3
  13. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/formatters.py +24 -32
  14. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/mappers.py +2 -2
  15. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/preprocessor.py +5 -3
  16. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/processor.py +3 -2
  17. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/utils/__init__.py +0 -4
  18. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/utils/dataset.py +2 -2
  19. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/constraints.py +1 -3
  20. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/environments.py +2 -2
  21. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/scheduler.py +1 -1
  22. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/strategies.py +31 -4
  23. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/worker.py +56 -30
  24. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/worker_group.py +33 -31
  25. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/request.py +10 -0
  26. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/cli.py +26 -1
  27. guidellm-0.4.0a173/src/guidellm/version.py +6 -0
  28. {guidellm-0.4.0a155 → guidellm-0.4.0a173/src/guidellm.egg-info}/PKG-INFO +1 -1
  29. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/SOURCES.txt +0 -1
  30. guidellm-0.4.0a155/src/guidellm/data/deserializers/deserializer.py +0 -109
  31. guidellm-0.4.0a155/src/guidellm/data/utils/functions.py +0 -18
  32. guidellm-0.4.0a155/src/guidellm/version.py +0 -6
  33. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/LICENSE +0 -0
  34. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/MANIFEST.in +0 -0
  35. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/README.md +0 -0
  36. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/pyproject.toml +0 -0
  37. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/setup.cfg +0 -0
  38. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/setup.py +0 -0
  39. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/__init__.py +0 -0
  40. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/__init__.py +0 -0
  41. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/backend.py +0 -0
  42. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/openai.py +0 -0
  43. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/backends/response_handlers.py +0 -0
  44. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/__init__.py +0 -0
  45. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/profile.py +0 -0
  46. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/progress.py +0 -0
  47. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/__init__.py +0 -0
  48. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/chat.json +0 -0
  49. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/benchmark/scenarios/rag.json +0 -0
  50. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/__init__.py +0 -0
  51. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/collators.py +0 -0
  52. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/deserializers/__init__.py +0 -0
  53. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/preprocessors/__init__.py +0 -0
  54. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/data/schemas.py +0 -0
  55. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/__init__.py +0 -0
  56. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/audio.py +0 -0
  57. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/extras/vision.py +0 -0
  58. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/logger.py +0 -0
  59. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/__init__.py +0 -0
  60. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/config.py +0 -0
  61. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/__init__.py +0 -0
  62. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/chat_completions.py +0 -0
  63. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/completions.py +0 -0
  64. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/handlers/tokenizer.py +0 -0
  65. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/models.py +0 -0
  66. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/server.py +0 -0
  67. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/mock_server/utils.py +0 -0
  68. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/preprocess/__init__.py +0 -0
  69. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/preprocess/dataset.py +0 -0
  70. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/__init__.py +0 -0
  71. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/builder.py +0 -0
  72. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/data_models.py +0 -0
  73. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/presentation/injector.py +0 -0
  74. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/__init__.py +0 -0
  75. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/scheduler/schemas.py +0 -0
  76. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/__init__.py +0 -0
  77. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/info.py +0 -0
  78. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/response.py +0 -0
  79. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/schemas/stats.py +0 -0
  80. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/settings.py +0 -0
  81. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/__init__.py +0 -0
  82. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/auto_importer.py +0 -0
  83. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/colors.py +0 -0
  84. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/console.py +0 -0
  85. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/default_group.py +0 -0
  86. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/dict.py +0 -0
  87. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/encoding.py +0 -0
  88. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/functions.py +0 -0
  89. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/hf_datasets.py +0 -0
  90. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/hf_transformers.py +0 -0
  91. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/imports.py +0 -0
  92. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/messaging.py +0 -0
  93. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/mixins.py +0 -0
  94. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/pydantic_utils.py +0 -0
  95. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/random.py +0 -0
  96. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/registry.py +0 -0
  97. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/singleton.py +0 -0
  98. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/statistics.py +0 -0
  99. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/synchronous.py +0 -0
  100. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/text.py +0 -0
  101. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm/utils/typing.py +0 -0
  102. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/dependency_links.txt +0 -0
  103. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/entry_points.txt +0 -0
  104. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/requires.txt +0 -0
  105. {guidellm-0.4.0a155 → guidellm-0.4.0a173}/src/guidellm.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: guidellm
-Version: 0.4.0a155
+Version: 0.4.0a173
 Summary: Guidance platform for deploying and managing large language models.
 Author: Red Hat
 License: Apache-2.0
src/guidellm/__main__.py
@@ -156,8 +156,9 @@ def benchmark():
 )
 @click.option(
     "--rate",
-    type=float,
-    multiple=True,
+    type=str,
+    callback=cli_tools.parse_list_floats,
+    multiple=False,
     default=BenchmarkGenerativeTextArgs.get_default("rate"),
     help=(
         "Benchmark rate(s) to test. Meaning depends on profile: "
@@ -383,7 +384,7 @@ def run(**kwargs):
        kwargs.get("data_args"), default=[], simplify_single=False
    )
    kwargs["rate"] = cli_tools.format_list_arg(
-        kwargs.get("rate"), default=None, simplify_single=True
+        kwargs.get("rate"), default=None, simplify_single=False
    )

    disable_console_outputs = kwargs.pop("disable_console_outputs", False)
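The `--rate` option now takes a single comma-separated string instead of a repeated float flag, with parsing delegated to a callback. The `cli_tools.parse_list_floats` implementation lives in src/guidellm/utils/cli.py (the +26 -1 change above) and is not shown in this diff; a minimal sketch of what such a Click callback plausibly does, with hypothetical details:

# Hypothetical sketch of a callback like cli_tools.parse_list_floats;
# the real implementation in src/guidellm/utils/cli.py is not in this diff.
import click


def parse_list_floats(
    ctx: click.Context, param: click.Parameter, value: str | None
) -> list[float] | None:
    """Parse a comma-separated string such as '1.0,2.5,10' into floats."""
    if value is None:
        return None
    try:
        return [float(part) for part in value.split(",") if part.strip()]
    except ValueError as err:
        raise click.BadParameter(f"invalid rate list: {value!r}") from err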
src/guidellm/benchmark/benchmarker.py
@@ -57,6 +57,7 @@ class Benchmarker(
         backend: BackendInterface[RequestT, ResponseT],
         profile: Profile,
         environment: Environment,
+        data: list[Any],
         progress: BenchmarkerProgress[BenchmarkT] | None = None,
         sample_requests: int | None = 20,
         warmup: float | None = None,
@@ -149,6 +150,7 @@ class Benchmarker(
                 environment=environment,
                 strategy=strategy,
                 constraints=constraints,
+                data=data,
             )
             if progress:
                 await progress.on_benchmark_complete(benchmark)
src/guidellm/benchmark/entrypoints.py
@@ -436,6 +436,7 @@ async def benchmark_generative_text(
         backend=backend,
         profile=profile,
         environment=NonDistributedEnvironment(),
+        data=args.data,
         progress=progress,
         sample_requests=args.sample_requests,
         warmup=args.warmup,
src/guidellm/benchmark/output.py
@@ -649,6 +649,8 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         status_dist_summary: StatusDistributionSummary = getattr(
             benchmark.metrics, metric
         )
+        if not hasattr(status_dist_summary, status):
+            return [], []
         dist_summary: DistributionSummary = getattr(status_dist_summary, status)

         headers = [
@@ -688,7 +690,7 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         values: list[str] = [
             benchmark.benchmarker.profile.model_dump_json(),
             json.dumps(benchmark.benchmarker.backend),
-            json.dumps(benchmark.benchmarker.requests["attributes"]["data"]),
+            json.dumps(benchmark.benchmarker.requests["data"]),
         ]

         if len(headers) != len(values):
src/guidellm/benchmark/schemas.py
@@ -1674,6 +1674,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
         environment: Environment,
         strategy: SchedulingStrategy,
         constraints: dict[str, dict[str, Any]],
+        data: list[Any],
     ) -> GenerativeBenchmark:
         """
         Compile final generative benchmark from accumulated state.
@@ -1702,7 +1703,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
             ),
             benchmarker=BenchmarkerDict(
                 profile=profile,
-                requests=InfoMixin.extract_from_obj(requests),
+                requests={"data": data},
                 backend=backend.info,
                 environment=environment.info,
             ),
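Taken together, the four hunks above thread the user-supplied `data` argument from the CLI entrypoint through `Benchmarker` into the compiled benchmark, so `benchmark.benchmarker.requests` becomes a plain mapping instead of `InfoMixin`-extracted metadata; the CSV writer's `requests["data"]` lookup depends on this shape. A minimal sketch of the resulting payload, with illustrative values:

# Shape of BenchmarkerDict.requests after this change; values are illustrative.
import json
from typing import Any

data: list[Any] = ["prompt_tokens=256,output_tokens=128"]  # e.g. the --data argument
requests: dict[str, Any] = {"data": data}
# GenerativeBenchmarkerCSV can now serialize it directly:
assert json.dumps(requests["data"]) == '["prompt_tokens=256,output_tokens=128"]'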
guidellm-0.4.0a173/src/guidellm/data/deserializers/deserializer.py (new file)
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any, Protocol, Union, runtime_checkable
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.utils import resolve_dataset_split
+from guidellm.utils import RegistryMixin
+
+__all__ = [
+    "DataNotSupportedError",
+    "DatasetDeserializer",
+    "DatasetDeserializerFactory",
+]
+
+
+class DataNotSupportedError(Exception):
+    """Exception raised when data format is not supported by deserializer."""
+
+
+@runtime_checkable
+class DatasetDeserializer(Protocol):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
+
+
+class DatasetDeserializerFactory(
+    RegistryMixin[Union["type[DatasetDeserializer]", DatasetDeserializer]],
+):
+    @classmethod
+    def deserialize(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        type_: str | None = None,
+        resolve_split: bool = True,
+        select_columns: list[str] | None = None,
+        remove_columns: list[str] | None = None,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset | IterableDataset:
+        dataset: Dataset
+
+        if type_ is None:
+            dataset = cls._deserialize_with_registered_deserializers(
+                data, processor_factory, random_seed, **data_kwargs
+            )
+
+        else:
+            dataset = cls._deserialize_with_specified_deserializer(
+                data, type_, processor_factory, random_seed, **data_kwargs
+            )
+
+        if resolve_split:
+            dataset = resolve_dataset_split(dataset)
+
+        if select_columns is not None or remove_columns is not None:
+            column_names = dataset.column_names or list(next(iter(dataset)).keys())
+            if select_columns is not None:
+                remove_columns = [
+                    col for col in column_names if col not in select_columns
+                ]
+
+            dataset = dataset.remove_columns(remove_columns)
+
+        return dataset
+
+    @classmethod
+    def _deserialize_with_registered_deserializers(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        if cls.registry is None:
+            raise RuntimeError("registry is None; cannot deserialize dataset")
+        dataset: Dataset | None = None
+
+        errors: dict[str, Exception] = {}
+        # Note: There is no priority order for the deserializers, so all deserializers
+        # must be mutually exclusive to ensure deterministic behavior.
+        for _name, deserializer in cls.registry.items():
+            deserializer_fn: DatasetDeserializer = (
+                deserializer() if isinstance(deserializer, type) else deserializer
+            )
+
+            try:
+                dataset = deserializer_fn(
+                    data=data,
+                    processor_factory=processor_factory,
+                    random_seed=random_seed,
+                    **data_kwargs,
+                )
+            except Exception as e:  # noqa: BLE001  # The exceptions are saved.
+                errors[_name] = e
+
+            if dataset is not None:
+                return dataset  # Success
+
+        if len(errors) > 0:
+            err_msgs = ""
+            def sort_key(item):
+                return (isinstance(item[1], DataNotSupportedError), item[0])
+            for key, err in sorted(errors.items(), key=sort_key):
+                err_msgs += f"\n - Deserializer '{key}': ({type(err).__name__}) {err}"
+            raise ValueError(
+                "Data deserialization failed, likely because the input doesn't "
+                f"match any of the input formats. See the {len(errors)} error(s) that "
+                f"occurred while attempting to deserialize the data {data}:{err_msgs}"
+            )
+        return dataset
+
+    @classmethod
+    def _deserialize_with_specified_deserializer(
+        cls,
+        data: Any,
+        type_: str,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        deserializer_from_type = cls.get_registered_object(type_)
+        if deserializer_from_type is None:
+            raise ValueError(f"Deserializer type '{type_}' is not registered.")
+        if isinstance(deserializer_from_type, type):
+            deserializer_fn = deserializer_from_type()
+        else:
+            deserializer_fn = deserializer_from_type
+
+        return deserializer_fn(
+            data=data,
+            processor_factory=processor_factory,
+            random_seed=random_seed,
+            **data_kwargs,
+        )
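The rewritten factory tries every registered deserializer, returns on the first success, and keeps per-deserializer errors for the failure message (with `DataNotSupportedError` sorted last, since those are expected misses). A minimal usage sketch; the file name and tokenizer below are illustrative assumptions, not taken from this diff:

# Illustrative call into the new factory; data path and tokenizer are assumptions.
from transformers import AutoTokenizer

from guidellm.data.deserializers.deserializer import DatasetDeserializerFactory

dataset = DatasetDeserializerFactory.deserialize(
    data="data.jsonl",                                   # routed to a file deserializer
    processor_factory=lambda: AutoTokenizer.from_pretrained("gpt2"),
    random_seed=42,
    type_=None,                # None -> try every registered deserializer
    resolve_split=True,        # collapse a DatasetDict down to one split
    select_columns=["prompt"],  # drop every other column
)
print(dataset.column_names)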
src/guidellm/data/deserializers/file.py
@@ -34,11 +34,11 @@ class TextFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".txt", ".text"}
@@ -62,10 +62,10 @@ class CSVFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".csv"
@@ -86,10 +86,10 @@ class JSONFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".json", ".jsonl"}
@@ -110,10 +110,10 @@ class ParquetFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".parquet"
@@ -134,10 +134,10 @@ class ArrowFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".arrow"
@@ -158,10 +158,10 @@ class HDF5FileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".hdf5", ".h5"}
@@ -185,7 +185,7 @@ class DBFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".db"
@@ -209,7 +209,7 @@ class TarFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".tar"
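The repeated `isinstance(data, (str, Path))` to `isinstance(data, str | Path)` change adopts PEP 604 union syntax, which `isinstance` accepts at runtime on Python 3.10+. A quick check of the equivalence:

# PEP 604 unions are valid as the second argument to isinstance on Python 3.10+.
from pathlib import Path

data = "dataset.csv"
assert isinstance(data, (str, Path)) == isinstance(data, str | Path)  # both True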
src/guidellm/data/deserializers/huggingface.py
@@ -36,7 +36,7 @@ class HuggingFaceDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
         _ = (processor_factory, random_seed)

         if isinstance(
src/guidellm/data/deserializers/memory.py
@@ -33,7 +33,7 @@ class InMemoryDictDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
@@ -67,7 +67,7 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         if (
@@ -81,9 +81,9 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 f"expected list of dicts, got {data}"
             )

-        data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
-        first_keys = set(data[0].keys())
-        for index, item in enumerate(data):
+        typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+        first_keys = set(typed_data[0].keys())
+        for index, item in enumerate(typed_data):
             if set(item.keys()) != first_keys:
                 raise DataNotSupportedError(
                     f"All dictionaries must have the same keys. "
@@ -92,8 +92,8 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 )

         # Convert list of dicts to dict of lists
-        result_dict = {key: [] for key in first_keys}
-        for item in data:
+        result_dict: dict = {key: [] for key in first_keys}
+        for item in typed_data:
             for key, value in item.items():
                 result_dict[key].append(value)

@@ -108,7 +108,7 @@ class InMemoryItemListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors

         primitive_types = (str, int, float, bool, type(None))
@@ -135,7 +135,7 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (json_str := data.strip())
@@ -145,16 +145,18 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
             )
         ):
             with contextlib.suppress(Exception):
-                parsed = json.loads(data)
+                parsed_data = json.loads(data)

-                for deserializer in [
-                    InMemoryDictDatasetDeserializer,
-                    InMemoryDictListDatasetDeserializer,
-                    InMemoryItemListDatasetDeserializer,
-                ]:
+                deserializers = [
+                    InMemoryDictDatasetDeserializer(),
+                    InMemoryDictListDatasetDeserializer(),
+                    InMemoryItemListDatasetDeserializer(),
+                ]
+
+                for deserializer in deserializers:
                     with contextlib.suppress(DataNotSupportedError):
-                        return deserializer()(
-                            parsed, data_kwargs, processor_factory, random_seed
+                        return deserializer(
+                            parsed_data, processor_factory, random_seed, **data_kwargs
                         )

         raise DataNotSupportedError(
@@ -171,7 +173,7 @@ class InMemoryCsvDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (csv_str := data.strip())
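The JSON-string path previously passed `data_kwargs` positionally into the `processor_factory` slot; the rewrite instantiates each deserializer once and forwards keyword arguments, matching the `DatasetDeserializer` protocol from the new deserializer.py. Calling a memory deserializer the way the fixed code does (payload and tokenizer here are illustrative assumptions):

# Sketch of the corrected call signature; payload and tokenizer are assumptions.
from transformers import AutoTokenizer

from guidellm.data.deserializers.memory import InMemoryDictDatasetDeserializer

deserializer = InMemoryDictDatasetDeserializer()
dataset = deserializer(
    data={"prompt": ["hello", "world"]},  # in-memory dict of column -> values
    processor_factory=lambda: AutoTokenizer.from_pretrained("gpt2"),
    random_seed=42,
)
print(len(dataset))  # 2 rows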
src/guidellm/data/deserializers/synthetic.py
@@ -99,21 +99,23 @@ class SyntheticTextDatasetConfig(StandardBaseModel):

     @model_validator(mode="after")
     def check_prefix_options(self) -> SyntheticTextDatasetConfig:
-        prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
-        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
-        if prefix_count is not None or prefix_tokens is not None:
-            if self.prefix_buckets:
-                raise ValueError(
-                    "prefix_buckets is mutually exclusive"
-                    " with prefix_count and prefix_tokens"
-                )
+        if self.__pydantic_extra__ is not None:
+            prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+            prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
+
+            if prefix_count is not None or prefix_tokens is not None:
+                if self.prefix_buckets:
+                    raise ValueError(
+                        "prefix_buckets is mutually exclusive"
+                        " with prefix_count and prefix_tokens"
+                    )

-        self.prefix_buckets = [
-            SyntheticTextPrefixBucketConfig(
-                prefix_count=prefix_count or 1,
-                prefix_tokens=prefix_tokens or 0,
-            )
-        ]
+                self.prefix_buckets = [
+                    SyntheticTextPrefixBucketConfig(
+                        prefix_count=prefix_count or 1,
+                        prefix_tokens=prefix_tokens or 0,
+                    )
+                ]

         return self

@@ -174,14 +176,14 @@ class SyntheticTextGenerator:
     def _create_prompt(
         self, prompt_tokens_count: int, faker: Faker, unique: str = ""
     ) -> str:
-        prompt_token_ids = []
+        prompt_token_ids: list[int] = []
         avg_chars_per_token = 5
         margin_of_safety = 1.5
         attempts = 0

         while len(prompt_token_ids) < prompt_tokens_count:
             attempts += 1
-            num_chars = (
+            num_chars = int(
                 prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
             )
             text = unique + faker.text(max_nb_chars=num_chars)
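The validator now guards against `__pydantic_extra__` being `None` before promoting the legacy `prefix_count`/`prefix_tokens` extras into `prefix_buckets`, and the promotion only happens when one of the extras is set. A sketch of the intended effect; the `prompt_tokens`/`output_tokens` fields are assumed from the surrounding config, not shown in this diff:

# Sketch of the validator's effect; prompt_tokens/output_tokens are assumed fields.
from guidellm.data.deserializers.synthetic import SyntheticTextDatasetConfig

config = SyntheticTextDatasetConfig(
    prompt_tokens=256,   # assumed existing field
    output_tokens=128,   # assumed existing field
    prefix_count=2,      # extra key, captured by __pydantic_extra__
    prefix_tokens=32,    # extra key, captured by __pydantic_extra__
)
# check_prefix_options promoted the extras into a single bucket:
assert config.prefix_buckets is not None
assert config.prefix_buckets[0].prefix_count == 2
assert config.prefix_buckets[0].prefix_tokens == 32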
src/guidellm/data/loaders.py
@@ -17,6 +17,7 @@ from guidellm.logger import logger
 __all__ = ["DataLoader", "DatasetsIterator"]


+
 class DatasetsIterator(TorchIterableDataset):
     def __init__(
         self,
@@ -85,7 +86,7 @@ class DatasetsIterator(TorchIterableDataset):

         while max_items is None or gen_count < max_items:
             try:
-                row = {
+                row: dict[str, Any] = {
                     "items": [next(dataset_iter) for dataset_iter in dataset_iters]
                 }
                 gen_count += 1
@@ -98,9 +99,12 @@ class DatasetsIterator(TorchIterableDataset):
                     continue

                 for preprocessor in self.preprocessors:
-                    row = preprocessor(row)
+                    # This can assign a GenerationRequest, which would then be
+                    # passed into the preprocessor, which is a type violation.
+                    # This should be fixed at some point.
+                    row = preprocessor(row)  # type: ignore[assignment]
                 yield row
-            except Exception as err:
+            except Exception as err:  # noqa: BLE001  # Exception logged
                 logger.error(f"Skipping data row due to error: {err}")
                 gen_count -= 1
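The new `# type: ignore[assignment]` flags a real looseness the comment describes: preprocessors are chained, and a formatter at the end of the chain returns a `GenerationRequest` rather than a row dict. A sketch of the chaining the iterator performs, with hypothetical stand-in preprocessors:

# Sketch of preprocessor chaining in DatasetsIterator; both steps are hypothetical.
from typing import Any, Callable

def add_prefix(row: dict[str, Any]) -> dict[str, Any]:
    row["prefix_column"] = ["You are a helpful assistant. "]
    return row

def to_request(row: dict[str, Any]) -> str:  # stands in for a RequestFormatter
    return "".join(row.get("prefix_column", [])) + "".join(row.get("text_column", []))

preprocessors: list[Callable[[Any], Any]] = [add_prefix, to_request]
row: Any = {"text_column": ["Hello"]}
for preprocessor in preprocessors:
    row = preprocessor(row)  # final step changes the type, as the comment notes
print(row)  # "You are a helpful assistant. Hello"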
src/guidellm/data/preprocessors/formatters.py
@@ -7,8 +7,6 @@ from guidellm.data.preprocessors.preprocessor import (
     DatasetPreprocessor,
     PreprocessorRegistry,
 )
-from guidellm.data.schemas import GenerativeDatasetColumnType
-from guidellm.data.utils import text_stats
 from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics

 __all__ = [
@@ -59,9 +57,13 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         self.max_tokens: int | None = max_tokens or max_completion_tokens

     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments: GenerationRequestArguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments: GenerationRequestArguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works better setting this field here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -99,10 +101,9 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)

         return GenerationRequest(
             request_type="text_completions",
@@ -142,9 +143,13 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
         )

     def __call__(  # noqa: C901, PLR0912, PLR0915
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -191,27 +196,14 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
             if not prefix:
                 continue

-            stats = text_stats(prefix)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
-
+            input_metrics.add_text_metrics(prefix)
             arguments.body["messages"].append({"role": "system", "content": prefix})

         for text in columns.get("text_column", []):
             if not text:
                 continue

-            stats = text_stats(text)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+            input_metrics.add_text_metrics(text)

             arguments.body["messages"].append(
                 {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -329,9 +321,10 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         self.encode_audio_kwargs = encode_kwargs or {}

     def __call__(  # noqa: C901
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={}, files={})
+        arguments = GenerationRequestArguments(files={})
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()

@@ -387,10 +380,9 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)

         return GenerationRequest(
             request_type="audio_transcriptions",
@@ -405,7 +397,7 @@ class GenerativeAudioTranslationRequestFormatter(
     GenerativeAudioTranscriptionRequestFormatter
 ):
     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
         result = super().__call__(columns)
         result.request_type = "audio_translations"
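Every formatter now delegates character and word counting to `UsageMetrics.add_text_metrics`, replacing the repeated `text_stats` bookkeeping. That method lives in the guidellm schemas and is not part of this diff; a plausible sketch of what the replaced code implies it does:

# Hypothetical sketch of UsageMetrics.add_text_metrics, inferred from the
# text_stats bookkeeping it replaces; the real method is not in this diff.
from pydantic import BaseModel


class UsageMetrics(BaseModel):
    text_characters: int | None = None
    text_words: int | None = None

    def add_text_metrics(self, text: str) -> None:
        """Accumulate character and word counts for a piece of input text."""
        self.text_characters = (self.text_characters or 0) + len(text)
        self.text_words = (self.text_words or 0) + len(text.split())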
src/guidellm/data/preprocessors/mappers.py
@@ -169,12 +169,12 @@ class GenerativeColumnMapper(DataDependentPreprocessor):

     def __call__(
         self, row: dict[str, Any]
-    ) -> dict[GenerativeDatasetColumnType, list[Any]]:
+    ) -> dict[str, list[Any]]:
         if self.datasets_column_mappings is None:
             raise ValueError("DefaultGenerativeColumnMapper not setup with data.")

         items = cast("dict[int, dict[str, Any]]", row.pop("items"))
-        mapped: dict[GenerativeDatasetColumnType, list[Any]] = defaultdict(list)
+        mapped: dict[str, Any] = defaultdict(list)

         for column_type, column_mappings in self.datasets_column_mappings.items():
             for (