PyPI - guidellm - Versions diffs - 0.4.0a155__py3-none-any.whl → 0.4.0a173__py3-none-any.whl - Mend

guidellm 0.4.0a155__py3-none-any.whl → 0.4.0a173__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of guidellm might be problematic. Click here for more details.

Files changed (32) hide show

guidellm/__main__.py +4 -3
guidellm/benchmark/benchmarker.py +2 -0
guidellm/benchmark/entrypoints.py +1 -0
guidellm/benchmark/output.py +3 -1
guidellm/benchmark/schemas.py +2 -1
guidellm/data/deserializers/deserializer.py +79 -44
guidellm/data/deserializers/file.py +14 -14
guidellm/data/deserializers/huggingface.py +1 -1
guidellm/data/deserializers/memory.py +20 -18
guidellm/data/deserializers/synthetic.py +18 -16
guidellm/data/loaders.py +7 -3
guidellm/data/preprocessors/formatters.py +24 -32
guidellm/data/preprocessors/mappers.py +2 -2
guidellm/data/preprocessors/preprocessor.py +5 -3
guidellm/data/processor.py +3 -2
guidellm/data/utils/__init__.py +0 -4
guidellm/data/utils/dataset.py +2 -2
guidellm/scheduler/constraints.py +1 -3
guidellm/scheduler/environments.py +2 -2
guidellm/scheduler/scheduler.py +1 -1
guidellm/scheduler/strategies.py +31 -4
guidellm/scheduler/worker.py +56 -30
guidellm/scheduler/worker_group.py +33 -31
guidellm/schemas/request.py +10 -0
guidellm/utils/cli.py +26 -1
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/METADATA +1 -1
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/RECORD +31 -32
guidellm/data/utils/functions.py +0 -18
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/WHEEL +0 -0
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/entry_points.txt +0 -0
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/licenses/LICENSE +0 -0
{guidellm-0.4.0a155.dist-info → guidellm-0.4.0a173.dist-info}/top_level.txt +0 -0

guidellm/__main__.py CHANGED Viewed

@@ -156,8 +156,9 @@ def benchmark():
 )
 @click.option(
     "--rate",
-    type=float,
-    multiple=True,
+    type=str,
+    callback=cli_tools.parse_list_floats,
+    multiple=False,
     default=BenchmarkGenerativeTextArgs.get_default("rate"),
     help=(
         "Benchmark rate(s) to test. Meaning depends on profile: "
@@ -383,7 +384,7 @@ def run(**kwargs):
         kwargs.get("data_args"), default=[], simplify_single=False
     )
     kwargs["rate"] = cli_tools.format_list_arg(
-        kwargs.get("rate"), default=None, simplify_single=True
+        kwargs.get("rate"), default=None, simplify_single=False
     )
     disable_console_outputs = kwargs.pop("disable_console_outputs", False)

guidellm/benchmark/benchmarker.py CHANGED Viewed

@@ -57,6 +57,7 @@ class Benchmarker(
         backend: BackendInterface[RequestT, ResponseT],
         profile: Profile,
         environment: Environment,
+        data: list[Any],
         progress: BenchmarkerProgress[BenchmarkT] | None = None,
         sample_requests: int | None = 20,
         warmup: float | None = None,
@@ -149,6 +150,7 @@ class Benchmarker(
                     environment=environment,
                     strategy=strategy,
                     constraints=constraints,
+                    data=data,
                 )
                 if progress:
                     await progress.on_benchmark_complete(benchmark)

guidellm/benchmark/entrypoints.py CHANGED Viewed

@@ -436,6 +436,7 @@ async def benchmark_generative_text(
         backend=backend,
         profile=profile,
         environment=NonDistributedEnvironment(),
+        data=args.data,
         progress=progress,
         sample_requests=args.sample_requests,
         warmup=args.warmup,

guidellm/benchmark/output.py CHANGED Viewed

@@ -649,6 +649,8 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         status_dist_summary: StatusDistributionSummary = getattr(
             benchmark.metrics, metric
         )
+        if not hasattr(status_dist_summary, status):
+            return [], []
         dist_summary: DistributionSummary = getattr(status_dist_summary, status)
         headers = [
@@ -688,7 +690,7 @@ class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput):
         values: list[str] = [
             benchmark.benchmarker.profile.model_dump_json(),
             json.dumps(benchmark.benchmarker.backend),
-            json.dumps(benchmark.benchmarker.requests["attributes"]["data"]),
+            json.dumps(benchmark.benchmarker.requests["data"]),
         ]
         if len(headers) != len(values):

guidellm/benchmark/schemas.py CHANGED Viewed

@@ -1674,6 +1674,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
         environment: Environment,
         strategy: SchedulingStrategy,
         constraints: dict[str, dict[str, Any]],
+        data: list[Any],
     ) -> GenerativeBenchmark:
         """
         Compile final generative benchmark from accumulated state.
@@ -1702,7 +1703,7 @@ class GenerativeBenchmark(Benchmark, StandardBaseDict):
             ),
             benchmarker=BenchmarkerDict(
                 profile=profile,
-                requests=InfoMixin.extract_from_obj(requests),
+                requests={"data": data},
                 backend=backend.info,
                 environment=environment.info,
             ),

guidellm/data/deserializers/deserializer.py CHANGED Viewed

@@ -1,10 +1,9 @@
 from __future__ import annotations
-import contextlib
 from collections.abc import Callable
 from typing import Any, Protocol, Union, runtime_checkable
-from datasets import Dataset, IterableDataset
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from transformers import PreTrainedTokenizerBase
 from guidellm.data.utils import resolve_dataset_split
@@ -29,7 +28,7 @@ class DatasetDeserializer(Protocol):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]: ...
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
 class DatasetDeserializerFactory(
@@ -47,51 +46,16 @@ class DatasetDeserializerFactory(
         remove_columns: list[str] | None = None,
         **data_kwargs: dict[str, Any],
     ) -> Dataset | IterableDataset:
-        dataset = None
+        dataset: Dataset
         if type_ is None:
-            errors = []
-            # Note: There is no priority order for the deserializers, so all deserializers
-            #  must be mutually exclusive to ensure deterministic behavior.
-            for name, deserializer in cls.registry.items():
-                deserializer_fn: DatasetDeserializer = (
-                    deserializer() if isinstance(deserializer, type) else deserializer
-                )
-                try:
-                    with contextlib.suppress(DataNotSupportedError):
-                        dataset = deserializer_fn(
-                            data=data,
-                            processor_factory=processor_factory,
-                            random_seed=random_seed,
-                            **data_kwargs,
-                        )
-                except Exception as e:
-                    errors.append(e)
-                if dataset is not None:
-                    break # Found one that works. Continuing could overwrite it.
-            if dataset is None and len(errors) > 0:
-                raise DataNotSupportedError(f"data deserialization failed; {len(errors)} errors occurred while "
-                                            f"attempting to deserialize data {data}: {errors}")
-        elif deserializer := cls.get_registered_object(type_) is not None:
-            deserializer_fn: DatasetDeserializer = (
-                deserializer() if isinstance(deserializer, type) else deserializer
+            dataset = cls._deserialize_with_registered_deserializers(
+                data, processor_factory, random_seed, **data_kwargs
             )
-            dataset = deserializer_fn(
-                data=data,
-                processor_factory=processor_factory,
-                random_seed=random_seed,
-                **data_kwargs,
-            )
-        if dataset is None:
-            raise DataNotSupportedError(
-                f"No suitable deserializer found for data {data} "
-                f"with kwargs {data_kwargs} and deserializer type {type_}."
+        else:
+            dataset = cls._deserialize_with_specified_deserializer(
+                data, type_, processor_factory, random_seed, **data_kwargs
             )
         if resolve_split:
@@ -107,3 +71,74 @@ class DatasetDeserializerFactory(
             dataset = dataset.remove_columns(remove_columns)
         return dataset
+    @classmethod
+    def _deserialize_with_registered_deserializers(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        if cls.registry is None:
+            raise RuntimeError("registry is None; cannot deserialize dataset")
+        dataset: Dataset | None = None
+        errors: dict[str, Exception] = {}
+        # Note: There is no priority order for the deserializers, so all deserializers
+        #  must be mutually exclusive to ensure deterministic behavior.
+        for _name, deserializer in cls.registry.items():
+            deserializer_fn: DatasetDeserializer = (
+                deserializer() if isinstance(deserializer, type) else deserializer
+            )
+            try:
+                dataset = deserializer_fn(
+                    data=data,
+                    processor_factory=processor_factory,
+                    random_seed=random_seed,
+                    **data_kwargs,
+                )
+            except Exception as e:  # noqa: BLE001 # The exceptions are saved.
+                errors[_name] = e
+            if dataset is not None:
+                return dataset  # Success
+        if len(errors) > 0:
+            err_msgs = ""
+            def sort_key(item):
+                return (isinstance(item[1], DataNotSupportedError), item[0])
+            for key, err in sorted(errors.items(), key=sort_key):
+                err_msgs += f"\n  - Deserializer '{key}': ({type(err).__name__}) {err}"
+            raise ValueError(
+                "Data deserialization failed, likely because the input doesn't "
+                f"match any of the input formats. See the {len(errors)} error(s) that "
+                f"occurred while attempting to deserialize the data {data}:{err_msgs}"
+            )
+        return dataset
+    @classmethod
+    def _deserialize_with_specified_deserializer(
+        cls,
+        data: Any,
+        type_: str,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        deserializer_from_type = cls.get_registered_object(type_)
+        if deserializer_from_type is None:
+            raise ValueError(f"Deserializer type '{type_}' is not registered.")
+        if isinstance(deserializer_from_type, type):
+            deserializer_fn = deserializer_from_type()
+        else:
+            deserializer_fn = deserializer_from_type
+        return deserializer_fn(
+            data=data,
+            processor_factory=processor_factory,
+            random_seed=random_seed,
+            **data_kwargs,
+        )

guidellm/data/deserializers/file.py CHANGED Viewed

@@ -34,11 +34,11 @@ class TextFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".txt", ".text"}
@@ -62,10 +62,10 @@ class CSVFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".csv"
@@ -86,10 +86,10 @@ class JSONFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".json", ".jsonl"}
@@ -110,10 +110,10 @@ class ParquetFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".parquet"
@@ -134,10 +134,10 @@ class ArrowFileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".arrow"
@@ -158,10 +158,10 @@ class HDF5FileDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".hdf5", ".h5"}
@@ -185,7 +185,7 @@ class DBFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".db"
@@ -209,7 +209,7 @@ class TarFileDatasetDeserializer(DatasetDeserializer):
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".tar"

guidellm/data/deserializers/huggingface.py CHANGED Viewed

@@ -36,7 +36,7 @@ class HuggingFaceDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
         _ = (processor_factory, random_seed)
         if isinstance(

guidellm/data/deserializers/memory.py CHANGED Viewed

@@ -33,7 +33,7 @@ class InMemoryDictDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
         if (
@@ -67,7 +67,7 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
         if (
@@ -81,9 +81,9 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 f"expected list of dicts, got {data}"
             )
-        data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
-        first_keys = set(data[0].keys())
-        for index, item in enumerate(data):
+        typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+        first_keys = set(typed_data[0].keys())
+        for index, item in enumerate(typed_data):
             if set(item.keys()) != first_keys:
                 raise DataNotSupportedError(
                     f"All dictionaries must have the same keys. "
@@ -92,8 +92,8 @@ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
                 )
         # Convert list of dicts to dict of lists
-        result_dict = {key: [] for key in first_keys}
-        for item in data:
+        result_dict: dict = {key: [] for key in first_keys}
+        for item in typed_data:
             for key, value in item.items():
                 result_dict[key].append(value)
@@ -108,7 +108,7 @@ class InMemoryItemListDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
         primitive_types = (str, int, float, bool, type(None))
@@ -135,7 +135,7 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (json_str := data.strip())
@@ -145,16 +145,18 @@ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
             )
         ):
             with contextlib.suppress(Exception):
-                parsed = json.loads(data)
+                parsed_data = json.loads(data)
-            for deserializer in [
-                InMemoryDictDatasetDeserializer,
-                InMemoryDictListDatasetDeserializer,
-                InMemoryItemListDatasetDeserializer,
-            ]:
+            deserializers = [
+                InMemoryDictDatasetDeserializer(),
+                InMemoryDictListDatasetDeserializer(),
+                InMemoryItemListDatasetDeserializer(),
+            ]
+            for deserializer in deserializers:
                 with contextlib.suppress(DataNotSupportedError):
-                    return deserializer()(
-                        parsed, data_kwargs, processor_factory, random_seed
+                    return deserializer(
+                        parsed_data, processor_factory, random_seed, **data_kwargs
                     )
         raise DataNotSupportedError(
@@ -171,7 +173,7 @@ class InMemoryCsvDatasetDeserializer(DatasetDeserializer):
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (csv_str := data.strip())

guidellm/data/deserializers/synthetic.py CHANGED Viewed

@@ -99,21 +99,23 @@ class SyntheticTextDatasetConfig(StandardBaseModel):
     @model_validator(mode="after")
     def check_prefix_options(self) -> SyntheticTextDatasetConfig:
-        prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
-        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
-        if prefix_count is not None or prefix_tokens is not None:
-            if self.prefix_buckets:
-                raise ValueError(
-                    "prefix_buckets is mutually exclusive"
-                    " with prefix_count and prefix_tokens"
-                )
+        if self.__pydantic_extra__ is not None:
+            prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+            prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
+            if prefix_count is not None or prefix_tokens is not None:
+                if self.prefix_buckets:
+                    raise ValueError(
+                        "prefix_buckets is mutually exclusive"
+                        " with prefix_count and prefix_tokens"
+                    )
-            self.prefix_buckets = [
-                SyntheticTextPrefixBucketConfig(
-                    prefix_count=prefix_count or 1,
-                    prefix_tokens=prefix_tokens or 0,
-                )
-            ]
+                self.prefix_buckets = [
+                    SyntheticTextPrefixBucketConfig(
+                        prefix_count=prefix_count or 1,
+                        prefix_tokens=prefix_tokens or 0,
+                    )
+                ]
         return self
@@ -174,14 +176,14 @@ class SyntheticTextGenerator:
     def _create_prompt(
         self, prompt_tokens_count: int, faker: Faker, unique: str = ""
     ) -> str:
-        prompt_token_ids = []
+        prompt_token_ids: list[int] = []
         avg_chars_per_token = 5
         margin_of_safety = 1.5
         attempts = 0
         while len(prompt_token_ids) < prompt_tokens_count:
             attempts += 1
-            num_chars = (
+            num_chars = int(
                 prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
             )
             text = unique + faker.text(max_nb_chars=num_chars)

guidellm/data/loaders.py CHANGED Viewed

@@ -17,6 +17,7 @@ from guidellm.logger import logger
 __all__ = ["DataLoader", "DatasetsIterator"]
 class DatasetsIterator(TorchIterableDataset):
     def __init__(
         self,
@@ -85,7 +86,7 @@ class DatasetsIterator(TorchIterableDataset):
             while max_items is None or gen_count < max_items:
                 try:
-                    row = {
+                    row: dict[str, Any] = {
                         "items": [next(dataset_iter) for dataset_iter in dataset_iters]
                     }
                     gen_count += 1
@@ -98,9 +99,12 @@ class DatasetsIterator(TorchIterableDataset):
                         continue
                     for preprocessor in self.preprocessors:
-                        row = preprocessor(row)
+                        # This can assign a GenerationRequest, which would then be
+                        # passed into the preprocessor, which is a type violation.
+                        # This should be fixed at some point.
+                        row = preprocessor(row)  # type: ignore[assignment]
                     yield row
-                except Exception as err:
+                except Exception as err:  # noqa: BLE001 # Exception logged
                     logger.error(f"Skipping data row due to error: {err}")
                     gen_count -= 1

guidellm/data/preprocessors/formatters.py CHANGED Viewed

@@ -7,8 +7,6 @@ from guidellm.data.preprocessors.preprocessor import (
     DatasetPreprocessor,
     PreprocessorRegistry,
 )
-from guidellm.data.schemas import GenerativeDatasetColumnType
-from guidellm.data.utils import text_stats
 from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
 __all__ = [
@@ -59,9 +57,13 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         self.max_tokens: int | None = max_tokens or max_completion_tokens
     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments: GenerationRequestArguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments: GenerationRequestArguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works better setting this field here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()
@@ -99,10 +101,9 @@ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)
         return GenerationRequest(
             request_type="text_completions",
@@ -142,9 +143,13 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
         )
     def __call__(  # noqa: C901, PLR0912, PLR0915
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={})
+        """
+        :param columns: A dict of GenerativeDatasetColumnType to Any
+        """
+        arguments = GenerationRequestArguments()
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()
@@ -191,27 +196,14 @@ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
             if not prefix:
                 continue
-            stats = text_stats(prefix)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+            input_metrics.add_text_metrics(prefix)
             arguments.body["messages"].append({"role": "system", "content": prefix})
         for text in columns.get("text_column", []):
             if not text:
                 continue
-            stats = text_stats(text)
-            if (num_chars := stats.get("num_chars")) is not None:
-                input_metrics.text_characters = (
-                    input_metrics.text_characters or 0
-                ) + num_chars
-            if (num_words := stats.get("num_words")) is not None:
-                input_metrics.text_words = (input_metrics.text_words or 0) + num_words
+            input_metrics.add_text_metrics(text)
             arguments.body["messages"].append(
                 {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -329,9 +321,10 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         self.encode_audio_kwargs = encode_kwargs or {}
     def __call__(  # noqa: C901
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
-        arguments = GenerationRequestArguments(body={}, files={})
+        arguments = GenerationRequestArguments(files={})
+        arguments.body = {}  # The type checker works best with body assigned here
         input_metrics = UsageMetrics()
         output_metrics = UsageMetrics()
@@ -387,10 +380,9 @@ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
         text = "".join(txt for txt in columns.get("text_column", []) if txt)
         if prefix or text:
-            arguments.body["prompt"] = prefix + text
-            stats = text_stats(arguments.body["prompt"])
-            input_metrics.text_characters = stats.get("num_chars")
-            input_metrics.text_words = stats.get("num_words")
+            prompt = prefix + text
+            arguments.body["prompt"] = prompt
+            input_metrics.add_text_metrics(prompt)
         return GenerationRequest(
             request_type="audio_transcriptions",
@@ -405,7 +397,7 @@ class GenerativeAudioTranslationRequestFormatter(
     GenerativeAudioTranscriptionRequestFormatter
 ):
     def __call__(
-        self, columns: dict[GenerativeDatasetColumnType, list[Any]]
+        self, columns: dict[str, list[Any]]
     ) -> GenerationRequest:
         result = super().__call__(columns)
         result.request_type = "audio_translations"

guidellm/data/preprocessors/mappers.py CHANGED Viewed

@@ -169,12 +169,12 @@ class GenerativeColumnMapper(DataDependentPreprocessor):
     def __call__(
         self, row: dict[str, Any]
-    ) -> dict[GenerativeDatasetColumnType, list[Any]]:
+    ) -> dict[str, list[Any]]:
         if self.datasets_column_mappings is None:
             raise ValueError("DefaultGenerativeColumnMapper not setup with data.")
         items = cast("dict[int, dict[str, Any]]", row.pop("items"))
-        mapped: dict[GenerativeDatasetColumnType, list[Any]] = defaultdict(list)
+        mapped: dict[str, Any] = defaultdict(list)
         for column_type, column_mappings in self.datasets_column_mappings.items():
             for (

guidellm 0.4.0a155__py3-none-any.whl → 0.4.0a173__py3-none-any.whl

Potentially problematic release.

guidellm 0.4.0a155__py3-none-any.whl → 0.4.0a173__py3-none-any.whl