guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/data/deserializers/__init__.py
@@ -0,0 +1,49 @@
+ from .deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+ from .file import (
+     ArrowFileDatasetDeserializer,
+     CSVFileDatasetDeserializer,
+     DBFileDatasetDeserializer,
+     HDF5FileDatasetDeserializer,
+     JSONFileDatasetDeserializer,
+     ParquetFileDatasetDeserializer,
+     TarFileDatasetDeserializer,
+     TextFileDatasetDeserializer,
+ )
+ from .huggingface import HuggingFaceDatasetDeserializer
+ from .memory import (
+     InMemoryCsvDatasetDeserializer,
+     InMemoryDictDatasetDeserializer,
+     InMemoryDictListDatasetDeserializer,
+     InMemoryItemListDatasetDeserializer,
+     InMemoryJsonStrDatasetDeserializer,
+ )
+ from .synthetic import (
+     SyntheticTextDataset,
+     SyntheticTextDatasetDeserializer,
+ )
+
+ __all__ = [
+     "ArrowFileDatasetDeserializer",
+     "CSVFileDatasetDeserializer",
+     "DBFileDatasetDeserializer",
+     "DataNotSupportedError",
+     "DatasetDeserializer",
+     "DatasetDeserializerFactory",
+     "HDF5FileDatasetDeserializer",
+     "HuggingFaceDatasetDeserializer",
+     "InMemoryCsvDatasetDeserializer",
+     "InMemoryDictDatasetDeserializer",
+     "InMemoryDictListDatasetDeserializer",
+     "InMemoryItemListDatasetDeserializer",
+     "InMemoryJsonStrDatasetDeserializer",
+     "JSONFileDatasetDeserializer",
+     "ParquetFileDatasetDeserializer",
+     "SyntheticTextDataset",
+     "SyntheticTextDatasetDeserializer",
+     "TarFileDatasetDeserializer",
+     "TextFileDatasetDeserializer",
+ ]
guidellm/data/deserializers/deserializer.py
@@ -0,0 +1,141 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from typing import Any, Protocol, Union, runtime_checkable
+
+ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.schemas import DataNotSupportedError
+ from guidellm.data.utils import resolve_dataset_split
+ from guidellm.utils import RegistryMixin
+
+ __all__ = [
+     "DatasetDeserializer",
+     "DatasetDeserializerFactory",
+ ]
+
+
+ @runtime_checkable
+ class DatasetDeserializer(Protocol):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
+
+
+ class DatasetDeserializerFactory(
+     RegistryMixin[Union["type[DatasetDeserializer]", DatasetDeserializer]],
+ ):
+     @classmethod
+     def deserialize(
+         cls,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         type_: str | None = None,
+         resolve_split: bool = True,
+         select_columns: list[str] | None = None,
+         remove_columns: list[str] | None = None,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset:
+         dataset: Dataset
+
+         if type_ is None:
+             dataset = cls._deserialize_with_registered_deserializers(
+                 data, processor_factory, random_seed, **data_kwargs
+             )
+
+         else:
+             dataset = cls._deserialize_with_specified_deserializer(
+                 data, type_, processor_factory, random_seed, **data_kwargs
+             )
+
+         if resolve_split:
+             dataset = resolve_dataset_split(dataset)
+
+         if select_columns is not None or remove_columns is not None:
+             column_names = dataset.column_names or list(next(iter(dataset)).keys())
+             if select_columns is not None:
+                 remove_columns = [
+                     col for col in column_names if col not in select_columns
+                 ]
+
+             dataset = dataset.remove_columns(remove_columns)
+
+         return dataset
+
+     @classmethod
+     def _deserialize_with_registered_deserializers(
+         cls,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if cls.registry is None:
+             raise RuntimeError("registry is None; cannot deserialize dataset")
+         dataset: Dataset | None = None
+
+         errors: dict[str, Exception] = {}
+         # Note: There is no priority order for the deserializers, so all deserializers
+         # must be mutually exclusive to ensure deterministic behavior.
+         for _name, deserializer in cls.registry.items():
+             deserializer_fn: DatasetDeserializer = (
+                 deserializer() if isinstance(deserializer, type) else deserializer
+             )
+
+             try:
+                 dataset = deserializer_fn(
+                     data=data,
+                     processor_factory=processor_factory,
+                     random_seed=random_seed,
+                     **data_kwargs,
+                 )
+             except Exception as e:  # noqa: BLE001 # The exceptions are saved.
+                 errors[_name] = e
+
+         if dataset is not None:
+             return dataset  # Success
+
+         if len(errors) > 0:
+             err_msgs = ""
+
+             def sort_key(item):
+                 return (isinstance(item[1], DataNotSupportedError), item[0])
+
+             for key, err in sorted(errors.items(), key=sort_key):
+                 err_msgs += f"\n - Deserializer '{key}': ({type(err).__name__}) {err}"
+             raise ValueError(
+                 "Data deserialization failed, likely because the input doesn't "
+                 f"match any of the input formats. See the {len(errors)} error(s) that "
+                 f"occurred while attempting to deserialize the data {data}:{err_msgs}"
+             )
+         return dataset
+
+     @classmethod
+     def _deserialize_with_specified_deserializer(
+         cls,
+         data: Any,
+         type_: str,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int = 42,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         deserializer_from_type = cls.get_registered_object(type_)
+         if deserializer_from_type is None:
+             raise ValueError(f"Deserializer type '{type_}' is not registered.")
+         if isinstance(deserializer_from_type, type):
+             deserializer_fn = deserializer_from_type()
+         else:
+             deserializer_fn = deserializer_from_type
+
+         return deserializer_fn(
+             data=data,
+             processor_factory=processor_factory,
+             random_seed=random_seed,
+             **data_kwargs,
+         )
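
For orientation, here is a minimal sketch of how the factory above is used. The "lines_file" name, the ".lines" suffix, and the sample path are illustrative and not part of the package; the register decorator and the deserialize signature come from the diff above.

from pathlib import Path

from datasets import Dataset

from guidellm.data.deserializers import (
    DataNotSupportedError,
    DatasetDeserializerFactory,
)


@DatasetDeserializerFactory.register("lines_file")  # hypothetical format name
class LinesFileDatasetDeserializer:
    def __call__(self, data, processor_factory, random_seed, **data_kwargs):
        # Raising DataNotSupportedError lets the factory fall through to the
        # other registered deserializers when probing without an explicit type_.
        if not isinstance(data, (str, Path)) or not str(data).endswith(".lines"):
            raise DataNotSupportedError(f"expected a .lines file, got {data}")
        return Dataset.from_dict({"text": Path(data).read_text().splitlines()})


# With type_ set, only the named deserializer runs; without it, every
# registered deserializer is tried (they are expected to be mutually exclusive).
dataset = DatasetDeserializerFactory.deserialize(
    "samples.lines",                 # hypothetical local file
    processor_factory=lambda: None,  # tokenizer is unused by this deserializer
    type_="lines_file",
)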
guidellm/data/deserializers/file.py
@@ -0,0 +1,223 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+ from datasets import Dataset, load_dataset
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = [
+     "ArrowFileDatasetDeserializer",
+     "CSVFileDatasetDeserializer",
+     "DBFileDatasetDeserializer",
+     "HDF5FileDatasetDeserializer",
+     "JSONFileDatasetDeserializer",
+     "ParquetFileDatasetDeserializer",
+     "TarFileDatasetDeserializer",
+     "TextFileDatasetDeserializer",
+ ]
+
+
+ @DatasetDeserializerFactory.register("text_file")
+ class TextFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".txt", ".text"}
+         ):
+             raise DataNotSupportedError(
+                 "Unsupported data for TextFileDatasetDeserializer, "
+                 f"expected str or Path to a local .txt or .text file, got {data}"
+             )
+
+         with path.open() as file:
+             lines = file.readlines()
+
+         return Dataset.from_dict({"text": lines}, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("csv_file")
+ class CSVFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".csv"
+         ):
+             raise DataNotSupportedError(
+                 "Unsupported data for CSVFileDatasetDeserializer, "
+                 f"expected str or Path to a valid local .csv file, got {data}"
+             )
+
+         return load_dataset("csv", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("json_file")
+ class JSONFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".json", ".jsonl"}
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for JSONFileDatasetDeserializer, "
+                 f"expected str or Path to a local .json or .jsonl file, got {data}"
+             )
+
+         return load_dataset("json", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("parquet_file")
+ class ParquetFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".parquet"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for ParquetFileDatasetDeserializer, "
+                 f"expected str or Path to a local .parquet file, got {data}"
+             )
+
+         return load_dataset("parquet", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("arrow_file")
+ class ArrowFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".arrow"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for ArrowFileDatasetDeserializer, "
+                 f"expected str or Path to a local .arrow file, got {data}"
+             )
+
+         return load_dataset("arrow", data_files=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("hdf5_file")
+ class HDF5FileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() not in {".hdf5", ".h5"}
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for HDF5FileDatasetDeserializer, "
+                 f"expected str or Path to a local .hdf5 or .h5 file, got {data}"
+             )
+
+         return Dataset.from_pandas(pd.read_hdf(str(path)), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("db_file")
+ class DBFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> dict[str, list]:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".db"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for DBFileDatasetDeserializer, "
+                 f"expected str or Path to a local .db file, got {data}"
+             )
+
+         return Dataset.from_sql(con=str(path), **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("tar_file")
+ class TarFileDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> dict[str, list]:
+         _ = (processor_factory, random_seed)
+         if (
+             not isinstance(data, str | Path)
+             or not (path := Path(data)).exists()
+             or not path.is_file()
+             or path.suffix.lower() != ".tar"
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for TarFileDatasetDeserializer, "
+                 f"expected str or Path to a local .tar file, got {data}"
+             )
+
+         return load_dataset("webdataset", data_files=str(path), **data_kwargs)
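
Each file deserializer keys off the path suffix, so the factory's probing resolves local files without an explicit type_. A minimal usage sketch, assuming a hypothetical prompts.csv on disk:

from guidellm.data.deserializers import DatasetDeserializerFactory

# Probing: every registered deserializer is tried; only csv_file accepts a
# .csv path, the rest raise DataNotSupportedError.
dataset = DatasetDeserializerFactory.deserialize(
    "prompts.csv",                   # hypothetical local file
    processor_factory=lambda: None,  # unused by the file deserializers
)

# Equivalent, skipping the probe entirely:
dataset = DatasetDeserializerFactory.deserialize(
    "prompts.csv",
    processor_factory=lambda: None,
    type_="csv_file",
)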
guidellm/data/deserializers/huggingface.py
@@ -0,0 +1,94 @@
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Any
+
+ from datasets import (
+     Dataset,
+     DatasetDict,
+     IterableDataset,
+     IterableDatasetDict,
+     load_dataset,
+     load_from_disk,
+ )
+ from datasets.exceptions import (
+     DataFilesNotFoundError,
+     DatasetNotFoundError,
+     FileNotFoundDatasetsError,
+ )
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = ["HuggingFaceDatasetDeserializer"]
+
+
+ @DatasetDeserializerFactory.register("huggingface")
+ class HuggingFaceDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
+         _ = (processor_factory, random_seed)
+
+         if isinstance(
+             data, Dataset | IterableDataset | DatasetDict | IterableDatasetDict
+         ):
+             return data
+
+         load_error = None
+
+         if (
+             isinstance(data, str | Path)
+             and (path := Path(data)).exists()
+             and ((path.is_file() and path.suffix == ".py") or path.is_dir())
+         ):
+             # Handle python script or nested python script in a directory
+             try:
+                 return load_dataset(str(data), **data_kwargs)
+             except (
+                 FileNotFoundDatasetsError,
+                 DatasetNotFoundError,
+                 DataFilesNotFoundError,
+             ) as err:
+                 load_error = err
+             except Exception:  # noqa: BLE001
+                 # Try loading as a local dataset directory next
+                 try:
+                     return load_from_disk(str(data), **data_kwargs)
+                 except (
+                     FileNotFoundDatasetsError,
+                     DatasetNotFoundError,
+                     DataFilesNotFoundError,
+                 ) as err2:
+                     load_error = err2
+
+         try:
+             # Handle dataset identifier from the Hugging Face Hub
+             return load_dataset(str(data), **data_kwargs)
+         except (
+             FileNotFoundDatasetsError,
+             DatasetNotFoundError,
+             DataFilesNotFoundError,
+         ) as err:
+             load_error = err
+
+         not_supported = DataNotSupportedError(
+             "Unsupported data for HuggingFaceDatasetDeserializer, "
+             "expected Dataset, IterableDataset, DatasetDict, IterableDatasetDict, "
+             "str or Path to a local dataset directory or a local .py dataset script, "
+             f"got {data} and HF load error: {load_error}"
+         )
+
+         if load_error is not None:
+             raise not_supported from load_error
+         else:
+             raise not_supported
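
The resolution order above is: pass through objects that are already datasets, then try a local .py script or dataset directory (load_dataset with a load_from_disk fallback), then treat the input as a Hub identifier. A sketch under those assumptions; the dataset id is illustrative:

from guidellm.data.deserializers import HuggingFaceDatasetDeserializer

hf = HuggingFaceDatasetDeserializer()

# Already-loaded datasets pass straight through; strings and paths fall through
# the local-script, local-directory, and Hub-identifier attempts in that order.
dataset = hf(
    "username/my-dataset",           # illustrative Hub identifier
    processor_factory=lambda: None,  # unused by this deserializer
    random_seed=42,
)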
guidellm/data/deserializers/memory.py
@@ -0,0 +1,194 @@
+ from __future__ import annotations
+
+ import contextlib
+ import csv
+ import json
+ from collections.abc import Callable
+ from io import StringIO
+ from typing import Any, cast
+
+ from datasets import Dataset
+ from transformers import PreTrainedTokenizerBase
+
+ from guidellm.data.deserializers.deserializer import (
+     DataNotSupportedError,
+     DatasetDeserializer,
+     DatasetDeserializerFactory,
+ )
+
+ __all__ = [
+     "InMemoryCsvDatasetDeserializer",
+     "InMemoryDictDatasetDeserializer",
+     "InMemoryDictListDatasetDeserializer",
+     "InMemoryItemListDatasetDeserializer",
+     "InMemoryJsonStrDatasetDeserializer",
+ ]
+
+
+ @DatasetDeserializerFactory.register("in_memory_dict")
+ class InMemoryDictDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not data
+             or not isinstance(data, dict)
+             or not all(
+                 isinstance(key, str) and isinstance(val, list)
+                 for key, val in data.items()
+             )
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryDictDatasetDeserializer, "
+                 f"expected dict[str, list], got {data}"
+             )
+
+         rows = len(list(data.values())[0])
+         if not all(len(val) == rows for val in data.values()):
+             raise DataNotSupportedError(
+                 "All lists in the data dictionary must have the same length, "
+                 f"expected {rows} for all keys {list(data.keys())}"
+             )
+
+         return Dataset.from_dict(data, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_dict_list")
+ class InMemoryDictListDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         if (
+             not data
+             or not isinstance(data, list)
+             or not all(isinstance(item, dict) for item in data)
+             or not all(isinstance(key, str) for item in data for key in item)
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryDictListDatasetDeserializer, "
+                 f"expected list of dicts, got {data}"
+             )
+
+         typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+         first_keys = set(typed_data[0].keys())
+         for index, item in enumerate(typed_data):
+             if set(item.keys()) != first_keys:
+                 raise DataNotSupportedError(
+                     f"All dictionaries must have the same keys. "
+                     f"Expected keys: {first_keys}, "
+                     f"got keys at index {index}: {set(item.keys())}"
+                 )
+
+         # Convert list of dicts to dict of lists
+         result_dict: dict = {key: [] for key in first_keys}
+         for item in typed_data:
+             for key, value in item.items():
+                 result_dict[key].append(value)
+
+         return Dataset.from_dict(result_dict, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_item_list")
+ class InMemoryItemListDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+         primitive_types = (str, int, float, bool, type(None))
+         if (
+             not data
+             or not isinstance(data, list)
+             or not all(isinstance(item, primitive_types) for item in data)
+         ):
+             raise DataNotSupportedError(
+                 f"Unsupported data for InMemoryItemListDatasetDeserializer, "
+                 f"expected list of primitive items, got {data}"
+             )
+
+         column_name = data_kwargs.pop("column_name", "data")
+
+         return Dataset.from_dict({column_name: data}, **data_kwargs)
+
+
+ @DatasetDeserializerFactory.register("in_memory_json_str")
+ class InMemoryJsonStrDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if (
+             isinstance(data, str)
+             and (json_str := data.strip())
+             and (
+                 (json_str.startswith("{") and json_str.endswith("}"))
+                 or (json_str.startswith("[") and json_str.endswith("]"))
+             )
+         ):
+             with contextlib.suppress(Exception):
+                 parsed_data = json.loads(data)
+
+                 deserializers = [
+                     InMemoryDictDatasetDeserializer(),
+                     InMemoryDictListDatasetDeserializer(),
+                     InMemoryItemListDatasetDeserializer(),
+                 ]
+
+                 for deserializer in deserializers:
+                     with contextlib.suppress(DataNotSupportedError):
+                         return deserializer(
+                             parsed_data, processor_factory, random_seed, **data_kwargs
+                         )
+
+         raise DataNotSupportedError(
+             f"Unsupported data for InMemoryJsonStrDatasetDeserializer, "
+             f"expected JSON string with a list or dict of items, got {data}"
+         )
+
+
+ @DatasetDeserializerFactory.register("in_memory_csv_str")
+ class InMemoryCsvDatasetDeserializer(DatasetDeserializer):
+     def __call__(
+         self,
+         data: Any,
+         processor_factory: Callable[[], PreTrainedTokenizerBase],
+         random_seed: int,
+         **data_kwargs: dict[str, Any],
+     ) -> Dataset:
+         if (
+             isinstance(data, str)
+             and (csv_str := data.strip())
+             and len(csv_str.split("\n")) > 0
+         ):
+             with contextlib.suppress(Exception):
+                 csv_buffer = StringIO(data)
+                 reader = csv.DictReader(csv_buffer)
+                 rows = list(reader)
+
+                 return InMemoryDictListDatasetDeserializer()(
+                     rows, processor_factory, random_seed, **data_kwargs
+                 )
+
+         raise DataNotSupportedError(
+             f"Unsupported data for InMemoryCsvDatasetDeserializer, "
+             f"expected CSV string, got {type(data)}"
+         )
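
For reference, a minimal sketch of the two dict-shaped inputs the deserializers above accept; the column names and values are illustrative:

from guidellm.data.deserializers import (
    InMemoryDictDatasetDeserializer,
    InMemoryDictListDatasetDeserializer,
)

processor = lambda: None  # processor_factory is unused by these deserializers

# Column-oriented: dict[str, list] with equal-length value lists.
columns = InMemoryDictDatasetDeserializer()(
    {"prompt": ["a", "b"], "label": [0, 1]}, processor, random_seed=42
)

# Row-oriented: list[dict] where every row shares the same keys; internally
# converted to the column form before Dataset.from_dict.
rows = InMemoryDictListDatasetDeserializer()(
    [{"prompt": "a", "label": 0}, {"prompt": "b", "label": 1}],
    processor,
    random_seed=42,
)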