guidellm 0.4.0a18__py3-none-any.whl → 0.4.0a155__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (116)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +451 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +148 -317
  9. guidellm/benchmark/entrypoints.py +466 -128
  10. guidellm/benchmark/output.py +517 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2085 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +109 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +192 -0
  24. guidellm/data/deserializers/synthetic.py +346 -0
  25. guidellm/data/loaders.py +145 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +412 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +29 -0
  30. guidellm/data/processor.py +30 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +10 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/data/utils/functions.py +18 -0
  35. guidellm/extras/__init__.py +4 -0
  36. guidellm/extras/audio.py +215 -0
  37. guidellm/extras/vision.py +242 -0
  38. guidellm/logger.py +2 -2
  39. guidellm/mock_server/__init__.py +8 -0
  40. guidellm/mock_server/config.py +84 -0
  41. guidellm/mock_server/handlers/__init__.py +17 -0
  42. guidellm/mock_server/handlers/chat_completions.py +280 -0
  43. guidellm/mock_server/handlers/completions.py +280 -0
  44. guidellm/mock_server/handlers/tokenizer.py +142 -0
  45. guidellm/mock_server/models.py +510 -0
  46. guidellm/mock_server/server.py +168 -0
  47. guidellm/mock_server/utils.py +302 -0
  48. guidellm/preprocess/dataset.py +23 -26
  49. guidellm/presentation/builder.py +2 -2
  50. guidellm/presentation/data_models.py +25 -21
  51. guidellm/presentation/injector.py +2 -3
  52. guidellm/scheduler/__init__.py +65 -26
  53. guidellm/scheduler/constraints.py +1035 -0
  54. guidellm/scheduler/environments.py +252 -0
  55. guidellm/scheduler/scheduler.py +140 -368
  56. guidellm/scheduler/schemas.py +272 -0
  57. guidellm/scheduler/strategies.py +519 -0
  58. guidellm/scheduler/worker.py +391 -420
  59. guidellm/scheduler/worker_group.py +707 -0
  60. guidellm/schemas/__init__.py +31 -0
  61. guidellm/schemas/info.py +159 -0
  62. guidellm/schemas/request.py +216 -0
  63. guidellm/schemas/response.py +119 -0
  64. guidellm/schemas/stats.py +228 -0
  65. guidellm/{config.py → settings.py} +32 -21
  66. guidellm/utils/__init__.py +95 -8
  67. guidellm/utils/auto_importer.py +98 -0
  68. guidellm/utils/cli.py +46 -2
  69. guidellm/utils/console.py +183 -0
  70. guidellm/utils/encoding.py +778 -0
  71. guidellm/utils/functions.py +134 -0
  72. guidellm/utils/hf_datasets.py +1 -2
  73. guidellm/utils/hf_transformers.py +4 -4
  74. guidellm/utils/imports.py +9 -0
  75. guidellm/utils/messaging.py +1118 -0
  76. guidellm/utils/mixins.py +115 -0
  77. guidellm/utils/pydantic_utils.py +411 -0
  78. guidellm/utils/random.py +3 -4
  79. guidellm/utils/registry.py +220 -0
  80. guidellm/utils/singleton.py +133 -0
  81. guidellm/{objects → utils}/statistics.py +341 -247
  82. guidellm/utils/synchronous.py +159 -0
  83. guidellm/utils/text.py +163 -50
  84. guidellm/utils/typing.py +41 -0
  85. guidellm/version.py +1 -1
  86. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
  87. guidellm-0.4.0a155.dist-info/RECORD +96 -0
  88. guidellm/backend/__init__.py +0 -23
  89. guidellm/backend/backend.py +0 -259
  90. guidellm/backend/openai.py +0 -705
  91. guidellm/backend/response.py +0 -136
  92. guidellm/benchmark/aggregator.py +0 -760
  93. guidellm/benchmark/benchmark.py +0 -837
  94. guidellm/benchmark/scenario.py +0 -104
  95. guidellm/data/prideandprejudice.txt.gz +0 -0
  96. guidellm/dataset/__init__.py +0 -22
  97. guidellm/dataset/creator.py +0 -213
  98. guidellm/dataset/entrypoints.py +0 -42
  99. guidellm/dataset/file.py +0 -92
  100. guidellm/dataset/hf_datasets.py +0 -62
  101. guidellm/dataset/in_memory.py +0 -132
  102. guidellm/dataset/synthetic.py +0 -287
  103. guidellm/objects/__init__.py +0 -18
  104. guidellm/objects/pydantic.py +0 -89
  105. guidellm/request/__init__.py +0 -18
  106. guidellm/request/loader.py +0 -284
  107. guidellm/request/request.py +0 -79
  108. guidellm/request/types.py +0 -10
  109. guidellm/scheduler/queues.py +0 -25
  110. guidellm/scheduler/result.py +0 -155
  111. guidellm/scheduler/strategy.py +0 -495
  112. guidellm-0.4.0a18.dist-info/RECORD +0 -62
  113. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
  114. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
  115. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
  116. {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
guidellm/data/__init__.py CHANGED
@@ -1,4 +1,28 @@
-"""
-Required for python < 3.12
-https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
-"""
+from .collators import GenerativeRequestCollator
+from .deserializers import (
+    DataNotSupportedError,
+    DatasetDeserializer,
+    DatasetDeserializerFactory,
+)
+from .loaders import DataLoader, DatasetsIterator
+from .preprocessors import (
+    DataDependentPreprocessor,
+    DatasetPreprocessor,
+    PreprocessorRegistry,
+)
+from .processor import ProcessorFactory
+from .schemas import GenerativeDatasetColumnType
+
+__all__ = [
+    "DataDependentPreprocessor",
+    "DataLoader",
+    "DataNotSupportedError",
+    "DatasetDeserializer",
+    "DatasetDeserializerFactory",
+    "DatasetPreprocessor",
+    "DatasetsIterator",
+    "GenerativeDatasetColumnType",
+    "GenerativeRequestCollator",
+    "PreprocessorRegistry",
+    "ProcessorFactory",
+]
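
guidellm/data's __init__ previously carried only a compatibility docstring; it now re-exports the full data pipeline. A quick smoke test of the new import surface, assuming this 0.4.0a155 build is installed (importing the package also registers the built-in deserializers as a side effect):

from guidellm.data import (
    DataLoader,
    DatasetDeserializerFactory,
    GenerativeRequestCollator,
)

# Registered format names, e.g. "csv_file", "json_file", "huggingface"
print(DatasetDeserializerFactory.registry.keys())
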
guidellm/data/collators.py ADDED
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from guidellm.schemas import GenerationRequest
+
+__all__ = ["GenerativeRequestCollator"]
+
+
+class GenerativeRequestCollator:
+    def __call__(self, batch: list) -> GenerationRequest:
+        if len(batch) != 1:
+            raise NotImplementedError(
+                f"Batch size greater than 1 is not currently supported. "
+                f"Got batch size: {len(batch)}"
+            )
+
+        return batch[0]
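
The collator is deliberately a pass-through: it unwraps a single-element batch and rejects anything larger. A minimal behavioral sketch (the dict below is a stand-in for a real GenerationRequest; the collator does not enforce the annotation at runtime):

from guidellm.data import GenerativeRequestCollator

collator = GenerativeRequestCollator()

request = {"prompt": "Hello"}  # stand-in for a GenerationRequest instance
assert collator([request]) is request  # single-element batches are unwrapped

try:
    collator([request, request])  # any other batch size is rejected
except NotImplementedError as err:
    print(err)
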
guidellm/data/deserializers/__init__.py ADDED
@@ -0,0 +1,53 @@
+from .deserializer import (
+    DataNotSupportedError,
+    DatasetDeserializer,
+    DatasetDeserializerFactory,
+)
+from .file import (
+    ArrowFileDatasetDeserializer,
+    CSVFileDatasetDeserializer,
+    DBFileDatasetDeserializer,
+    HDF5FileDatasetDeserializer,
+    JSONFileDatasetDeserializer,
+    ParquetFileDatasetDeserializer,
+    TarFileDatasetDeserializer,
+    TextFileDatasetDeserializer,
+)
+from .huggingface import HuggingFaceDatasetDeserializer
+from .memory import (
+    InMemoryCsvDatasetDeserializer,
+    InMemoryDictDatasetDeserializer,
+    InMemoryDictListDatasetDeserializer,
+    InMemoryItemListDatasetDeserializer,
+    InMemoryJsonStrDatasetDeserializer,
+)
+from .synthetic import (
+    SyntheticTextDatasetConfig,
+    SyntheticTextDatasetDeserializer,
+    SyntheticTextGenerator,
+    SyntheticTextPrefixBucketConfig,
+)
+
+__all__ = [
+    "ArrowFileDatasetDeserializer",
+    "CSVFileDatasetDeserializer",
+    "DBFileDatasetDeserializer",
+    "DataNotSupportedError",
+    "DatasetDeserializer",
+    "DatasetDeserializerFactory",
+    "HDF5FileDatasetDeserializer",
+    "HuggingFaceDatasetDeserializer",
+    "InMemoryCsvDatasetDeserializer",
+    "InMemoryDictDatasetDeserializer",
+    "InMemoryDictListDatasetDeserializer",
+    "InMemoryItemListDatasetDeserializer",
+    "InMemoryJsonStrDatasetDeserializer",
+    "JSONFileDatasetDeserializer",
+    "ParquetFileDatasetDeserializer",
+    "SyntheticTextDatasetConfig",
+    "SyntheticTextDatasetDeserializer",
+    "SyntheticTextGenerator",
+    "SyntheticTextPrefixBucketConfig",
+    "TarFileDatasetDeserializer",
+    "TextFileDatasetDeserializer",
+]
guidellm/data/deserializers/deserializer.py ADDED
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+import contextlib
+from collections.abc import Callable
+from typing import Any, Protocol, Union, runtime_checkable
+
+from datasets import Dataset, IterableDataset
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.utils import resolve_dataset_split
+from guidellm.utils import RegistryMixin
+
+__all__ = [
+    "DataNotSupportedError",
+    "DatasetDeserializer",
+    "DatasetDeserializerFactory",
+]
+
+
+class DataNotSupportedError(Exception):
+    """Exception raised when data format is not supported by deserializer."""
+
+
+@runtime_checkable
+class DatasetDeserializer(Protocol):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]: ...
+
+
+class DatasetDeserializerFactory(
+    RegistryMixin[Union["type[DatasetDeserializer]", DatasetDeserializer]],
+):
+    @classmethod
+    def deserialize(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        type_: str | None = None,
+        resolve_split: bool = True,
+        select_columns: list[str] | None = None,
+        remove_columns: list[str] | None = None,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset | IterableDataset:
+        dataset = None
+
+        if type_ is None:
+            errors = []
+            # Note: There is no priority order for the deserializers, so all
+            # deserializers must be mutually exclusive for deterministic behavior.
+            for name, deserializer in cls.registry.items():
+                deserializer_fn: DatasetDeserializer = (
+                    deserializer() if isinstance(deserializer, type) else deserializer
+                )
+
+                try:
+                    with contextlib.suppress(DataNotSupportedError):
+                        dataset = deserializer_fn(
+                            data=data,
+                            processor_factory=processor_factory,
+                            random_seed=random_seed,
+                            **data_kwargs,
+                        )
+                except Exception as e:
+                    errors.append(e)
+
+                if dataset is not None:
+                    break  # Found one that works. Continuing could overwrite it.
+
+            if dataset is None and len(errors) > 0:
+                raise DataNotSupportedError(f"data deserialization failed; {len(errors)} errors occurred while "
+                    f"attempting to deserialize data {data}: {errors}")
+
+        elif (deserializer := cls.get_registered_object(type_)) is not None:
+            deserializer_fn: DatasetDeserializer = (
+                deserializer() if isinstance(deserializer, type) else deserializer
+            )
+
+            dataset = deserializer_fn(
+                data=data,
+                processor_factory=processor_factory,
+                random_seed=random_seed,
+                **data_kwargs,
+            )
+
+        if dataset is None:
+            raise DataNotSupportedError(
+                f"No suitable deserializer found for data {data} "
+                f"with kwargs {data_kwargs} and deserializer type {type_}."
+            )
+
+        if resolve_split:
+            dataset = resolve_dataset_split(dataset)
+
+        if select_columns is not None or remove_columns is not None:
+            column_names = dataset.column_names or list(next(iter(dataset)).keys())
+            if select_columns is not None:
+                remove_columns = [
+                    col for col in column_names if col not in select_columns
+                ]
+
+            dataset = dataset.remove_columns(remove_columns)
+
+        return dataset
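
DatasetDeserializerFactory is a RegistryMixin, so new formats plug in with the same register decorator the built-in deserializers use below. A hedged sketch of a custom deserializer; the "string_list" name and the class body are hypothetical, not part of the package:

from collections.abc import Callable
from typing import Any

from datasets import Dataset
from transformers import PreTrainedTokenizerBase

from guidellm.data.deserializers import (
    DataNotSupportedError,
    DatasetDeserializerFactory,
)


@DatasetDeserializerFactory.register("string_list")  # hypothetical format name
class StringListDatasetDeserializer:
    def __call__(
        self,
        data: Any,
        processor_factory: Callable[[], PreTrainedTokenizerBase],
        random_seed: int,
        **data_kwargs: dict[str, Any],
    ) -> Dataset:
        _ = (processor_factory, random_seed, data_kwargs)
        # Raising DataNotSupportedError signals "not mine" so the factory's
        # auto-detection loop moves on to the next registered deserializer.
        if not isinstance(data, list) or not all(isinstance(x, str) for x in data):
            raise DataNotSupportedError("expected a list of strings")
        return Dataset.from_dict({"text": data})

Because deserialize probes registered deserializers in no guaranteed order when type_ is None, a custom deserializer must raise DataNotSupportedError for any input it does not own, exactly as the note inside the loop warns.
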
guidellm/data/deserializers/file.py ADDED
@@ -0,0 +1,222 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+from datasets import Dataset, load_dataset
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.deserializers.deserializer import (
+    DataNotSupportedError,
+    DatasetDeserializer,
+    DatasetDeserializerFactory,
+)
+
+__all__ = [
+    "ArrowFileDatasetDeserializer",
+    "CSVFileDatasetDeserializer",
+    "DBFileDatasetDeserializer",
+    "HDF5FileDatasetDeserializer",
+    "JSONFileDatasetDeserializer",
+    "ParquetFileDatasetDeserializer",
+    "TarFileDatasetDeserializer",
+    "TextFileDatasetDeserializer",
+]
+
+
+@DatasetDeserializerFactory.register("text_file")
+class TextFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)  # Ignore unused args format errors
+
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() not in {".txt", ".text"}
+        ):
+            raise DataNotSupportedError(
+                "Unsupported data for TextFileDatasetDeserializer, "
+                f"expected str or Path to a local .txt or .text file, got {data}"
+            )
+
+        with path.open() as file:
+            lines = file.readlines()
+
+        return Dataset.from_dict({"text": lines}, **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("csv_file")
+class CSVFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() != ".csv"
+        ):
+            raise DataNotSupportedError(
+                "Unsupported data for CSVFileDatasetDeserializer, "
+                f"expected str or Path to a local .csv file, got {data}"
+            )
+
+        return load_dataset("csv", data_files=str(path), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("json_file")
+class JSONFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() not in {".json", ".jsonl"}
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for JSONFileDatasetDeserializer, "
+                f"expected str or Path to a local .json or .jsonl file, got {data}"
+            )
+
+        return load_dataset("json", data_files=str(path), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("parquet_file")
+class ParquetFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() != ".parquet"
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for ParquetFileDatasetDeserializer, "
+                f"expected str or Path to a local .parquet file, got {data}"
+            )
+
+        return load_dataset("parquet", data_files=str(path), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("arrow_file")
+class ArrowFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() != ".arrow"
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for ArrowFileDatasetDeserializer, "
+                f"expected str or Path to a local .arrow file, got {data}"
+            )
+
+        return load_dataset("arrow", data_files=str(path), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("hdf5_file")
+class HDF5FileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() not in {".hdf5", ".h5"}
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for HDF5FileDatasetDeserializer, "
+                f"expected str or Path to a local .hdf5 or .h5 file, got {data}"
+            )
+
+        return Dataset.from_pandas(pd.read_hdf(str(path)), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("db_file")
+class DBFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() != ".db"
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for DBFileDatasetDeserializer, "
+                f"expected str or Path to a local .db file, got {data}"
+            )
+
+        return Dataset.from_sql(con=str(path), **data_kwargs)
+
+
+@DatasetDeserializerFactory.register("tar_file")
+class TarFileDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+        if (
+            not isinstance(data, (str, Path))
+            or not (path := Path(data)).exists()
+            or not path.is_file()
+            or path.suffix.lower() != ".tar"
+        ):
+            raise DataNotSupportedError(
+                f"Unsupported data for TarFileDatasetDeserializer, "
+                f"expected str or Path to a local .tar file, got {data}"
+            )
+
+        return load_dataset("webdataset", data_files=str(path), **data_kwargs)
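
End users typically go through the factory rather than instantiating these classes directly. A sketch assuming a local prompts.csv (hypothetical file) and a tokenizer factory; with the default resolve_split=True, the DatasetDict returned by load_dataset is reduced to a single split via resolve_dataset_split:

from functools import partial

from transformers import AutoTokenizer

from guidellm.data.deserializers import DatasetDeserializerFactory

dataset = DatasetDeserializerFactory.deserialize(
    "prompts.csv",  # hypothetical local file
    processor_factory=partial(AutoTokenizer.from_pretrained, "gpt2"),
    type_="csv_file",  # dispatch directly instead of probing every deserializer
)
print(dataset)
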
guidellm/data/deserializers/huggingface.py ADDED
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    load_dataset,
+    load_from_disk,
+)
+from datasets.exceptions import (
+    DataFilesNotFoundError,
+    DatasetNotFoundError,
+    FileNotFoundDatasetsError,
+)
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.deserializers.deserializer import (
+    DataNotSupportedError,
+    DatasetDeserializer,
+    DatasetDeserializerFactory,
+)
+
+__all__ = ["HuggingFaceDatasetDeserializer"]
+
+
+@DatasetDeserializerFactory.register("huggingface")
+class HuggingFaceDatasetDeserializer(DatasetDeserializer):
+    def __call__(
+        self,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+        **data_kwargs: dict[str, Any],
+    ) -> dict[str, list]:
+        _ = (processor_factory, random_seed)
+
+        if isinstance(
+            data, Dataset | IterableDataset | DatasetDict | IterableDatasetDict
+        ):
+            return data
+
+        load_error = None
+
+        if (
+            isinstance(data, str | Path)
+            and (path := Path(data)).exists()
+            and ((path.is_file() and path.suffix == ".py") or path.is_dir())
+        ):
+            # Handle python script or nested python script in a directory
+            try:
+                return load_dataset(str(data), **data_kwargs)
+            except (
+                FileNotFoundDatasetsError,
+                DatasetNotFoundError,
+                DataFilesNotFoundError,
+            ) as err:
+                load_error = err
+            except Exception:  # noqa: BLE001
+                # Try loading as a local dataset directory next
+                try:
+                    return load_from_disk(str(data), **data_kwargs)
+                except (
+                    FileNotFoundDatasetsError,
+                    DatasetNotFoundError,
+                    DataFilesNotFoundError,
+                ) as err2:
+                    load_error = err2
+
+        try:
+            # Handle dataset identifier from the Hugging Face Hub
+            return load_dataset(str(data), **data_kwargs)
+        except (
+            FileNotFoundDatasetsError,
+            DatasetNotFoundError,
+            DataFilesNotFoundError,
+        ) as err:
+            load_error = err
+
+        not_supported = DataNotSupportedError(
+            "Unsupported data for HuggingFaceDatasetDeserializer, "
+            "expected Dataset, IterableDataset, DatasetDict, IterableDatasetDict, "
+            "str or Path to a local dataset directory or a local .py dataset script, "
+            f"got {data} and HF load error: {load_error}"
+        )
+
+        if load_error is not None:
+            raise not_supported from load_error
+        else:
+            raise not_supported
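
The Hugging Face deserializer accepts three shapes of input, tried in order: objects that are already datasets types, local script or directory paths, and Hub identifiers. A sketch of the last case, assuming network access to the Hub; "rotten_tomatoes" is just a small public dataset chosen for illustration:

from transformers import AutoTokenizer

from guidellm.data.deserializers import HuggingFaceDatasetDeserializer

deserializer = HuggingFaceDatasetDeserializer()
dataset_dict = deserializer(
    "rotten_tomatoes",  # Hub identifier; extra kwargs are forwarded to load_dataset
    processor_factory=lambda: AutoTokenizer.from_pretrained("gpt2"),
    random_seed=42,
)
print(dataset_dict)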