guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/data/preprocessors/formatters.py
@@ -0,0 +1,410 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ from guidellm.data.preprocessors.preprocessor import (
+     DatasetPreprocessor,
+     PreprocessorRegistry,
+ )
+ from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
+
+ __all__ = [
+     "GenerativeAudioTranscriptionRequestFormatter",
+     "GenerativeAudioTranslationRequestFormatter",
+     "GenerativeChatCompletionsRequestFormatter",
+     "GenerativeTextCompletionsRequestFormatter",
+     "RequestFormatter",
+ ]
+
+
+ class RequestFormatter(DatasetPreprocessor):
+     def __init__(self, model: str, **_kwargs):
+         self.model = model
+
+     @staticmethod
+     def encode_audio(*args, **kwargs):
+         from guidellm.extras.audio import encode_audio
+
+         return encode_audio(*args, **kwargs)
+
+     @staticmethod
+     def encode_image(*args, **kwargs):
+         from guidellm.extras.vision import encode_image
+
+         return encode_image(*args, **kwargs)
+
+     @staticmethod
+     def encode_video(*args, **kwargs):
+         from guidellm.extras.vision import encode_video
+
+         return encode_video(*args, **kwargs)
+
+
+ @PreprocessorRegistry.register("text_completions")
+ class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+     ):
+         self.model: str = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream: bool = stream
+         self.max_tokens: int | None = max_tokens or max_completion_tokens
+
+     def __call__(self, columns: dict[str, list[Any]]) -> GenerationRequest:
+         """
+         :param columns: A dict of GenerativeDatasetColumnType to Any
+         """
+         arguments: GenerationRequestArguments = GenerationRequestArguments()
+         arguments.body = {}  # The type checker works better setting this field here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body["stream"] = True
+             arguments.body["stream_options"] = {
+                 "include_usage": True,
+                 "continuous_usage_stats": True,
+             }
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+             arguments.body["max_tokens"] = output_tokens
+             arguments.body["stop"] = None
+             arguments.body["ignore_eos"] = True
+         elif self.max_tokens is not None:
+             arguments.body["max_tokens"] = self.max_tokens
+
+         # Handle prompt tokens
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build prompt
+         prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
+         text = "".join(txt for txt in columns.get("text_column", []) if txt)
+         if prefix or text:
+             prompt = prefix + text
+             arguments.body["prompt"] = prompt
+             input_metrics.add_text_metrics(prompt)
+
+         return GenerationRequest(
+             request_type="text_completions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("chat_completions")
+ class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+         encode_kwargs: dict[str, Any] | None = None,
+     ):
+         self.model = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream = stream
+         self.max_completion_tokens = max_tokens or max_completion_tokens
+         self.encode_image_kwargs = (
+             encode_kwargs.get("image", {}) if encode_kwargs else {}
+         )
+         self.encode_video_kwargs = (
+             encode_kwargs.get("video", {}) if encode_kwargs else {}
+         )
+         self.encode_audio_kwargs = (
+             encode_kwargs.get("audio", {}) if encode_kwargs else {}
+         )
+
+     def __call__(  # noqa: C901, PLR0912, PLR0915
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         """
+         :param columns: A dict of GenerativeDatasetColumnType to Any
+         """
+         arguments = GenerationRequestArguments()
+         arguments.body = {}  # The type checker works best with body assigned here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body["stream"] = True
+             arguments.body["stream_options"] = {
+                 "include_usage": True,
+                 "continuous_usage_stats": True,
+             }
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+             arguments.body.update(
+                 {
+                     "max_completion_tokens": output_tokens,
+                     "stop": None,
+                     "ignore_eos": True,
+                 }
+             )
+         elif self.max_completion_tokens is not None:
+             arguments.body["max_completion_tokens"] = self.max_completion_tokens
+
+         # Handle prompt tokens
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build messages
+         arguments.body["messages"] = []
+
+         for prefix in columns.get("prefix_column", []):
+             if not prefix:
+                 continue
+
+             input_metrics.add_text_metrics(prefix)
+             arguments.body["messages"].append({"role": "system", "content": prefix})
+
+         for text in columns.get("text_column", []):
+             if not text:
+                 continue
+
+             input_metrics.add_text_metrics(text)
+
+             arguments.body["messages"].append(
+                 {"role": "user", "content": [{"type": "text", "text": text}]}
+             )
+
+         for image in columns.get("image_column", []):
+             if not image:
+                 continue
+
+             image_dict = self.encode_image(image, **self.encode_image_kwargs)
+             if (image_pixels := image_dict.get("image_pixels")) is not None:
+                 input_metrics.image_pixels = (
+                     input_metrics.image_pixels or 0
+                 ) + image_pixels
+             if (image_bytes := image_dict.get("image_bytes")) is not None:
+                 input_metrics.image_bytes = (
+                     input_metrics.image_bytes or 0
+                 ) + image_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": image_dict.get("image")},
+                         }
+                     ],
+                 }
+             )
+
+         for video in columns.get("video_column", []):
+             if not video:
+                 continue
+
+             video_dict = self.encode_video(video, **self.encode_video_kwargs)
+             if (video_frames := video_dict.get("video_frames")) is not None:
+                 input_metrics.video_frames = (
+                     input_metrics.video_frames or 0
+                 ) + video_frames
+             if (video_seconds := video_dict.get("video_seconds")) is not None:
+                 input_metrics.video_seconds = (
+                     input_metrics.video_seconds or 0.0
+                 ) + video_seconds
+             if (video_bytes := video_dict.get("video_bytes")) is not None:
+                 input_metrics.video_bytes = (
+                     input_metrics.video_bytes or 0
+                 ) + video_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "video_url",
+                             "video_url": {"url": video_dict.get("video")},
+                         }
+                     ],
+                 }
+             )
+
+         for audio in columns.get("audio_column", []):
+             if not audio:
+                 continue
+
+             audio_dict = self.encode_audio(
+                 audio, b64encode=True, **self.encode_audio_kwargs
+             )
+             if (audio_samples := audio_dict.get("audio_samples")) is not None:
+                 input_metrics.audio_samples = (
+                     input_metrics.audio_samples or 0
+                 ) + audio_samples
+             if (audio_seconds := audio_dict.get("audio_seconds")) is not None:
+                 input_metrics.audio_seconds = (
+                     input_metrics.audio_seconds or 0.0
+                 ) + audio_seconds
+             if (audio_bytes := audio_dict.get("audio_bytes")) is not None:
+                 input_metrics.audio_bytes = (
+                     input_metrics.audio_bytes or 0
+                 ) + audio_bytes
+
+             arguments.body["messages"].append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "input_audio",
+                             "input_audio": {
+                                 "data": audio_dict.get("audio"),
+                                 "format": audio_dict.get("format"),
+                             },
+                         }
+                     ],
+                 }
+             )
+
+         return GenerationRequest(
+             request_type="chat_completions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("audio_transcriptions")
+ class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
+     def __init__(
+         self,
+         model: str,
+         extras: dict[str, Any] | GenerationRequestArguments | None = None,
+         stream: bool = True,
+         encode_kwargs: dict[str, Any] | None = None,
+     ):
+         self.model = model
+         self.extras = (
+             GenerationRequestArguments(**extras)
+             if extras and isinstance(extras, dict)
+             else extras
+         )
+         self.stream = stream
+         self.encode_audio_kwargs = encode_kwargs or {}
+
+     def __call__(  # noqa: C901
+         self, columns: dict[str, list[Any]]
+     ) -> GenerationRequest:
+         arguments = GenerationRequestArguments(files={})
+         arguments.body = {}  # The type checker works best with body assigned here
+         input_metrics = UsageMetrics()
+         output_metrics = UsageMetrics()
+
+         # Add model
+         if self.model is not None:
+             arguments.body["model"] = self.model
+
+         # Configure streaming
+         if self.stream:
+             arguments.stream = True
+             arguments.body["stream"] = True
+             # NOTE: File upload endpoints use flattened stream options
+             arguments.body["stream_include_usage"] = True
+             arguments.body["stream_continuous_usage_stats"] = True
+
+         # Handle output tokens
+         if output_tokens := sum(
+             count for count in columns.get("output_tokens_count_column", []) if count
+         ):
+             output_metrics.text_tokens = output_tokens
+
+         # Handle prompt tokens (for audio duration tracking)
+         if prompt_tokens := sum(
+             count for count in columns.get("prompt_tokens_count_column", []) if count
+         ):
+             input_metrics.text_tokens = prompt_tokens
+
+         # Apply extra arguments
+         if self.extras:
+             arguments.model_combine(self.extras)
+
+         # Build audio input
+         audio_columns = columns.get("audio_column", [])
+         if len(audio_columns) != 1:
+             raise ValueError(
+                 f"GenerativeAudioTranscriptionRequestFormatter expects exactly "
+                 f"one audio column, but got {len(audio_columns)}."
+             )
+
+         audio_dict = self.encode_audio(
+             audio_columns[0], b64encode=False, **self.encode_audio_kwargs
+         )
+         input_metrics.audio_samples = audio_dict.get("audio_samples")
+         input_metrics.audio_seconds = audio_dict.get("audio_seconds")
+         input_metrics.audio_bytes = audio_dict.get("audio_bytes")
+
+         arguments.files = {
+             "file": (
+                 audio_dict.get("file_name", "audio_input"),
+                 audio_dict.get("audio"),
+                 audio_dict.get("mimetype"),
+             )
+         }
+
+         return GenerationRequest(
+             request_type="audio_transcriptions",
+             arguments=arguments,
+             input_metrics=input_metrics,
+             output_metrics=output_metrics,
+         )
+
+
+ @PreprocessorRegistry.register("audio_translations")
+ class GenerativeAudioTranslationRequestFormatter(
+     GenerativeAudioTranscriptionRequestFormatter
+ ):
+     def __call__(self, columns: dict[str, list[Any]]) -> GenerationRequest:
+         result = super().__call__(columns)
+         result.request_type = "audio_translations"
+         return result
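
The formatters in this new module share one pattern: each is registered with PreprocessorRegistry under a request type and converts a dict of mapped dataset columns into a GenerationRequest. A minimal usage sketch follows; it is not part of the diff, the model name and column values are invented, and it assumes the guidellm 0.6.0a5 modules shown above import as written:

from guidellm.data.preprocessors.formatters import (
    GenerativeTextCompletionsRequestFormatter,
)

# Hypothetical inputs; in practice these come from the data loader pipeline.
formatter = GenerativeTextCompletionsRequestFormatter(
    model="example-model",
    stream=True,
    max_tokens=64,
)
request = formatter(
    {
        "prefix_column": ["You are a helpful assistant. "],
        "text_column": ["Summarize the benefits of streaming responses."],
        "output_tokens_count_column": [64],
    }
)
# request.request_type == "text_completions"; request.arguments.body now carries
# the model, prompt, stream options, and max_tokens, while request.input_metrics
# and request.output_metrics record the token accounting used by the benchmarker.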
guidellm/data/preprocessors/mappers.py
@@ -0,0 +1,196 @@
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from typing import Any, ClassVar, cast
+
+ from datasets import Dataset, IterableDataset
+
+ from guidellm.data.preprocessors.preprocessor import (
+     DataDependentPreprocessor,
+     PreprocessorRegistry,
+ )
+ from guidellm.data.schemas import GenerativeDatasetColumnType
+
+ __all__ = ["GenerativeColumnMapper"]
+
+
+ @PreprocessorRegistry.register("generative_column_mapper")
+ class GenerativeColumnMapper(DataDependentPreprocessor):
+     defaults: ClassVar[dict[str, list[str]]] = {
+         "prompt_tokens_count_column": ["prompt_tokens_count", "input_tokens_count"],
+         "output_tokens_count_column": [
+             "output_tokens_count",
+             "completion_tokens_count",
+         ],
+         "prefix_column": [
+             "system_prompt",
+             "system",
+             "prefix",
+         ],
+         "text_column": [
+             "prompt",
+             "instruction",
+             "question",
+             "input",
+             "context",
+             "content",
+             "conversation",
+             "turn",
+             "text",
+         ],
+         "image_column": [
+             "image",
+             "picture",
+             "photo",
+             "img",
+         ],
+         "video_column": [
+             "video",
+             "clip",
+             "movie",
+             "footage",
+             "mp4",
+             "mov",
+             "avi",
+         ],
+         "audio_column": [
+             "audio",
+             "sound",
+             "voice",
+             "speech",
+             "wav",
+             "mp3",
+         ],
+     }
+
+     @classmethod
+     def datasets_default_mappings(
+         cls, datasets: list[Dataset | IterableDataset]
+     ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+         mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+             defaultdict(list)
+         )
+
+         for index, dataset in enumerate(datasets):
+             dataset_columns = dataset.column_names or list(next(iter(dataset)).keys())
+
+             for column_type in cls.defaults:
+                 if column_type in mappings:
+                     continue
+
+                 type_names = [
+                     variant
+                     for name in cls.defaults.get(column_type, [])
+                     for plural in [name, f"{name}s", f"{name}es"]
+                     for variant in [
+                         plural,
+                         plural.lower(),
+                         plural.upper(),
+                         plural.capitalize(),
+                     ]
+                 ]
+
+                 for name in type_names:
+                     if name in dataset_columns:
+                         key = cast("GenerativeDatasetColumnType", column_type)
+                         mappings[key].append((index, name))
+                         break
+
+         return mappings
+
+     @classmethod
+     def datasets_mappings(
+         cls,
+         datasets: list[Dataset | IterableDataset],
+         input_mappings: dict[GenerativeDatasetColumnType, str | list[str]],
+     ) -> dict[GenerativeDatasetColumnType, list[tuple[int, str]]]:
+         mappings: dict[GenerativeDatasetColumnType, list[tuple[int, str]]] = (
+             defaultdict(list)
+         )
+         datasets_named_indices = {
+             (
+                 dataset.info.dataset_name
+                 if dataset.info and dataset.info.dataset_name
+                 else index
+             ): index
+             for index, dataset in enumerate(datasets)
+         }
+         datasets_columns = {
+             index: dataset.column_names or list(next(iter(dataset)).keys())
+             for index, dataset in enumerate(datasets)
+         }
+
+         # Parse out user mappings that were passed in and validate them
+         # Must be in the format of:
+         #     {<column_type>: [<column_names>]}
+         # where <column_names> can be a single string or list of strings
+         # and each string can be any of:
+         #   - a column name (assumes the first dataset was intended)
+         #   - <int>.<column_name> where <int> is the dataset index
+         #   - <str>.<column_name> where <str> is the dataset name
+         for column_type, names in input_mappings.items():
+             mappings[column_type] = []
+             for name in names if isinstance(names, list) else [names]:
+                 if "." in name:
+                     dataset, column_name = name.split(".", 1)
+                     dataset_index = (
+                         int(dataset)
+                         if dataset.isdigit()
+                         else datasets_named_indices.get(dataset)
+                     )
+                 else:
+                     dataset_index = 0
+                     column_name = name
+
+                 if dataset_index is None or dataset_index >= len(datasets):
+                     raise ValueError(
+                         f"Dataset '{name}' not found in datasets: "
+                         f"{datasets_named_indices}."
+                     )
+                 if column_name not in datasets_columns[dataset_index]:
+                     raise ValueError(
+                         f"Column '{column_name}' not found in dataset "
+                         f"'{datasets[dataset_index]}' "
+                         f"columns: {datasets_columns[dataset_index]}."
+                     )
+                 mappings[column_type].append((dataset_index, column_name))
+
+         return mappings
+
+     def __init__(
+         self,
+         column_mappings: dict[GenerativeDatasetColumnType, str | list[str]]
+         | None = None,
+     ):
+         self.input_mappings = column_mappings
+         self.datasets_column_mappings: (
+             dict[GenerativeDatasetColumnType, list[tuple[int, str]]] | None
+         )
+
+     def __call__(self, row: dict[str, Any]) -> dict[str, list[Any]]:
+         if self.datasets_column_mappings is None:
+             raise ValueError("DefaultGenerativeColumnMapper not setup with data.")
+
+         items = cast("dict[int, dict[str, Any]]", row.pop("items"))
+         mapped: dict[str, Any] = defaultdict(list)
+
+         for column_type, column_mappings in self.datasets_column_mappings.items():
+             for (
+                 dataset_index,
+                 dataset_column,
+             ) in column_mappings:
+                 mapped[column_type].append(items[dataset_index][dataset_column])
+
+         return dict(mapped)
+
+     def setup_data(
+         self,
+         datasets: list[Dataset | IterableDataset],
+         data_args: list[dict[str, Any]],
+     ):
+         _ = data_args  # Unused for this mapper
+         self.datasets_column_mappings = (
+             self.datasets_default_mappings(datasets)
+             if self.input_mappings is None
+             else self.datasets_mappings(datasets, self.input_mappings)
+         )
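
To make the mapping rules above concrete, here is a short sketch, not part of the diff, using an invented in-memory dataset. It shows the default column detection driven by the defaults table, and the explicit <dataset>.<column> override format documented in the comments:

from datasets import Dataset

from guidellm.data.preprocessors.mappers import GenerativeColumnMapper

ds = Dataset.from_dict(
    {"question": ["What does GuideLLM measure?"], "output_tokens_count": [32]}
)

# Default detection: "question" matches text_column and "output_tokens_count"
# matches output_tokens_count_column.
mapper = GenerativeColumnMapper()
mapper.setup_data(datasets=[ds], data_args=[{}])
row = {"items": {0: {"question": "What does GuideLLM measure?", "output_tokens_count": 32}}}
print(mapper(row))
# -> {'output_tokens_count_column': [32], 'text_column': ['What does GuideLLM measure?']}

# Explicit override: "0.question" selects column "question" from dataset index 0.
explicit = GenerativeColumnMapper(column_mappings={"text_column": "0.question"})
explicit.setup_data(datasets=[ds], data_args=[{}])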
guidellm/data/preprocessors/preprocessor.py
@@ -0,0 +1,30 @@
+ from __future__ import annotations
+
+ from typing import Any, Protocol, runtime_checkable
+
+ from datasets import Dataset, IterableDataset
+
+ from guidellm.schemas import GenerationRequest
+ from guidellm.utils import RegistryMixin
+
+ __all__ = ["DataDependentPreprocessor", "DatasetPreprocessor", "PreprocessorRegistry"]
+
+
+ @runtime_checkable
+ class DatasetPreprocessor(Protocol):
+     def __call__(self, item: dict[str, Any]) -> GenerationRequest | dict[str, Any]: ...
+
+
+ @runtime_checkable
+ class DataDependentPreprocessor(DatasetPreprocessor, Protocol):
+     def setup_data(
+         self,
+         datasets: list[Dataset | IterableDataset],
+         data_args: list[dict[str, Any]],
+     ): ...
+
+
+ class PreprocessorRegistry(
+     RegistryMixin[type[DatasetPreprocessor] | type[DataDependentPreprocessor]]
+ ):
+     pass
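
These protocols and the registry are the extension point that the formatters and mappers in this diff plug into via the @PreprocessorRegistry.register(...) decorator. A sketch of a custom preprocessor follows; the class name and registry key are hypothetical and not part of the diff:

from typing import Any

from guidellm.data.preprocessors.preprocessor import (
    DatasetPreprocessor,
    PreprocessorRegistry,
)


@PreprocessorRegistry.register("uppercase_text")  # hypothetical key, not in guidellm
class UppercaseTextPreprocessor(DatasetPreprocessor):
    # Toy preprocessor: uppercases every mapped text column value and returns the
    # row unchanged otherwise, matching the DatasetPreprocessor protocol above.
    def __call__(self, item: dict[str, Any]) -> dict[str, Any]:
        item["text_column"] = [text.upper() for text in item.get("text_column", [])]
        return item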
guidellm/data/processor.py
@@ -0,0 +1,29 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase  # type: ignore[import]
+
+ __all__ = ["ProcessorFactory"]
+
+
+ class ProcessorFactory:
+     def __init__(
+         self,
+         processor: str | Path | PreTrainedTokenizerBase,
+         processor_args: dict[str, Any] | None = None,
+     ) -> None:
+         self.processor = processor
+         self.processor_args = processor_args or {}
+
+     def __call__(self) -> PreTrainedTokenizerBase:
+         if isinstance(self.processor, PreTrainedTokenizerBase):
+             return self.processor
+         else:
+             from_pretrained = AutoTokenizer.from_pretrained(
+                 self.processor,
+                 **(self.processor_args or {}),
+             )
+             self.processor = from_pretrained
+             return from_pretrained
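
ProcessorFactory defers tokenizer loading until the factory is first called and then caches the loaded instance on self.processor, so later calls return the same object. A usage sketch, with an illustrative model id that is not part of the diff:

from guidellm.data.processor import ProcessorFactory

factory = ProcessorFactory("gpt2", processor_args={"use_fast": True})
tokenizer = factory()  # loads via AutoTokenizer.from_pretrained on the first call
same = factory()       # the cached PreTrainedTokenizerBase is returned afterwards
assert tokenizer is same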