guidellm 0.1.0__py3-none-any.whl → 0.2.0.dev0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (69)
  1. guidellm/__init__.py +38 -6
  2. guidellm/__main__.py +294 -0
  3. guidellm/backend/__init__.py +19 -6
  4. guidellm/backend/backend.py +238 -0
  5. guidellm/backend/openai.py +532 -122
  6. guidellm/backend/response.py +132 -0
  7. guidellm/benchmark/__init__.py +73 -0
  8. guidellm/benchmark/aggregator.py +760 -0
  9. guidellm/benchmark/benchmark.py +838 -0
  10. guidellm/benchmark/benchmarker.py +334 -0
  11. guidellm/benchmark/entrypoints.py +141 -0
  12. guidellm/benchmark/output.py +946 -0
  13. guidellm/benchmark/profile.py +409 -0
  14. guidellm/benchmark/progress.py +720 -0
  15. guidellm/config.py +34 -56
  16. guidellm/data/__init__.py +4 -0
  17. guidellm/data/prideandprejudice.txt.gz +0 -0
  18. guidellm/dataset/__init__.py +22 -0
  19. guidellm/dataset/creator.py +213 -0
  20. guidellm/dataset/entrypoints.py +42 -0
  21. guidellm/dataset/file.py +90 -0
  22. guidellm/dataset/hf_datasets.py +62 -0
  23. guidellm/dataset/in_memory.py +132 -0
  24. guidellm/dataset/synthetic.py +262 -0
  25. guidellm/objects/__init__.py +18 -0
  26. guidellm/objects/pydantic.py +60 -0
  27. guidellm/objects/statistics.py +947 -0
  28. guidellm/request/__init__.py +12 -10
  29. guidellm/request/loader.py +281 -0
  30. guidellm/request/request.py +79 -0
  31. guidellm/scheduler/__init__.py +51 -3
  32. guidellm/scheduler/result.py +137 -0
  33. guidellm/scheduler/scheduler.py +382 -0
  34. guidellm/scheduler/strategy.py +493 -0
  35. guidellm/scheduler/types.py +7 -0
  36. guidellm/scheduler/worker.py +511 -0
  37. guidellm/utils/__init__.py +16 -29
  38. guidellm/utils/colors.py +8 -0
  39. guidellm/utils/hf_transformers.py +35 -0
  40. guidellm/utils/random.py +43 -0
  41. guidellm/utils/text.py +118 -357
  42. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/METADATA +96 -79
  43. guidellm-0.2.0.dev0.dist-info/RECORD +48 -0
  44. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/WHEEL +1 -1
  45. guidellm-0.2.0.dev0.dist-info/entry_points.txt +2 -0
  46. guidellm/backend/base.py +0 -320
  47. guidellm/core/__init__.py +0 -24
  48. guidellm/core/distribution.py +0 -190
  49. guidellm/core/report.py +0 -321
  50. guidellm/core/request.py +0 -44
  51. guidellm/core/result.py +0 -545
  52. guidellm/core/serializable.py +0 -169
  53. guidellm/executor/__init__.py +0 -10
  54. guidellm/executor/base.py +0 -213
  55. guidellm/executor/profile_generator.py +0 -343
  56. guidellm/main.py +0 -336
  57. guidellm/request/base.py +0 -194
  58. guidellm/request/emulated.py +0 -391
  59. guidellm/request/file.py +0 -76
  60. guidellm/request/transformers.py +0 -100
  61. guidellm/scheduler/base.py +0 -374
  62. guidellm/scheduler/load_generator.py +0 -196
  63. guidellm/utils/injector.py +0 -70
  64. guidellm/utils/progress.py +0 -196
  65. guidellm/utils/transformers.py +0 -151
  66. guidellm-0.1.0.dist-info/RECORD +0 -35
  67. guidellm-0.1.0.dist-info/entry_points.txt +0 -3
  68. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info/licenses}/LICENSE +0 -0
  69. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/top_level.txt +0 -0
guidellm/main.py DELETED
@@ -1,336 +0,0 @@
- import asyncio
- from typing import Literal, Optional, get_args
-
- import click
- from loguru import logger
-
- from guidellm.backend import Backend, BackendEnginePublic
- from guidellm.core import GuidanceReport, TextGenerationBenchmarkReport
- from guidellm.executor import Executor, ProfileGenerationMode
- from guidellm.request import (
-     EmulatedRequestGenerator,
-     FileRequestGenerator,
-     TransformersDatasetRequestGenerator,
- )
- from guidellm.request.base import RequestGenerator
- from guidellm.utils import BenchmarkReportProgress
-
- __all__ = ["generate_benchmark_report"]
-
-
- @click.command()
- @click.option(
-     "--target",
-     type=str,
-     required=True,
-     help=(
-         "The target path or url for the backend to evaluate. "
-         "Ex: 'http://localhost:8000/v1'"
-     ),
- )
- @click.option(
-     "--backend",
-     type=click.Choice(get_args(BackendEnginePublic)),
-     default="openai_server",
-     help=(
-         "The backend to use for benchmarking. "
-         "The default is OpenAI Server enabling compatability with any server that "
-         "follows the OpenAI spec including vLLM."
-     ),
- )
- @click.option(
-     "--model",
-     type=str,
-     default=None,
-     help=(
-         "The Model to use for benchmarking. If not provided, it will use "
-         "the first available model provided the backend supports listing models."
-     ),
- )
- @click.option(
-     "--data",
-     type=str,
-     required=True,
-     help=(
-         "The data source to use for benchmarking. "
-         "Depending on the data-type, it should be a "
-         "path to a data file containing prompts to run (ex: data.txt), "
-         "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), "
-         "or a configuration for emulated data "
-         "(ex: 'prompt_tokens=128,generated_tokens=128')."
-     ),
- )
- @click.option(
-     "--data-type",
-     type=click.Choice(["emulated", "file", "transformers"]),
-     required=True,
-     help=(
-         "The type of data to use for benchmarking. "
-         "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' "
-         "for a HuggingFace dataset. Specify the data source with the --data flag."
-     ),
- )
- @click.option(
-     "--tokenizer",
-     type=str,
-     default=None,
-     help=(
-         "The tokenizer to use for calculating the number of prompt tokens. "
-         "This should match the tokenizer used by the model."
-         "By default, it will use the --model flag to determine the tokenizer. "
-         "If not provided and the model is not available, will raise an error. "
-         "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'"
-     ),
- )
- @click.option(
-     "--rate-type",
-     type=click.Choice(get_args(ProfileGenerationMode)),
-     default="sweep",
-     help=(
-         "The type of request rate to use for benchmarking. "
-         "Use sweep to run a full range from synchronous to throughput (default), "
-         "synchronous for sending requests one after the other, "
-         "throughput to send requests as fast as possible, "
-         "constant for a fixed request rate, "
-         "or poisson for a real-world variable request rate."
-     ),
- )
- @click.option(
-     "--rate",
-     type=float,
-     default=None,
-     help=(
-         "The request rate to use for constant and poisson rate types. "
-         "To run multiple, provide the flag multiple times. "
-     ),
-     multiple=True,
- )
- @click.option(
-     "--max-seconds",
-     type=int,
-     default=120,
-     help=(
-         "The maximum number of seconds for each benchmark run. "
-         "Either max-seconds, max-requests, or both must be set. "
-         "The default is 120 seconds. "
-         "Note, this is the maximum time for each rate supplied, not the total time. "
-         "This value should be large enough to allow for "
-         "the server's performance to stabilize."
-     ),
- )
- @click.option(
-     "--max-requests",
-     type=int,
-     default=None,
-     help=(
-         "The maximum number of requests for each benchmark run. "
-         "Either max-seconds, max-requests, or both must be set. "
-         "Note, this is the maximum number of requests for each rate supplied, "
-         "not the total number of requests. "
-         "This value should be large enough to allow for "
-         "the server's performance to stabilize."
-     ),
- )
- @click.option(
-     "--output-path",
-     type=str,
-     default=None,
-     help=(
-         "The output path to save the output report to for loading later. "
-         "Ex: guidance_report.json. "
-         "The default is None, meaning no output is saved and results are only "
-         "printed to the console."
-     ),
- )
- @click.option(
-     "--enable-continuous-refresh",
-     is_flag=True,
-     default=False,
-     help=(
-         "Enable continual refreshing of the output table in the CLI "
-         "until the user exits. "
-     ),
- )
- def generate_benchmark_report_cli(
-     target: str,
-     backend: BackendEnginePublic,
-     model: Optional[str],
-     data: Optional[str],
-     data_type: Literal["emulated", "file", "transformers"],
-     tokenizer: Optional[str],
-     rate_type: ProfileGenerationMode,
-     rate: Optional[float],
-     max_seconds: Optional[int],
-     max_requests: Optional[int],
-     output_path: str,
-     enable_continuous_refresh: bool,
- ):
-     """
-     Generate a benchmark report for a specified backend and dataset.
-     """
-     generate_benchmark_report(
-         target=target,
-         backend=backend,
-         model=model,
-         data=data,
-         data_type=data_type,
-         tokenizer=tokenizer,
-         rate_type=rate_type,
-         rate=rate,
-         max_seconds=max_seconds,
-         max_requests=max_requests,
-         output_path=output_path,
-         cont_refresh_table=enable_continuous_refresh,
-     )
-
-
- def generate_benchmark_report(
-     target: str,
-     backend: BackendEnginePublic,
-     model: Optional[str],
-     data: Optional[str],
-     data_type: Literal["emulated", "file", "transformers"],
-     tokenizer: Optional[str],
-     rate_type: ProfileGenerationMode,
-     rate: Optional[float],
-     max_seconds: Optional[int],
-     max_requests: Optional[int],
-     output_path: str,
-     cont_refresh_table: bool,
- ) -> GuidanceReport:
-     """
-     Generate a benchmark report for a specified backend and dataset.
-
-     :param target: The target URL or path for the backend to evaluate.
-     :param backend: The backend type to use for benchmarking.
-     :param model: The model to benchmark;
-         defaults to the first available if not specified.
-     :param data: The data source for benchmarking,
-         which may be a path, dataset name, or config.
-     :param data_type: The type of data to use,
-         such as 'emulated', 'file', or 'transformers'.
-     :param tokenizer: The tokenizer to use for token counting,
-         defaulting to Llama 3.1 if not provided.
-     :param rate_type: The rate type for requests during benchmarking.
-     :param rate: The specific request rate for constant and poisson rate types.
-     :param max_seconds: Maximum duration for each benchmark run in seconds.
-     :param max_requests: Maximum number of requests per benchmark run.
-     :param output_path: Path to save the output report file.
-     :param cont_refresh_table: Continually refresh the table in the CLI
-         until the user exits.
-     """
-     logger.info(
-         "Generating benchmark report with target: {}, backend: {}", target, backend
-     )
-
-     # Create backend
-     backend_inst = Backend.create(
-         backend_type=backend,
-         target=target,
-         model=model,
-     )
-
-     request_generator: RequestGenerator
-
-     # Create tokenizer and request generator
-     tokenizer_inst = tokenizer
-     if not tokenizer_inst:
-         try:
-             tokenizer_inst = backend_inst.model_tokenizer()
-         except Exception as err:
-             raise ValueError(
-                 "Could not load model's tokenizer, "
-                 "--tokenizer must be provided for request generation"
-             ) from err
-
-     if data_type == "emulated":
-         request_generator = EmulatedRequestGenerator(
-             config=data, tokenizer=tokenizer_inst
-         )
-     elif data_type == "file":
-         request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst)
-     elif data_type == "transformers":
-         request_generator = TransformersDatasetRequestGenerator(
-             dataset=data, tokenizer=tokenizer_inst
-         )
-     else:
-         raise ValueError(f"Unknown data type: {data_type}")
-
-     # Create executor
-     executor = Executor(
-         backend=backend_inst,
-         request_generator=request_generator,
-         mode=rate_type,
-         rate=rate if rate_type in ("constant", "poisson") else None,
-         max_number=max_requests,
-         max_duration=max_seconds,
-     )
-
-     # Run executor
-     logger.debug(
-         "Running executor with args: {}",
-         {
-             "backend": backend,
-             "request_generator": request_generator,
-             "mode": rate_type,
-             "rate": rate,
-             "max_number": max_requests,
-             "max_duration": max_seconds,
-         },
-     )
-     report = asyncio.run(_run_executor_for_result(executor))
-
-     # Save and print report
-     guidance_report = GuidanceReport()
-     guidance_report.benchmarks.append(report)
-
-     if output_path:
-         guidance_report.save_file(output_path)
-
-     guidance_report.print(
-         save_path=output_path if output_path is not None else "stdout",
-         continual_refresh=cont_refresh_table,
-     )
-
-     return guidance_report
-
-
- async def _run_executor_for_result(executor: Executor) -> TextGenerationBenchmarkReport:
-     report = None
-     progress = BenchmarkReportProgress()
-     started = False
-
-     async for result in executor.run():
-         if not started:
-             progress.start(result.generation_modes)  # type: ignore # noqa: PGH003
-             started = True
-
-         if result.current_index is not None:
-             description = f"{result.current_profile.load_gen_mode}"  # type: ignore # noqa: PGH003
-             if result.current_profile.load_gen_mode in ("constant", "poisson"):  # type: ignore # noqa: PGH003
-                 description += f"@{result.current_profile.load_gen_rate:.2f} req/s"  # type: ignore # noqa: PGH003
-
-             progress.update_benchmark(
-                 index=result.current_index,
-                 description=description,
-                 completed=result.scheduler_result.completed,  # type: ignore # noqa: PGH003
-                 completed_count=result.scheduler_result.count_completed,  # type: ignore # noqa: PGH003
-                 completed_total=result.scheduler_result.count_total,  # type: ignore # noqa: PGH003
-                 start_time=result.scheduler_result.benchmark.start_time,  # type: ignore # noqa: PGH003
-                 req_per_sec=result.scheduler_result.benchmark.completed_request_rate,  # type: ignore # noqa: PGH003
-             )
-
-         if result.completed:
-             report = result.report
-             break
-
-     progress.finish()
-
-     if not report:
-         raise ValueError("No report generated by executor")
-
-     return report
-
-
- if __name__ == "__main__":
-     generate_benchmark_report_cli()
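Taken together, the removed guidellm/main.py was the 0.1.0 CLI: it parsed the click options above, built a Backend and a RequestGenerator for the selected data type, drove an Executor, and saved/printed a GuidanceReport. Since the module also exported generate_benchmark_report, it could be called programmatically; the following is a minimal sketch against the 0.1.0 signature shown above, with illustrative argument values (the target URL, emulated-data config, and output path are examples, not values taken from this diff):

    from guidellm.main import generate_benchmark_report

    report = generate_benchmark_report(
        target="http://localhost:8000/v1",  # OpenAI-compatible server, e.g. vLLM
        backend="openai_server",
        model=None,  # use the first model the backend lists
        data="prompt_tokens=128,generated_tokens=128",  # emulated-data config
        data_type="emulated",
        tokenizer=None,  # fall back to the model's own tokenizer
        rate_type="sweep",  # sweep from synchronous up to max throughput
        rate=None,  # only used for constant/poisson rate types
        max_seconds=120,
        max_requests=None,
        output_path="guidance_report.json",
        cont_refresh_table=False,
    )

In 0.2.0.dev0 this module is deleted; judging from the files-changed list, its responsibilities appear to move into guidellm/__main__.py and the new guidellm/benchmark package.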
guidellm/request/base.py DELETED
@@ -1,194 +0,0 @@
- import contextlib
- import threading
- import time
- from abc import ABC, abstractmethod
- from queue import Empty, Full, Queue
- from typing import Iterator, Literal, Union
-
- from loguru import logger
- from transformers import (  # type: ignore # noqa: PGH003
-     AutoTokenizer,
-     PreTrainedTokenizer,
- )
-
- from guidellm.core.request import TextGenerationRequest
-
- __all__ = ["GenerationMode", "RequestGenerator"]
-
-
- GenerationMode = Literal["async", "sync"]
-
-
- class RequestGenerator(ABC):
-     """
-     A base class for request generators that generate result requests.
-
-     :param type_: The type of the request generator.
-     :type type_: str
-     :param source: The data source for the request generator.
-     :type source: str
-     :param tokenizer: The tokenizer instance or the name/config to use
-         for tokenizing prompts.
-     :type tokenizer: Union[str, PreTrainedTokenizer]
-     :param mode: The generation mode, either 'async' or 'sync'.
-     :type mode: GenerationMode
-     :param async_queue_size: The size of the request queue.
-     :type async_queue_size: int
-     """
-
-     def __init__(
-         self,
-         type_: str,
-         source: str,
-         tokenizer: Union[str, PreTrainedTokenizer],
-         mode: GenerationMode = "async",
-         async_queue_size: int = 50,
-     ):
-         self._type = type_
-         self._source = source
-         self._async_queue_size: int = async_queue_size
-         self._mode: str = mode
-         self._queue: Queue = Queue(maxsize=async_queue_size)
-         self._stop_event: threading.Event = threading.Event()
-
-         if not tokenizer:
-             err = "Tokenizer must be provided for request generation"
-             logger.error(err)
-             raise ValueError(err)
-
-         self._tokenizer = (
-             AutoTokenizer.from_pretrained(tokenizer)
-             if isinstance(tokenizer, str)
-             else tokenizer
-         )
-         logger.info("Tokenizer initialized for request generation: {}", self._tokenizer)
-
-         if self._mode == "async":
-             self._thread = threading.Thread(target=self._populate_queue, daemon=True)
-             self._thread.start()
-             logger.info(
-                 "RequestGenerator started in async mode with queue size: {}",
-                 self._async_queue_size,
-             )
-
-     def __repr__(self) -> str:
-         """
-         Return a string representation of the RequestGenerator.
-
-         :return: String representation of the RequestGenerator.
-         :rtype: str
-         """
-         return (
-             f"RequestGenerator("
-             f"mode={self._mode}, "
-             f"async_queue_size={self._async_queue_size}, "
-             f"tokenizer={self._tokenizer})"
-         )
-
-     def __iter__(self) -> Iterator[TextGenerationRequest]:
-         """
-         Provide an iterator interface to generate new requests.
-
-         :return: An iterator over result requests.
-         :rtype: Iterator[TextGenerationRequest]
-         """
-         if self.mode == "async":
-             while not self._stop_event.is_set():
-                 try:
-                     item = self._queue.get_nowait()
-                     self._queue.task_done()
-                     yield item
-                 except Empty:
-                     time.sleep(0.01)
-                     continue
-         else:
-             while not self._stop_event.is_set():
-                 yield self.create_item()
-
-     @property
-     def type_(self) -> str:
-         """
-         Get the type of the request generator.
-
-         :return: The type of the request generator.
-         :rtype: str
-         """
-         return self._type
-
-     @property
-     def source(self) -> str:
-         """
-         Get the data source for the request generator.
-
-         :return: The data source.
-         :rtype: str
-         """
-         return self._source
-
-     @property
-     def tokenizer(self) -> PreTrainedTokenizer:
-         """
-         Get the tokenizer instance.
-
-         :return: The tokenizer instance.
-         :rtype: PreTrainedTokenizer
-         """
-         return self._tokenizer
-
-     @property
-     def mode(self) -> str:
-         """
-         Get the generation mode.
-
-         :return: The generation mode.
-         :rtype: str
-         """
-         return self._mode
-
-     @property
-     def async_queue_size(self) -> int:
-         """
-         Get the size of the request queue.
-
-         :return: The size of the request queue.
-         :rtype: int
-         """
-         return self._async_queue_size
-
-     @abstractmethod
-     def create_item(self) -> TextGenerationRequest:
-         """
-         Abstract method to create a new result request item.
-
-         :return: A new result request.
-         :rtype: TextGenerationRequest
-         """
-
-     def stop(self):
-         """
-         Stop the background task that populates the queue.
-         """
-         logger.info("Stopping RequestGenerator...")
-         self._stop_event.set()
-         if self._mode == "async":
-             self._thread.join()
-         logger.info("RequestGenerator stopped")
-
-     def _populate_queue(self):
-         """
-         Populate the request queue in the background.
-         """
-
-         while not self._stop_event.is_set():
-             with contextlib.suppress(Full):
-                 if self._queue.qsize() < self._async_queue_size:
-                     item = self.create_item()
-                     self._queue.put(item, timeout=0.1)
-                     logger.debug(
-                         "Item added to queue. Current queue size: {}",
-                         self._queue.qsize(),
-                     )
-                 else:
-                     time.sleep(0.1)
-
-         logger.info("RequestGenerator stopped populating queue")
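The deleted RequestGenerator above is an ABC; the concrete generators removed in this same release (file, emulated, and transformers variants) only had to implement create_item(). The following minimal, hypothetical subclass sketches that contract: the FixedPromptRequestGenerator name is invented for illustration, and the TextGenerationRequest fields (prompt, prompt_token_count) are assumed from the 0.1.0 API, since the body of the removed guidellm/core/request.py is not shown in this diff:

    from guidellm.core.request import TextGenerationRequest
    from guidellm.request.base import RequestGenerator


    class FixedPromptRequestGenerator(RequestGenerator):
        """Illustrative generator that repeats one fixed prompt."""

        def __init__(self, prompt: str, tokenizer):
            self._prompt = prompt  # set before super().__init__ may start a thread
            super().__init__(
                type_="fixed_prompt",
                source="inline",
                tokenizer=tokenizer,  # model name/path or PreTrainedTokenizer
                mode="sync",  # skip the background queue thread in this sketch
            )

        def create_item(self) -> TextGenerationRequest:
            # Count prompt tokens by encoding the prompt with the configured
            # tokenizer, mirroring how the removed generators used it.
            token_count = len(self.tokenizer(self._prompt)["input_ids"])
            return TextGenerationRequest(
                prompt=self._prompt,
                prompt_token_count=token_count,
            )

Iterating such a generator in sync mode yields a fresh request per loop iteration until stop() sets the stop event; in async mode, the base class's daemon thread would instead pre-fill the bounded queue that __iter__ drains.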