PyPI - guidellm - Versions diffs - 0.4.0a21__py3-none-any.whl → 0.4.0a155__py3-none-any.whl - Mend

guidellm 0.4.0a21py3-none-any.whl → 0.4.0a155py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of guidellm might be problematic. Click here for more details.

Files changed (116) hide show

guidellm/__init__.py +5 -2
guidellm/__main__.py +451 -252
guidellm/backends/__init__.py +33 -0
guidellm/backends/backend.py +110 -0
guidellm/backends/openai.py +355 -0
guidellm/backends/response_handlers.py +455 -0
guidellm/benchmark/__init__.py +53 -39
guidellm/benchmark/benchmarker.py +148 -317
guidellm/benchmark/entrypoints.py +466 -128
guidellm/benchmark/output.py +517 -771
guidellm/benchmark/profile.py +580 -280
guidellm/benchmark/progress.py +568 -549
guidellm/benchmark/scenarios/__init__.py +40 -0
guidellm/benchmark/scenarios/chat.json +6 -0
guidellm/benchmark/scenarios/rag.json +6 -0
guidellm/benchmark/schemas.py +2085 -0
guidellm/data/__init__.py +28 -4
guidellm/data/collators.py +16 -0
guidellm/data/deserializers/__init__.py +53 -0
guidellm/data/deserializers/deserializer.py +109 -0
guidellm/data/deserializers/file.py +222 -0
guidellm/data/deserializers/huggingface.py +94 -0
guidellm/data/deserializers/memory.py +192 -0
guidellm/data/deserializers/synthetic.py +346 -0
guidellm/data/loaders.py +145 -0
guidellm/data/preprocessors/__init__.py +25 -0
guidellm/data/preprocessors/formatters.py +412 -0
guidellm/data/preprocessors/mappers.py +198 -0
guidellm/data/preprocessors/preprocessor.py +29 -0
guidellm/data/processor.py +30 -0
guidellm/data/schemas.py +13 -0
guidellm/data/utils/__init__.py +10 -0
guidellm/data/utils/dataset.py +94 -0
guidellm/data/utils/functions.py +18 -0
guidellm/extras/__init__.py +4 -0
guidellm/extras/audio.py +215 -0
guidellm/extras/vision.py +242 -0
guidellm/logger.py +2 -2
guidellm/mock_server/__init__.py +8 -0
guidellm/mock_server/config.py +84 -0
guidellm/mock_server/handlers/__init__.py +17 -0
guidellm/mock_server/handlers/chat_completions.py +280 -0
guidellm/mock_server/handlers/completions.py +280 -0
guidellm/mock_server/handlers/tokenizer.py +142 -0
guidellm/mock_server/models.py +510 -0
guidellm/mock_server/server.py +168 -0
guidellm/mock_server/utils.py +302 -0
guidellm/preprocess/dataset.py +23 -26
guidellm/presentation/builder.py +2 -2
guidellm/presentation/data_models.py +25 -21
guidellm/presentation/injector.py +2 -3
guidellm/scheduler/__init__.py +65 -26
guidellm/scheduler/constraints.py +1035 -0
guidellm/scheduler/environments.py +252 -0
guidellm/scheduler/scheduler.py +140 -368
guidellm/scheduler/schemas.py +272 -0
guidellm/scheduler/strategies.py +519 -0
guidellm/scheduler/worker.py +391 -420
guidellm/scheduler/worker_group.py +707 -0
guidellm/schemas/__init__.py +31 -0
guidellm/schemas/info.py +159 -0
guidellm/schemas/request.py +216 -0
guidellm/schemas/response.py +119 -0
guidellm/schemas/stats.py +228 -0
guidellm/{config.py → settings.py} +32 -21
guidellm/utils/__init__.py +95 -8
guidellm/utils/auto_importer.py +98 -0
guidellm/utils/cli.py +46 -2
guidellm/utils/console.py +183 -0
guidellm/utils/encoding.py +778 -0
guidellm/utils/functions.py +134 -0
guidellm/utils/hf_datasets.py +1 -2
guidellm/utils/hf_transformers.py +4 -4
guidellm/utils/imports.py +9 -0
guidellm/utils/messaging.py +1118 -0
guidellm/utils/mixins.py +115 -0
guidellm/utils/pydantic_utils.py +411 -0
guidellm/utils/random.py +3 -4
guidellm/utils/registry.py +220 -0
guidellm/utils/singleton.py +133 -0
guidellm/{objects → utils}/statistics.py +341 -247
guidellm/utils/synchronous.py +159 -0
guidellm/utils/text.py +163 -50
guidellm/utils/typing.py +41 -0
guidellm/version.py +1 -1
{guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
guidellm-0.4.0a155.dist-info/RECORD +96 -0
guidellm/backend/__init__.py +0 -23
guidellm/backend/backend.py +0 -259
guidellm/backend/openai.py +0 -705
guidellm/backend/response.py +0 -136
guidellm/benchmark/aggregator.py +0 -760
guidellm/benchmark/benchmark.py +0 -837
guidellm/benchmark/scenario.py +0 -104
guidellm/data/prideandprejudice.txt.gz +0 -0
guidellm/dataset/__init__.py +0 -22
guidellm/dataset/creator.py +0 -213
guidellm/dataset/entrypoints.py +0 -42
guidellm/dataset/file.py +0 -92
guidellm/dataset/hf_datasets.py +0 -62
guidellm/dataset/in_memory.py +0 -132
guidellm/dataset/synthetic.py +0 -287
guidellm/objects/__init__.py +0 -18
guidellm/objects/pydantic.py +0 -89
guidellm/request/__init__.py +0 -18
guidellm/request/loader.py +0 -284
guidellm/request/request.py +0 -79
guidellm/request/types.py +0 -10
guidellm/scheduler/queues.py +0 -25
guidellm/scheduler/result.py +0 -155
guidellm/scheduler/strategy.py +0 -495
guidellm-0.4.0a21.dist-info/RECORD +0 -62
{guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
{guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
{guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
{guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0

guidellm/scheduler/scheduler.py CHANGED Viewed

@@ -1,390 +1,162 @@
-import asyncio
-import math
-import time
-from collections.abc import AsyncGenerator, Iterable, Iterator
-from concurrent.futures import ProcessPoolExecutor
-from multiprocessing import Manager
-from threading import Event
-from typing import (
-    Any,
-    Generic,
-    Optional,
-    Union,
-)
+"""
+Thread-safe singleton scheduler for distributed benchmarking workload coordination.
+Orchestrates request processing across worker processes with distributed timing
+coordination, constraint enforcement, and result aggregation. Integrates with
+backends, environments, and strategies to enable scalable load testing across
+various scenarios including LLM inference benchmarking.
+"""
-from loguru import logger
+from __future__ import annotations
-from guidellm.config import settings
-from guidellm.request.types import (
+from collections.abc import AsyncIterator, Iterable
+from typing import Any, Generic
+from guidellm.scheduler.constraints import Constraint, ConstraintsInitializerFactory
+from guidellm.scheduler.environments import Environment, NonDistributedEnvironment
+from guidellm.scheduler.schemas import (
+    BackendInterface,
+    MultiTurnRequestT,
     RequestT,
     ResponseT,
+    SchedulerState,
 )
-from guidellm.scheduler.queues import MPQueues, Queue, QueueEmpty
-from guidellm.scheduler.result import (
-    SchedulerRequestResult,
-    SchedulerResult,
-    SchedulerRunInfo,
-    WorkerProcessRequest,
-    WorkerProcessResult,
-)
-from guidellm.scheduler.strategy import SchedulingStrategy
-from guidellm.scheduler.worker import (
-    RequestsWorker,
-)
+from guidellm.scheduler.strategies import SchedulingStrategy
+from guidellm.scheduler.worker_group import WorkerProcessGroup
+from guidellm.schemas import RequestInfo
+from guidellm.utils.singleton import ThreadSafeSingletonMixin
 __all__ = ["Scheduler"]
-class Scheduler(Generic[RequestT, ResponseT]):
+class Scheduler(
+    Generic[RequestT, ResponseT],
+    ThreadSafeSingletonMixin,
+):
     """
-    A class that handles the scheduling of requests to a worker.
-    This class is responsible for managing the lifecycle of the requests,
-    including their creation, queuing, and processing.
-    It uses a multiprocessing approach to handle requests concurrently
-    and efficiently, based on the specified scheduling strategy.
-    The Scheduler class is designed to work with a RequestsWorker,
-    which is an abstract base class that defines the interface for a worker
-    that can resolve requests asynchronously or synchronously.
-    The Scheduler class also supports different scheduling strategies,
-    including synchronous, throughput, and concurrent strategies.
-    :param worker: The worker that will process the requests.
-        This should be an instance of RequestsWorker.
-    :param request_loader: An iterable that generates requests.
-        This can be a list, generator, or any other iterable.
-        The requests will be processed by the worker.
+    Thread-safe singleton scheduler for distributed benchmarking workload coordination.
+    Orchestrates request processing across worker processes with distributed timing
+    coordination, constraint enforcement, and result aggregation. Abstracts the
+    complexity of multi-process coordination, environment synchronization, and
+    resource management while providing a unified interface for executing benchmarking
+    operations. Implements singleton pattern to ensure consistent execution state.
+    Example:
+    ::
+        from guidellm.scheduler import Scheduler
+        from guidellm.scheduler import NonDistributedEnvironment, SynchronousStrategy
+        scheduler = Scheduler()
+        async for response, request, info, state in scheduler.run(
+            requests=request_list,
+            backend=backend,
+            strategy=SynchronousStrategy(),
+            env=NonDistributedEnvironment(),
+            max_requests=1000
+        ):
+            print(f"Processed: {request}")
     """
-    def __init__(
-        self,
-        worker: RequestsWorker[RequestT, ResponseT],
-        request_loader: Iterable[RequestT],
-    ):
-        if not isinstance(worker, RequestsWorker):
-            raise ValueError(f"Invalid worker: {worker}")
-        if not isinstance(request_loader, Iterable):
-            raise ValueError(f"Invalid request_loader: {request_loader}")
-        self.worker = worker
-        self.request_loader = request_loader
     async def run(
         self,
-        scheduling_strategy: SchedulingStrategy,
-        max_number: Optional[int] = None,
-        max_duration: Optional[float] = None,
-    ) -> AsyncGenerator[
-        Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None
+        requests: Iterable[RequestT | MultiTurnRequestT[RequestT]],
+        backend: BackendInterface[RequestT, ResponseT],
+        strategy: SchedulingStrategy,
+        startup_duration: float,
+        env: Environment[RequestT, ResponseT] | None,
+        **constraints: Any | dict[str, Any] | Constraint,
+    ) -> AsyncIterator[
+        tuple[
+            ResponseT | None,
+            RequestT,
+            RequestInfo,
+            SchedulerState,
+        ]
     ]:
         """
-        The main method that runs the scheduler.
-        This method is a generator that yields SchedulerResult objects
-        at the start and end of the run, as well as at the start and end
-        of each request.
-        It uses multiprocessing to handle requests concurrently
-        and efficiently, based on the specified scheduling strategy.
-        The method also handles the lifecycle of the requests,
-        including their creation, queuing, and processing.
-        The method is designed to be used as an asynchronous generator,
-        allowing it to be used with asyncio and other asynchronous frameworks.
-        :param scheduling_strategy: The scheduling strategy to use.
-            Specifies the times at which requests will be sent as well how many
-            worker processes are used and if requests are scheduled sync or async.
-            This can be one of the following:
-            - "synchronous": Requests are sent synchronously.
-            - "throughput": Requests are sent at the maximum rate possible.
-            - An instance of SchedulingStrategy.
-        :param max_number: The maximum number of requests to process.
-            If None, then no limit is set and either the iterator must be exhaustible
-            or the max_duration must be set.
-        :param max_duration: The maximum duration for the scheduling run.
-            If None, then no limit is set and either the iterator must be exhaustible
-            or the max_number must be set.
-        :return: An asynchronous generator that yields SchedulerResult objects.
-            Each SchedulerResult object contains information about the request,
-            the response, and the run information.
+        Execute distributed request processing with coordinated timing and constraints.
+        Orchestrates the complete benchmarking workflow across worker processes with
+        environment synchronization, constraint enforcement, and error handling. Manages
+        resource lifecycle from initialization through cleanup while yielding real-time
+        processing updates for monitoring and aggregation.
+        :param requests: Request collection to process, supporting single requests or
+            multi-turn sequences with optional inter-request delays
+        :param backend: Backend interface for request processing and response generation
+        :param strategy: Scheduling strategy controlling request timing and distribution
+        :param startup_duration: Duration in seconds for requests to ramp up
+        :param env: Environment interface for distributed coordination and
+            synchronization. Defaults to NonDistributedEnvironment if None
+        :param constraints: Runtime constraints for execution control (max_requests,
+            max_duration, max_error_rate, etc.) as primitives, dictionaries, or
+            constraint instances
+        :yields: Request updates as (response, request, request_info, scheduler_state)
+            tuples. Each request generates three ordered updates: queued, in_progress,
+            completed | errored | cancelled
+        :raises Exception: Worker process errors, environment synchronization failures,
+            or constraint evaluation errors are propagated after cleanup
         """
-        if scheduling_strategy is None or not isinstance(
-            scheduling_strategy, SchedulingStrategy
-        ):
-            raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}")
-        if max_number is not None and max_number < 1:
-            raise ValueError(f"Invalid max_number: {max_number}")
+        with self.thread_lock:
+            if env is None:
+                env = NonDistributedEnvironment[RequestT, ResponseT]()
-        if max_duration is not None and max_duration < 0:
-            raise ValueError(f"Invalid max_duration: {max_duration}")
-        with (
-            Manager() as manager,
-            ProcessPoolExecutor(
-                max_workers=scheduling_strategy.processes_limit
-            ) as executor,
-        ):
-            requests_iter: Optional[Iterator[Any]] = None
-            scheduling_strategy.start_time = (
-                time.time() + settings.scheduler_start_delay
-            )  # Add a small delay to allow processes to start
-            futures, queues, stop_event = await self._start_processes(
-                manager, executor, scheduling_strategy
-            )
-            run_info, requests_iter, times_iter = self._run_setup(
-                futures, scheduling_strategy, max_number, max_duration
-            )
-            # Add some initial requests to the queue
-            requests_iter = self._add_requests(
-                requests_iter,
-                queues.requests,
-                times_iter,
-                run_info,
-            )
-            # Wait for the test to start
-            await asyncio.sleep(time.time() - scheduling_strategy.start_time)
-            yield SchedulerResult(
-                type_="run_start",
-                run_info=run_info,
-            )
+            worker_group: WorkerProcessGroup[RequestT, ResponseT] | None = None
+            # Any issues during the run will raise an error (local or remote),
+            # be caught and passed to the environment,
+            # and will ensure clean up before raising the error.
             try:
-                while True:
-                    # check errors and raise them
-                    for future in futures:
-                        if future.done() and (err := future.exception()) is not None:
-                            raise err
-                    if (
-                        requests_iter is None
-                        and run_info.processing_requests <= 0
-                        and (  # Ensure we have met one of the end conditions
-                            time.time() >= run_info.end_time
-                            or run_info.completed_requests >= run_info.end_number
-                        )
-                    ):
-                        # we've exhausted all requests we've wanted to run
-                        # and yielded all responses
-                        break
-                    requests_iter = self._add_requests(
-                        requests_iter,
-                        queues.requests,
-                        times_iter,
-                        run_info,
-                    )
-                    await asyncio.sleep(0)  # enable requests to start
-                    iter_result = self._check_result_ready(
-                        queues.responses,
-                        run_info,
-                    )
-                    if iter_result is not None:
-                        yield iter_result
-                    # yield control to the event loop
-                    await asyncio.sleep(settings.default_async_loop_sleep)
-            except Exception as err:
-                raise RuntimeError(f"Scheduler run failed: {err}") from err
-            yield SchedulerResult(
-                type_="run_complete",
-                run_info=run_info,
-            )
-            await self._stop_processes(futures, stop_event)
-    async def _start_processes(
-        self,
-        manager,
-        executor: ProcessPoolExecutor,
-        scheduling_strategy: SchedulingStrategy,
-    ) -> tuple[
-        list[asyncio.Future],
-        MPQueues[RequestT, ResponseT],
-        Event,
-    ]:
-        await self.worker.prepare_multiprocessing()
-        queues: MPQueues[RequestT, ResponseT] = MPQueues(
-            requests=manager.Queue(
-                maxsize=scheduling_strategy.processing_requests_limit
-            ),
-            responses=manager.Queue(),
-        )
-        stop_event = manager.Event()
-        num_processes = min(
-            scheduling_strategy.processes_limit,
-            scheduling_strategy.processing_requests_limit,
-        )
-        requests_limit_split = (
-            scheduling_strategy.processing_requests_limit
-            // scheduling_strategy.processes_limit
-        )
-        requests_limit_remain = (
-            scheduling_strategy.processing_requests_limit
-            % scheduling_strategy.processes_limit
-        )
-        process_ids = (id_ for id_ in range(num_processes))
-        process_requests_limits = (
-            requests_limit_split + 1
-            if i < requests_limit_remain
-            else requests_limit_split
-            for i in range(num_processes)
-        )
-        futures = []
-        loop = asyncio.get_event_loop()
-        for id_, requests_limit in zip(process_ids, process_requests_limits):
-            futures.append(
-                loop.run_in_executor(
-                    executor,
-                    self.worker.process_loop_asynchronous,
-                    queues,
-                    scheduling_strategy,
-                    stop_event,
-                    requests_limit,
-                    id_,
-                    num_processes,
+                # Setup local run parameters, sync with the environment
+                resolved_constraints = (
+                    ConstraintsInitializerFactory.resolve_constraints(constraints)
                 )
-            )
-        await asyncio.sleep(0.1)  # give time for processes to start
-        return futures, queues, stop_event
-    def _run_setup(
-        self,
-        processes: list[asyncio.Future],
-        scheduling_strategy: SchedulingStrategy,
-        max_number: Optional[int],
-        max_duration: Optional[float],
-    ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]:
-        requests_iter = iter(self.request_loader)
-        times_iter = iter(scheduling_strategy.request_times())
-        end_time = scheduling_strategy.start_time + (max_duration or math.inf)
-        end_number = max_number or math.inf
-        try:
-            # update end number if the request loader is finite and less than max
-            iter_length = len(self.request_loader)  # type: ignore[arg-type]
-            if 0 < iter_length < end_number:
-                end_number = iter_length
-        except Exception:  # noqa: BLE001, S110
-            pass
-        if end_number == math.inf and end_time is None:
-            logger.warning(
-                "No end number or end time set, "
-                "scheduler will run indefinitely until the request loader is exhausted."
-            )
-        info = SchedulerRunInfo(
-            start_time=scheduling_strategy.start_time,
-            end_time=end_time,
-            end_number=end_number,
-            processes=len(processes),
-            strategy=scheduling_strategy,
-        )
-        return info, requests_iter, times_iter
-    def _add_requests(
-        self,
-        requests_iter: Optional[Iterator[Any]],
-        requests_queue: Queue[WorkerProcessRequest[RequestT, ResponseT]],
-        times_iter: Iterator[float],
-        run_info: SchedulerRunInfo,
-    ) -> Optional[Iterator[Any]]:
-        if requests_iter is not None:
-            try:
-                added_count = 0
-                while not requests_queue.full() and added_count < (
-                    run_info.strategy.queued_requests_limit
-                    or settings.min_queued_requests
-                ):
-                    if run_info.created_requests >= run_info.end_number:
-                        raise StopIteration
-                    if (
-                        next(times_iter) >= run_info.end_time
-                        or time.time() >= run_info.end_time
-                    ):
-                        raise StopIteration
-                    work_req = WorkerProcessRequest[RequestT, ResponseT](
-                        request=next(requests_iter),
-                        timeout_time=run_info.end_time,
-                        queued_time=time.time(),
+                (
+                    local_requests,
+                    local_strategy,
+                    local_constraints,
+                ) = await env.sync_run_params(requests, strategy, resolved_constraints)
+                # Setup the worker group, sync start with the environment
+                worker_group = WorkerProcessGroup[RequestT, ResponseT](
+                    requests=local_requests,
+                    backend=backend,
+                    strategy=local_strategy,
+                    startup_duration=startup_duration,
+                    **local_constraints,
+                )
+                await worker_group.create_processes()
+                local_start_time = await env.sync_run_start()
+                await worker_group.start(local_start_time)
+                # Yield any updates and sync with the environment for non-local updates
+                async for (
+                    response,
+                    request,
+                    request_info,
+                    state,
+                ) in worker_group.request_updates():
+                    await env.update_run_iteration(
+                        response, request, request_info, state
                     )
-                    requests_queue.put(work_req)
-                    run_info.created_requests += 1
-                    run_info.queued_requests += 1
-                    added_count += 1
-            except StopIteration:
-                # we've reached the limit number, limit time, or exhausted the requests
-                # set to None to stop adding more and tell the loop no more requests
-                requests_iter = None
-        return requests_iter
-    def _check_result_ready(
-        self,
-        responses_queue: Queue[WorkerProcessResult[RequestT, ResponseT]],
-        run_info: SchedulerRunInfo,
-    ) -> Optional[SchedulerRequestResult[RequestT, ResponseT]]:
-        try:
-            process_response: WorkerProcessResult[RequestT, ResponseT] = (
-                responses_queue.get_nowait()
-            )
-        except QueueEmpty:
-            return None
-        if process_response.type_ == "request_scheduled":
-            run_info.queued_requests -= 1
-            run_info.scheduled_requests += 1
-            return SchedulerRequestResult(
-                type_="request_scheduled",
-                run_info=run_info,
-                request=process_response.request,
-                request_info=process_response.info,
-                response=None,
-            )
-        if process_response.type_ == "request_start":
-            run_info.scheduled_requests -= 1
-            run_info.processing_requests += 1
-            return SchedulerRequestResult(
-                type_="request_start",
-                run_info=run_info,
-                request=process_response.request,
-                request_info=process_response.info,
-                response=None,
-            )
-        if process_response.type_ == "request_complete":
-            run_info.processing_requests -= 1
-            run_info.completed_requests += 1
-            return SchedulerRequestResult(
-                type_="request_complete",
-                run_info=run_info,
-                request=process_response.request,
-                request_info=process_response.info,
-                response=process_response.response,
-            )
-        raise ValueError(f"Invalid process response type: {process_response}")
-    async def _stop_processes(
-        self,
-        futures: list[asyncio.Future],
-        stop_event: Event,
-    ):
-        # stop all processes
-        stop_event.set()
-        await asyncio.gather(*futures)
+                    yield response, request, request_info, state
+            except Exception as err:  # noqa: BLE001
+                await env.sync_run_error(err)
+                raise err
+            finally:
+                # Ensure all worker processes are cleaned up for error or completion
+                if worker_group is not None:
+                    err = await worker_group.shutdown()  # type: ignore[misc]
+                    if err is not None:
+                        await env.sync_run_error(err)
+            # Ensure any errors are raised and all responses
+            # are yielded for aggregation on the primary node
+            async for (
+                dist_response,
+                dist_request,
+                dist_request_info,
+                dist_state,
+            ) in env.sync_run_end():
+                yield dist_response, dist_request, dist_request_info, dist_state

guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a155__py3-none-any.whl

Potentially problematic release.

guidellm 0.4.0a21py3-none-any.whl → 0.4.0a155py3-none-any.whl