guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
@@ -1,472 +1,466 @@
- import asyncio
- import math
- import time
- from abc import ABC, abstractmethod
- from collections.abc import AsyncGenerator
- from dataclasses import dataclass
- from itertools import islice
- from threading import Event
- from typing import (
- Any,
- Generic,
- Literal,
- Optional,
- Union,
- )
+ """
+ Worker process implementation for distributed request execution and coordination.

- from loguru import logger
- from pydantic import Field
+ Manages individual worker processes within the scheduler system, handling request
+ lifecycle from queue consumption through backend processing and status publication.
+ Workers coordinate with other processes through barriers and events, apply timing
+ strategies for request scheduling, maintain concurrency limits, and publish real-time
+ status updates throughout request processing.
+ """

- from guidellm.backend import (
- Backend,
- BackendType,
- RequestArgs,
- ResponseSummary,
- StreamingTextResponse,
- )
- from guidellm.objects import StandardBaseModel
- from guidellm.request import GenerationRequest
- from guidellm.request.types import RequestT, ResponseT
- from guidellm.scheduler.queues import MPQueues, Queue, QueueEmpty
- from guidellm.scheduler.result import (
- SchedulerRequestInfo,
- WorkerProcessRequest,
- WorkerProcessResult,
- )
- from guidellm.scheduler.strategy import SchedulingStrategy
+ from __future__ import annotations

- __all__ = [
- "GenerativeRequestsWorker",
- "GenerativeRequestsWorkerDescription",
- "RequestsWorker",
- "ResolveStatus",
- "WorkerDescription",
- ]
+ import asyncio
+ import time
+ import traceback
+ from multiprocessing.synchronize import Barrier as ProcessingBarrier
+ from multiprocessing.synchronize import Event as ProcessingEvent
+ from typing import Annotated, Generic, Literal

+ try:
+ import uvloop

- @dataclass
- class ResolveStatus:
- requested: bool
- completed: bool
- errored: bool
- canceled: bool
+ HAS_UVLOOP: Annotated[
+ bool, "Flag indicating uvloop availability for event loop optimization"
+ ] = True
+ except ImportError:
+ uvloop = None # type: ignore[assignment] # Optional dependency

- request_start: float
- request_end: float
+ HAS_UVLOOP = False


- class WorkerDescription(StandardBaseModel):
- type_: Literal["worker"] = "worker"
+ from guidellm.scheduler.schemas import (
+ BackendInterface,
+ MultiTurnRequestT,
+ RequestT,
+ ResponseT,
+ )
+ from guidellm.scheduler.strategies import SchedulingStrategy
+ from guidellm.schemas import RequestInfo
+ from guidellm.utils import (
+ InterProcessMessaging,
+ wait_for_sync_barrier,
+ wait_for_sync_event,
+ wait_for_sync_objects,
+ )

+ __all__ = ["WorkerProcess"]

- class RequestsWorker(ABC, Generic[RequestT, ResponseT]):
+
+ class WorkerProcess(Generic[RequestT, ResponseT]):
  """
- An abstract base class for a worker that processes requests.
- This class defines the interface for a worker that can resolve requests
- asynchronously or synchronously within the Scheduler class.
- Subclasses must implement the `resolve` method,
- which takes a request directly given from the load generator,
- along with the desired start_time for the request and a timeout_time.
- The `resolve` method should return the response from the backend.
+ Worker process for distributed request execution in the scheduler system.
+
+ Manages complete request lifecycle including queue consumption, backend processing,
+ timing strategy application, and status publication. Coordinates with other workers
+ through synchronization primitives while maintaining concurrency limits and handling
+ graceful shutdown scenarios including errors and cancellations.
+
+ Example:
+ ::
+ worker = WorkerProcess(
+ worker_index=0,
+ messaging=messaging_interface,
+ backend=backend_instance,
+ strategy=timing_strategy,
+ async_limit=10,
+ fut_scheduling_time_limit=5.0,
+ startup_barrier=barrier,
+ requests_generated_event=generated_event,
+ constraint_reached_event=constraint_event,
+ shutdown_event=shutdown,
+ error_event=error,
+ )
+ worker.run()
  """

- @property
- @abstractmethod
- def description(self) -> WorkerDescription:
+ def __init__(
+ self,
+ worker_index: int,
+ messaging: InterProcessMessaging[
+ tuple[
+ ResponseT | None,
+ RequestT | MultiTurnRequestT[RequestT],
+ RequestInfo,
+ ],
+ tuple[
+ RequestT | MultiTurnRequestT[RequestT],
+ RequestInfo,
+ ],
+ ],
+ backend: BackendInterface[RequestT, ResponseT],
+ strategy: SchedulingStrategy,
+ async_limit: int,
+ fut_scheduling_time_limit: float,
+ startup_barrier: ProcessingBarrier,
+ requests_generated_event: ProcessingEvent,
+ constraint_reached_event: ProcessingEvent,
+ shutdown_event: ProcessingEvent,
+ error_event: ProcessingEvent,
+ ):
  """
- An abstract property that must be implemented by subclasses.
- This property should return a Serializable class representing the information
- about the worker instance.
+ Initialize worker process instance.
+
+ :param worker_index: Unique identifier for this worker within the process group
+ :param messaging: Inter-process messaging interface for request coordination
+ :param backend: Backend interface for processing requests
+ :param strategy: Scheduling strategy for determining request timing
+ :param async_limit: Maximum concurrent requests this worker can process
+ :param fut_scheduling_time_limit: Maximum time in seconds to schedule requests
+ into the future
+ :param startup_barrier: Synchronization barrier for coordinated startup
+ :param requests_generated_event: Event signaling request generation completion
+ :param constraint_reached_event: Event signaling processing constraint reached
+ :param shutdown_event: Event signaling graceful shutdown request
+ :param error_event: Event signaling error conditions across processes
  """
- ...
+ self.worker_index = worker_index
+ self.messaging = messaging
+ self.backend = backend
+ self.strategy = strategy
+ self.async_limit = async_limit
+ self.fut_scheduling_time_limit = fut_scheduling_time_limit
+ self.startup_barrier = startup_barrier
+ self.requests_generated_event = requests_generated_event
+ self.constraint_reached_event = constraint_reached_event
+ self.shutdown_event = shutdown_event
+ self.error_event = error_event
+
+ # Internal states
+ self.startup_completed = False
+ self.backend_started = False
+ self.messaging_started = False
+
+ def run(self):
+ """
+ Main entry point for worker process execution.
+
+ Initializes asyncio event loop with optional uvloop optimization and executes
+ worker async operations. Handles event loop cleanup and error propagation.

- @abstractmethod
- async def prepare_multiprocessing(self):
+ :raises RuntimeError: If worker encounters unrecoverable error during execution
  """
- An abstract method that must be implemented by subclasses.
- This is useful for workers that have instance state that can not
- be shared across processes and should be cleared out and re-initialized
- for each new process.
+ try:
+ if HAS_UVLOOP:
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+ asyncio.run(self.run_async())
+ except Exception as err:
+ self.error_event.set()
+ raise RuntimeError(
+ f"Worker process {self.messaging.worker_index} encountered an "
+ f"error: {err}"
+ ) from err
+
+ async def run_async(self):
  """
- ...
+ Execute main asynchronous worker process logic.

- @abstractmethod
- async def resolve(
- self,
- request: RequestT,
- timeout_time: float,
- ) -> tuple[ResolveStatus, ResponseT]:
- """
- An abstract method that must be implemented by subclasses.
- This method should handle the resolution of a request through asyncio,
- including any necessary backend processing and response handling.
-
- :param request: The request to be resolved generated by the load generator.
- :param timeout_time: The timeout time for the request, if there is no timeout
- given, then this will be math.inf.
- :return: The response from the worker.
+ Orchestrates concurrent execution of request processing and shutdown monitoring.
+ Handles task cleanup, error propagation, and cancellation coordination when any
+ task completes or encounters an error.
+
+ :raises RuntimeError: If worker tasks encounter unrecoverable errors
+ :raises asyncio.CancelledError: If worker process was cancelled
  """
- ...
+ stop_task = asyncio.create_task(self._stop_monitor())
+ request_proc_task = asyncio.create_task(self._process_requests())
+ caller_cancelled = False

- async def send_result(
- self,
- results_queue: Queue[WorkerProcessResult[RequestT, ResponseT]],
- result: WorkerProcessResult[RequestT, ResponseT],
- ):
- await asyncio.to_thread(results_queue.put, result) # type: ignore[attr-defined]
+ try:
+ await asyncio.wait(
+ [stop_task, request_proc_task],
+ return_when=asyncio.FIRST_COMPLETED,
+ )
+ except asyncio.CancelledError:
+ caller_cancelled = True

- async def resolve_scheduler_request(
- self,
- process_request: WorkerProcessRequest[RequestT, ResponseT],
- dequeued_time: float,
- start_time: float,
- results_queue: Queue[WorkerProcessResult[RequestT, ResponseT]],
- process_id: int,
- ):
- request = process_request.request
- timeout_time = process_request.timeout_time
- queued_time = process_request.queued_time
-
- info = SchedulerRequestInfo(
- targeted_start_time=start_time,
- queued_time=queued_time,
- dequeued_time=dequeued_time,
- scheduled_time=time.time(),
- process_id=process_id,
- )
- result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult(
- type_="request_scheduled",
- request=request,
- response=None,
- info=info,
- )
- asyncio.create_task(self.send_result(results_queue, result))
+ stop_task.cancel()
+ request_proc_task.cancel()

- if (wait_time := start_time - time.time()) > 0:
- await asyncio.sleep(wait_time)
+ try:
+ # Ensure all child tasks cancel correctly
+ await asyncio.wait(
+ [stop_task, request_proc_task], return_when=asyncio.ALL_COMPLETED
+ )
+ except asyncio.CancelledError:
+ caller_cancelled = True
+
+ if (
+ task_err := (
+ request_proc_task.exception()
+ if not request_proc_task.cancelled()
+ else stop_task.exception()
+ if not stop_task.cancelled()
+ else None
+ )
+ ) is not None:
+ raise RuntimeError(
+ f"Worker process {self.messaging.worker_index} encountered an "
+ f"error: {task_err}"
+ ) from task_err

- info.worker_start = time.time()
- result = WorkerProcessResult(
- type_="request_start",
- request=request,
- response=None,
- info=info,
- )
- asyncio.create_task(self.send_result(results_queue, result))
-
- status, response = await self.resolve(request, timeout_time)
- info.worker_end = time.time()
- info.requested = status.requested
- info.completed = status.completed
- info.errored = status.errored
- info.canceled = status.canceled
- info.request_start = status.request_start
- info.request_end = status.request_end
- result = WorkerProcessResult(
- type_="request_complete",
- request=request,
- response=response,
- info=info,
- )
- asyncio.create_task(self.send_result(results_queue, result))
+ if caller_cancelled:
+ raise asyncio.CancelledError("Worker process was cancelled")

- def process_loop_asynchronous(
+ async def _stop_monitor(
  self,
- queues: MPQueues[RequestT, ResponseT],
- strategy: SchedulingStrategy,
- stop_event: Event,
- max_concurrency: int,
- process_id: int,
- num_processes: int,
- ):
- async def _process_runner():
- lock = asyncio.Semaphore(max_concurrency)
- times_iter = islice(
- strategy.request_times(),
- process_id,
- None,
- num_processes,
+ ) -> None:
+ """
+ Monitor shutdown and error events for worker termination.
+ :raises RuntimeError if the work process received an error signal.
+ """
+ exit_key = await wait_for_sync_objects(
+ {
+ "error_event": self.error_event,
+ "shutdown_event": self.shutdown_event,
+ },
+ poll_interval=self.messaging.poll_interval,
+ )
+
+ if exit_key == "error_event":
+ raise RuntimeError(
+ f"Worker process {self.messaging.worker_index} received error signal."
  )

- start_time = None
- while not stop_event.is_set():
- if start_time is None:
- start_time = next(times_iter)
-
- # Yield control to the event loop. Sleep if we are way ahead
- await asyncio.sleep(start_time - time.time() - 1)
- await lock.acquire()
-
- try:
- process_request = queues.requests.get_nowait()
- dequeued_time = time.time()
- except QueueEmpty:
- lock.release()
- continue
-
- def _request_callback(
- _: asyncio.Future[WorkerProcessRequest[RequestT, ResponseT]],
- ):
- nonlocal lock
- lock.release()
-
- task = asyncio.create_task(
- self.resolve_scheduler_request(
- process_request=process_request,
- dequeued_time=dequeued_time,
- start_time=start_time,
- results_queue=queues.responses,
- process_id=process_id,
- )
- )
- task.add_done_callback(_request_callback)
- start_time = None
+ async def _process_requests(self):
+ """
+ Manage request processing lifecycle from startup to shutdown.

+ Coordinates startup synchronization, processes requests until constraints are
+ reached, then cancels pending requests until shutdown or error occurs.
+ """
  try:
- asyncio.run(_process_runner())
- except Exception as exc: # noqa: BLE001
- logger.error(
- f"Error in worker process {process_id}: {exc}",
- exc_info=True,
- stack_info=True,
+ # 1. Start up synchronization (backend, messaging, and other processes)
+ # 2. Messaging startup, receive requests until requests_generated event
+ await self._processing_startup()
+
+ # 3. Run process requests loop until constraint_reached event
+ processing_task = asyncio.create_task(self._process_requests_loop())
+ await wait_for_sync_event(
+ self.constraint_reached_event,
+ poll_interval=self.messaging.poll_interval,
  )
+ processing_task.cancel()
+
+ # 4. Cancel pending requests until proc canceled (manual, shutdown, error)
+ await self._cancel_requests_loop()
+ finally:
+ # 5. On cancel, shut down event, error event, or internal error:
+ # attempt to shut down this worker cleanly (stop backend and messaging)
+ await self._processing_shutdown()
+
+ async def _processing_startup(self):
+ """Initialize backend, messaging, and synchronize with other workers."""
+ # Get backend ready
+ await self.backend.process_startup()
+ self.backend_started = True
+ await self.backend.validate()
+
+ # Get messaging system ready
+ await self.messaging.start(
+ receive_stop_criteria=[self.requests_generated_event]
+ )
+ self.messaging_started = True

+ # Wait for all processes to be ready
+ await wait_for_sync_barrier(
+ self.startup_barrier,
+ poll_interval=self.messaging.poll_interval,
+ )

- class GenerativeRequestsWorkerDescription(WorkerDescription):
- type_: Literal["generative_requests_worker"] = "generative_requests_worker" # type: ignore[assignment]
- backend_type: BackendType
- backend_target: str
- backend_model: str
- backend_info: dict[str, Any] = Field(
- default_factory=dict,
- )
+ self.startup_completed = True

+ async def _processing_shutdown(self):
+ if self.backend_started:
+ await self.backend.process_shutdown()
+ self.backend_started = False

- class GenerativeRequestsWorker(RequestsWorker[GenerationRequest, ResponseSummary]):
- """
- A class that handles the execution of requests using a backend.
- This class is responsible for sending requests to the backend,
- handling responses, and managing errors.
+ if self.messaging_started:
+ await self.messaging.stop()
+ self.messaging_started = False

- :param backend: The backend to use for handling requests.
- This should be an instance of Backend such as an OpenAIHTTPBackend.
- """
+ self.startup_completed = False

- def __init__(self, backend: Backend):
- self.backend = backend
-
- @property
- def description(self) -> GenerativeRequestsWorkerDescription:
- """
- Get the description of the worker.
- :return: The description of the worker.
+ async def _process_requests_loop(self):
  """
- return GenerativeRequestsWorkerDescription(
- backend_type=self.backend.type_,
- backend_target=self.backend.target,
- backend_model=self.backend.model or "None",
- backend_info=self.backend.info,
- )
+ Process requests continuously until cancelled with concurrency limits.

- async def prepare_multiprocessing(self):
+ Schedules and processes requests according to the timing strategy while
+ maintaining the configured concurrency limit through semaphore coordination.
  """
- Prepare the worker for multiprocessing.
- This is useful for workers that have instance state that can not
- be shared across processes and should be cleared out and re-initialized
- for each new process.
- """
- await self.backend.prepare_multiprocessing()
+ try:
+ # Run request processing
+ async_semaphore = asyncio.Semaphore(self.async_limit)
+ pending_tasks: set[asyncio.Task] = set()
+
+ def _task_done(task):
+ pending_tasks.discard(task)
+ async_semaphore.release()
+
+ if not task.cancelled() and (exception := task.exception()):
+ raise exception
+
+ # Main loop; loop until canceled
+ while True:
+ await async_semaphore.acquire()
+ request_time = await self.strategy.next_request_time(
+ worker_index=self.worker_index
+ )

- def process_loop_asynchronous(
- self,
- queues: MPQueues[GenerationRequest, ResponseSummary],
- strategy: SchedulingStrategy,
- stop_event: Event,
- max_concurrency: int,
- process_id: int,
- num_processes: int,
- ):
- asyncio.run(self.backend.validate())
- super().process_loop_asynchronous(
- queues=queues,
- strategy=strategy,
- stop_event=stop_event,
- max_concurrency=max_concurrency,
- process_id=process_id,
- num_processes=num_processes,
- )
+ if (
+ time_until := request_time - time.time()
+ ) >= self.fut_scheduling_time_limit:
+ await asyncio.sleep(time_until - self.fut_scheduling_time_limit)

- async def resolve(
- self,
- request: GenerationRequest,
- timeout_time: float,
- ) -> tuple[ResolveStatus, ResponseSummary]:
- """
- Resolve a request by sending it to the backend and handling the response.
- This method sends the request to the backend, waits for a response,
- and handles any errors that may occur during the process.
-
- :param request: The request to resolve.
- :param timeout_time: The time to wait for a response before timing out.
- If timeout_time is math.inf, the request will not timeout.
- :return: A ResponseSummary object containing the response from the backend.
- If an error occurs, the ResponseSummary will contain the error message.
+ request_task = asyncio.create_task(
+ self._process_next_request(target_start=request_time)
+ )
+ pending_tasks.add(request_task)
+ request_task.add_done_callback(_task_done)
+ except asyncio.CancelledError as err:
+ for task in pending_tasks:
+ task.cancel()
+ await asyncio.gather(*pending_tasks, return_exceptions=True)
+
+ raise err
+
+ async def _cancel_requests_loop(self):
+ """Cancel all remaining queued requests until worker process terminates."""
+ while True:
+ try:
+ request: RequestT | MultiTurnRequestT[RequestT]
+ request_info: RequestInfo
+ request, request_info = await self.messaging.get(
+ timeout=self.messaging.poll_interval
+ )
+ except asyncio.TimeoutError:
+ continue
+
+ request_info.scheduler_node_id = self.messaging.worker_index or -1
+ request_info.error = "Request was cancelled"
+ request_info.timings.resolve_end = time.time()
+ self._send_update("cancelled", None, request, request_info)
+
+ async def _process_next_request(self, target_start: float):
  """
- resolve_start_time = time.time()
- response = None
- error: Optional[str] = None
- status = ResolveStatus(
- requested=False,
- completed=False,
- errored=False,
- canceled=False,
- request_start=-1,
- request_end=-1,
- )
+ Process a single request from queue to completion.

- try:
- if timeout_time < time.time():
- raise asyncio.TimeoutError(
- "The timeout time has already passed."
- ) # exit early
-
- status.requested = True
- request_func, request_kwargs = self._create_request_func_kwargs(request)
-
- async def _runner():
- # wrap function so we can enforce timeout and
- # still return the latest state from the backend
- async for resp in request_func(**request_kwargs): # type: ignore[operator]
- nonlocal response
- response = resp
-
- await asyncio.wait_for(
- _runner(),
- timeout=timeout_time - time.time() if timeout_time < math.inf else None,
- )
+ Retrieves request from messaging queue, applies timing strategy, processes
+ through backend, and publishes status updates throughout the lifecycle.

- if not response:
- raise ValueError(
- f"No response received for request: {request} "
- f"and backend: {self.backend}"
- )
- if not isinstance(response, ResponseSummary):
- raise ValueError(
- f"Received no ResponseSummary for request: {request} "
- f"and backend: {self.backend}, received: {response}"
- )
+ :param target_start: Unix timestamp when request should begin processing
+ """
+ request: RequestT | MultiTurnRequestT[RequestT] | None = None
+ request_info: RequestInfo | None = None
+ response: ResponseT | None = None

- status.completed = True
- except asyncio.TimeoutError:
- error = "TimeoutError: The request timed out before completing."
- status.errored = True
- status.canceled = True
+ try:
+ # Pull request from the queue, update state, and send "pending" update
+ request, request_info = await self._dequeue_next_request(target_start)
+
+ # Schedule the request and send "in_progress" update
+ await self._schedule_request(request, request_info, target_start)
+
+ async for resp, info in self.backend.resolve( # type: ignore[attr-defined]
+ request, request_info, None
+ ):
+ response = resp
+ request_info = info
+ if request_info is None:
+ raise RuntimeError("Received invalid request info from backend")
+
+ # Complete the request
+ request_info.timings.resolve_end = time.time()
+ self._send_update("completed", response, request, request_info)
+
+ response = request = request_info = None
+ except asyncio.CancelledError:
+ # Handle cancellation
+ if request is not None and request_info is not None:
+ request_info.error = "Request was cancelled"
+ request_info.timings.resolve_end = time.time()
+ self._send_update("cancelled", response, request, request_info)
+ raise
  except Exception as exc: # noqa: BLE001
- error = str(exc)
- status.errored = True
-
- return self._handle_response(
- status=status,
- request=request,
- response=response,
- error=error,
- resolve_start_time=resolve_start_time,
- )
+ if request is not None and request_info is not None:
+ request_info.error = repr(exc)
+ request_info.traceback = traceback.format_exc()
+ request_info.timings.resolve_end = time.time()
+ self._send_update("errored", response, request, request_info)
+ finally:
+ if request_info is not None:
+ self.strategy.request_completed(request_info)
+
+ async def _dequeue_next_request(
+ self, target_start: float
+ ) -> tuple[RequestT, RequestInfo]:
+ request, request_info = await self.messaging.get()
+ dequeued_time = time.time() # Ensure accurate dequeue timing
+ if request is None or request_info is None:
+ raise RuntimeError("Received invalid request or request info")
+ if isinstance(request, list | tuple):
+ raise NotImplementedError("Multi-turn requests are not yet supported")
+
+ request_info.timings.dequeued = dequeued_time
+ request_info.scheduler_node_id = self.messaging.worker_index or -1
+ request_info.timings.targeted_start = target_start
+ self._send_update("pending", None, request, request_info)
+ return request, request_info
+
+ async def _schedule_request(
+ self, request: RequestT, request_info: RequestInfo, target_start: float
+ ):
+ request_info.timings.scheduled_at = request_info.timings.dequeued
+ if target_start > (current_time := time.time()):
+ await asyncio.sleep(target_start - current_time)
+ # Adapt delay so that scheduled at reflects the sleep time
+ request_info.timings.scheduled_at = target_start
+
+ # Process the request with the backend
+ request_info.timings.resolve_start = time.time()
+ self._send_update("in_progress", None, request, request_info)

- def _create_request_func_kwargs(
+ def _send_update(
  self,
- request: GenerationRequest,
- ) -> tuple[
- AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None],
- dict[str, Any],
- ]:
- request_func: AsyncGenerator[
- Union[StreamingTextResponse, ResponseSummary], None
- ]
- request_kwargs: dict[str, Any]
-
- if request.request_type == "text_completions":
- request_func = self.backend.text_completions # type: ignore[assignment]
- request_kwargs = {
- "prompt": request.content,
- "request_id": request.request_id,
- "prompt_token_count": request.stats.get("prompt_tokens", None),
- "output_token_count": request.constraints.get("output_tokens", None),
- **request.params,
- }
- elif request.request_type == "chat_completions":
- request_func = self.backend.chat_completions # type: ignore[assignment]
- request_kwargs = {
- "content": request.content,
- "request_id": request.request_id,
- "prompt_token_count": request.stats.get("prompt_tokens", None),
- "output_token_count": request.constraints.get("output_tokens", None),
- **request.params,
- }
- else:
- raise ValueError(
- f"Invalid request type: {request.request_type} for {request}"
- )
+ new_status: Literal[
+ "pending", "in_progress", "completed", "errored", "cancelled"
+ ],
+ response: ResponseT | None,
+ request: RequestT | MultiTurnRequestT[RequestT],
+ request_info: RequestInfo,
+ ):
+ """
+ Publish request status update through messaging system.

- return request_func, request_kwargs
+ Updates request status and publishes to messaging queue for coordinator
+ consumption. Prevents duplicate status updates for the same state.

- def _handle_response(
- self,
- status: ResolveStatus,
- request: GenerationRequest,
- response: Any,
- error: Optional[str],
- resolve_start_time: float,
- ) -> tuple[ResolveStatus, ResponseSummary]:
- if response is None or not isinstance(
- response, (ResponseSummary, StreamingTextResponse)
- ):
- # nothing received or invalid response, fill in defaults for error
- if response:
- error = str(
- ValueError(
- f"Invalid response: {type(response)} for request: {request}; "
- )
- ) + (error or "")
-
- response = ResponseSummary(
- value="",
- request_args=RequestArgs(
- target=self.backend.target,
- headers={},
- params={},
- payload={},
- ),
- start_time=resolve_start_time,
- end_time=status.request_end,
- first_iter_time=None,
- last_iter_time=None,
- request_id=request.request_id,
- error=error or "Unknown error",
- )
- elif isinstance(response, StreamingTextResponse):
- response = ResponseSummary(
- value=response.value,
- request_args=RequestArgs(
- target=self.backend.target,
- headers={},
- params={},
- payload={},
- ),
- start_time=response.start_time,
- end_time=time.time(),
- first_iter_time=response.first_iter_time,
- last_iter_time=response.time if response.iter_count > 0 else None,
- request_prompt_tokens=request.stats.get("prompt_tokens", None),
- request_output_tokens=request.constraints.get("output_tokens", None),
- response_prompt_tokens=None,
- response_output_tokens=response.iter_count,
- request_id=request.request_id,
- error=error or "Unknown error",
- )
+ :param new_status: New status for the request
+ :param response: Response object if available, None otherwise
+ :param request: Request object being processed
+ :param request_info: Request metadata and timing information
+ :raises Exception: If messaging system fails to publish the update
+ """
+ prev_status = request_info.status

- response.error = error
- status.request_start = response.start_time
- status.request_end = response.end_time
+ if new_status == prev_status:
+ # already sent this update, don't send again
+ return

- return status, response
+ try:
+ request_info.status = new_status
+ request_info = (
+ request_info.model_copy()
+ if new_status not in {"completed", "errored", "cancelled"}
+ else request_info # last update, don't need to copy
+ )
+ self.messaging.put_sync(
+ (response, request, request_info),
+ timeout=-1,
+ )
+ prev_status = new_status
+ except Exception as exc:
+ # Reset status to last one that succeeded or started function with
+ # Calling logic can retry after handling error, if possible
+ request_info.status = prev_status
+ raise exc
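
For orientation, the scheduling pattern that the new _process_requests_loop describes (acquire a semaphore slot, ask the timing strategy for the next target start time, sleep only while that target is more than fut_scheduling_time_limit seconds away, then launch a task that releases the slot when it finishes) can be illustrated with a small self-contained sketch. This is not guidellm code: next_request_time and handle_request below are hypothetical stand-ins for the strategy and backend calls shown in the diff.

    # Illustrative sketch only; not the guidellm API.
    import asyncio
    import time

    async def worker_loop(next_request_time, handle_request,
                          async_limit: int = 10,
                          fut_scheduling_time_limit: float = 5.0) -> None:
        # Cap the number of in-flight requests, mirroring async_limit.
        semaphore = asyncio.Semaphore(async_limit)

        def _done(task: asyncio.Task) -> None:
            # Free the slot once the request task finishes.
            semaphore.release()

        while True:
            await semaphore.acquire()
            # Stand-in for strategy.next_request_time(...): a unix timestamp.
            target_start = await next_request_time()
            # Sleep while the target is far in the future so work is never
            # scheduled more than fut_scheduling_time_limit seconds ahead.
            if (delay := target_start - time.time()) >= fut_scheduling_time_limit:
                await asyncio.sleep(delay - fut_scheduling_time_limit)
            # Stand-in for _process_next_request(target_start=...).
            task = asyncio.create_task(handle_request(target_start))
            task.add_done_callback(_done)

In the actual worker shown above, the per-request coroutine additionally waits until the exact target start inside _schedule_request and publishes pending, in_progress, and completed/errored/cancelled status updates through the messaging interface at each lifecycle transition.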