guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/scheduler/worker.py
CHANGED
@@ -1,472 +1,466 @@
-
-
-import time
-from abc import ABC, abstractmethod
-from collections.abc import AsyncGenerator
-from dataclasses import dataclass
-from itertools import islice
-from threading import Event
-from typing import (
-    Any,
-    Generic,
-    Literal,
-    Optional,
-    Union,
-)
+"""
+Worker process implementation for distributed request execution and coordination.

-
-from
+Manages individual worker processes within the scheduler system, handling request
+lifecycle from queue consumption through backend processing and status publication.
+Workers coordinate with other processes through barriers and events, apply timing
+strategies for request scheduling, maintain concurrency limits, and publish real-time
+status updates throughout request processing.
+"""

-from
-    Backend,
-    BackendType,
-    RequestArgs,
-    ResponseSummary,
-    StreamingTextResponse,
-)
-from guidellm.objects import StandardBaseModel
-from guidellm.request import GenerationRequest
-from guidellm.request.types import RequestT, ResponseT
-from guidellm.scheduler.queues import MPQueues, Queue, QueueEmpty
-from guidellm.scheduler.result import (
-    SchedulerRequestInfo,
-    WorkerProcessRequest,
-    WorkerProcessResult,
-)
-from guidellm.scheduler.strategy import SchedulingStrategy
+from __future__ import annotations

-
-
-
-
-
-
-]
+import asyncio
+import time
+import traceback
+from multiprocessing.synchronize import Barrier as ProcessingBarrier
+from multiprocessing.synchronize import Event as ProcessingEvent
+from typing import Annotated, Generic, Literal

+try:
+    import uvloop

-
-
-
-
-
-    canceled: bool
+    HAS_UVLOOP: Annotated[
+        bool, "Flag indicating uvloop availability for event loop optimization"
+    ] = True
+except ImportError:
+    uvloop = None  # type: ignore[assignment]  # Optional dependency

-
-    request_end: float
+    HAS_UVLOOP = False


-
-
+from guidellm.scheduler.schemas import (
+    BackendInterface,
+    MultiTurnRequestT,
+    RequestT,
+    ResponseT,
+)
+from guidellm.scheduler.strategies import SchedulingStrategy
+from guidellm.schemas import RequestInfo
+from guidellm.utils import (
+    InterProcessMessaging,
+    wait_for_sync_barrier,
+    wait_for_sync_event,
+    wait_for_sync_objects,
+)

+__all__ = ["WorkerProcess"]

-
+
+class WorkerProcess(Generic[RequestT, ResponseT]):
     """
-
-
-
-
-
-
-
+    Worker process for distributed request execution in the scheduler system.
+
+    Manages complete request lifecycle including queue consumption, backend processing,
+    timing strategy application, and status publication. Coordinates with other workers
+    through synchronization primitives while maintaining concurrency limits and handling
+    graceful shutdown scenarios including errors and cancellations.
+
+    Example:
+    ::
+        worker = WorkerProcess(
+            worker_index=0,
+            messaging=messaging_interface,
+            backend=backend_instance,
+            strategy=timing_strategy,
+            async_limit=10,
+            fut_scheduling_time_limit=5.0,
+            startup_barrier=barrier,
+            requests_generated_event=generated_event,
+            constraint_reached_event=constraint_event,
+            shutdown_event=shutdown,
+            error_event=error,
+        )
+        worker.run()
     """

-
-
-
+    def __init__(
+        self,
+        worker_index: int,
+        messaging: InterProcessMessaging[
+            tuple[
+                ResponseT | None,
+                RequestT | MultiTurnRequestT[RequestT],
+                RequestInfo,
+            ],
+            tuple[
+                RequestT | MultiTurnRequestT[RequestT],
+                RequestInfo,
+            ],
+        ],
+        backend: BackendInterface[RequestT, ResponseT],
+        strategy: SchedulingStrategy,
+        async_limit: int,
+        fut_scheduling_time_limit: float,
+        startup_barrier: ProcessingBarrier,
+        requests_generated_event: ProcessingEvent,
+        constraint_reached_event: ProcessingEvent,
+        shutdown_event: ProcessingEvent,
+        error_event: ProcessingEvent,
+    ):
         """
-
-
-
+        Initialize worker process instance.
+
+        :param worker_index: Unique identifier for this worker within the process group
+        :param messaging: Inter-process messaging interface for request coordination
+        :param backend: Backend interface for processing requests
+        :param strategy: Scheduling strategy for determining request timing
+        :param async_limit: Maximum concurrent requests this worker can process
+        :param fut_scheduling_time_limit: Maximum time in seconds to schedule requests
+            into the future
+        :param startup_barrier: Synchronization barrier for coordinated startup
+        :param requests_generated_event: Event signaling request generation completion
+        :param constraint_reached_event: Event signaling processing constraint reached
+        :param shutdown_event: Event signaling graceful shutdown request
+        :param error_event: Event signaling error conditions across processes
         """
-
+        self.worker_index = worker_index
+        self.messaging = messaging
+        self.backend = backend
+        self.strategy = strategy
+        self.async_limit = async_limit
+        self.fut_scheduling_time_limit = fut_scheduling_time_limit
+        self.startup_barrier = startup_barrier
+        self.requests_generated_event = requests_generated_event
+        self.constraint_reached_event = constraint_reached_event
+        self.shutdown_event = shutdown_event
+        self.error_event = error_event
+
+        # Internal states
+        self.startup_completed = False
+        self.backend_started = False
+        self.messaging_started = False
+
+    def run(self):
+        """
+        Main entry point for worker process execution.
+
+        Initializes asyncio event loop with optional uvloop optimization and executes
+        worker async operations. Handles event loop cleanup and error propagation.

-
-    async def prepare_multiprocessing(self):
+        :raises RuntimeError: If worker encounters unrecoverable error during execution
         """
-
-
-
-
+        try:
+            if HAS_UVLOOP:
+                asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+            asyncio.run(self.run_async())
+        except Exception as err:
+            self.error_event.set()
+            raise RuntimeError(
+                f"Worker process {self.messaging.worker_index} encountered an "
+                f"error: {err}"
+            ) from err
+
+    async def run_async(self):
         """
-
+        Execute main asynchronous worker process logic.

-
-
-
-
-
-
-        """
-        An abstract method that must be implemented by subclasses.
-        This method should handle the resolution of a request through asyncio,
-        including any necessary backend processing and response handling.
-
-        :param request: The request to be resolved generated by the load generator.
-        :param timeout_time: The timeout time for the request, if there is no timeout
-            given, then this will be math.inf.
-        :return: The response from the worker.
+        Orchestrates concurrent execution of request processing and shutdown monitoring.
+        Handles task cleanup, error propagation, and cancellation coordination when any
+        task completes or encounters an error.
+
+        :raises RuntimeError: If worker tasks encounter unrecoverable errors
+        :raises asyncio.CancelledError: If worker process was cancelled
         """
-
+        stop_task = asyncio.create_task(self._stop_monitor())
+        request_proc_task = asyncio.create_task(self._process_requests())
+        caller_cancelled = False

-
-
-
-
-
-
+        try:
+            await asyncio.wait(
+                [stop_task, request_proc_task],
+                return_when=asyncio.FIRST_COMPLETED,
+            )
+        except asyncio.CancelledError:
+            caller_cancelled = True

-
-
-        process_request: WorkerProcessRequest[RequestT, ResponseT],
-        dequeued_time: float,
-        start_time: float,
-        results_queue: Queue[WorkerProcessResult[RequestT, ResponseT]],
-        process_id: int,
-    ):
-        request = process_request.request
-        timeout_time = process_request.timeout_time
-        queued_time = process_request.queued_time
-
-        info = SchedulerRequestInfo(
-            targeted_start_time=start_time,
-            queued_time=queued_time,
-            dequeued_time=dequeued_time,
-            scheduled_time=time.time(),
-            process_id=process_id,
-        )
-        result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult(
-            type_="request_scheduled",
-            request=request,
-            response=None,
-            info=info,
-        )
-        asyncio.create_task(self.send_result(results_queue, result))
+        stop_task.cancel()
+        request_proc_task.cancel()

-
-
+        try:
+            # Ensure all child tasks cancel correctly
+            await asyncio.wait(
+                [stop_task, request_proc_task], return_when=asyncio.ALL_COMPLETED
+            )
+        except asyncio.CancelledError:
+            caller_cancelled = True
+
+        if (
+            task_err := (
+                request_proc_task.exception()
+                if not request_proc_task.cancelled()
+                else stop_task.exception()
+                if not stop_task.cancelled()
+                else None
+            )
+        ) is not None:
+            raise RuntimeError(
+                f"Worker process {self.messaging.worker_index} encountered an "
+                f"error: {task_err}"
+            ) from task_err

-
-
-            type_="request_start",
-            request=request,
-            response=None,
-            info=info,
-        )
-        asyncio.create_task(self.send_result(results_queue, result))
-
-        status, response = await self.resolve(request, timeout_time)
-        info.worker_end = time.time()
-        info.requested = status.requested
-        info.completed = status.completed
-        info.errored = status.errored
-        info.canceled = status.canceled
-        info.request_start = status.request_start
-        info.request_end = status.request_end
-        result = WorkerProcessResult(
-            type_="request_complete",
-            request=request,
-            response=response,
-            info=info,
-        )
-        asyncio.create_task(self.send_result(results_queue, result))
+        if caller_cancelled:
+            raise asyncio.CancelledError("Worker process was cancelled")

-    def
+    async def _stop_monitor(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    ) -> None:
+        """
+        Monitor shutdown and error events for worker termination.
+        :raises RuntimeError if the work process received an error signal.
+        """
+        exit_key = await wait_for_sync_objects(
+            {
+                "error_event": self.error_event,
+                "shutdown_event": self.shutdown_event,
+            },
+            poll_interval=self.messaging.poll_interval,
+        )
+
+        if exit_key == "error_event":
+            raise RuntimeError(
+                f"Worker process {self.messaging.worker_index} received error signal."
             )

-
-
-
-                start_time = next(times_iter)
-
-                # Yield control to the event loop. Sleep if we are way ahead
-                await asyncio.sleep(start_time - time.time() - 1)
-                await lock.acquire()
-
-                try:
-                    process_request = queues.requests.get_nowait()
-                    dequeued_time = time.time()
-                except QueueEmpty:
-                    lock.release()
-                    continue
-
-                def _request_callback(
-                    _: asyncio.Future[WorkerProcessRequest[RequestT, ResponseT]],
-                ):
-                    nonlocal lock
-                    lock.release()
-
-                task = asyncio.create_task(
-                    self.resolve_scheduler_request(
-                        process_request=process_request,
-                        dequeued_time=dequeued_time,
-                        start_time=start_time,
-                        results_queue=queues.responses,
-                        process_id=process_id,
-                    )
-                )
-                task.add_done_callback(_request_callback)
-                start_time = None
+    async def _process_requests(self):
+        """
+        Manage request processing lifecycle from startup to shutdown.

+        Coordinates startup synchronization, processes requests until constraints are
+        reached, then cancels pending requests until shutdown or error occurs.
+        """
         try:
-
-
-
-
-
-
+            # 1. Start up synchronization (backend, messaging, and other processes)
+            # 2. Messaging startup, receive requests until requests_generated event
+            await self._processing_startup()
+
+            # 3. Run process requests loop until constraint_reached event
+            processing_task = asyncio.create_task(self._process_requests_loop())
+            await wait_for_sync_event(
+                self.constraint_reached_event,
+                poll_interval=self.messaging.poll_interval,
             )
+            processing_task.cancel()
+
+            # 4. Cancel pending requests until proc canceled (manual, shutdown, error)
+            await self._cancel_requests_loop()
+        finally:
+            # 5. On cancel, shut down event, error event, or internal error:
+            # attempt to shut down this worker cleanly (stop backend and messaging)
+            await self._processing_shutdown()
+
+    async def _processing_startup(self):
+        """Initialize backend, messaging, and synchronize with other workers."""
+        # Get backend ready
+        await self.backend.process_startup()
+        self.backend_started = True
+        await self.backend.validate()
+
+        # Get messaging system ready
+        await self.messaging.start(
+            receive_stop_criteria=[self.requests_generated_event]
+        )
+        self.messaging_started = True

+        # Wait for all processes to be ready
+        await wait_for_sync_barrier(
+            self.startup_barrier,
+            poll_interval=self.messaging.poll_interval,
+        )

-
-    type_: Literal["generative_requests_worker"] = "generative_requests_worker"  # type: ignore[assignment]
-    backend_type: BackendType
-    backend_target: str
-    backend_model: str
-    backend_info: dict[str, Any] = Field(
-        default_factory=dict,
-    )
+        self.startup_completed = True

+    async def _processing_shutdown(self):
+        if self.backend_started:
+            await self.backend.process_shutdown()
+            self.backend_started = False

-
-
-
-    This class is responsible for sending requests to the backend,
-    handling responses, and managing errors.
+        if self.messaging_started:
+            await self.messaging.stop()
+            self.messaging_started = False

-
-        This should be an instance of Backend such as an OpenAIHTTPBackend.
-        """
+        self.startup_completed = False

-    def
-        self.backend = backend
-
-    @property
-    def description(self) -> GenerativeRequestsWorkerDescription:
-        """
-        Get the description of the worker.
-        :return: The description of the worker.
+    async def _process_requests_loop(self):
         """
-
-            backend_type=self.backend.type_,
-            backend_target=self.backend.target,
-            backend_model=self.backend.model or "None",
-            backend_info=self.backend.info,
-        )
+        Process requests continuously until cancelled with concurrency limits.

-
+        Schedules and processes requests according to the timing strategy while
+        maintaining the configured concurrency limit through semaphore coordination.
         """
-
-
-
-
-
-
+        try:
+            # Run request processing
+            async_semaphore = asyncio.Semaphore(self.async_limit)
+            pending_tasks: set[asyncio.Task] = set()
+
+            def _task_done(task):
+                pending_tasks.discard(task)
+                async_semaphore.release()
+
+                if not task.cancelled() and (exception := task.exception()):
+                    raise exception
+
+            # Main loop; loop until canceled
+            while True:
+                await async_semaphore.acquire()
+                request_time = await self.strategy.next_request_time(
+                    worker_index=self.worker_index
+                )

-
-
-
-
-        stop_event: Event,
-        max_concurrency: int,
-        process_id: int,
-        num_processes: int,
-    ):
-        asyncio.run(self.backend.validate())
-        super().process_loop_asynchronous(
-            queues=queues,
-            strategy=strategy,
-            stop_event=stop_event,
-            max_concurrency=max_concurrency,
-            process_id=process_id,
-            num_processes=num_processes,
-        )
+                if (
+                    time_until := request_time - time.time()
+                ) >= self.fut_scheduling_time_limit:
+                    await asyncio.sleep(time_until - self.fut_scheduling_time_limit)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                request_task = asyncio.create_task(
+                    self._process_next_request(target_start=request_time)
+                )
+                pending_tasks.add(request_task)
+                request_task.add_done_callback(_task_done)
+        except asyncio.CancelledError as err:
+            for task in pending_tasks:
+                task.cancel()
+            await asyncio.gather(*pending_tasks, return_exceptions=True)
+
+            raise err
+
+    async def _cancel_requests_loop(self):
+        """Cancel all remaining queued requests until worker process terminates."""
+        while True:
+            try:
+                request: RequestT | MultiTurnRequestT[RequestT]
+                request_info: RequestInfo
+                request, request_info = await self.messaging.get(
+                    timeout=self.messaging.poll_interval
+                )
+            except asyncio.TimeoutError:
+                continue
+
+            request_info.scheduler_node_id = self.messaging.worker_index or -1
+            request_info.error = "Request was cancelled"
+            request_info.timings.resolve_end = time.time()
+            self._send_update("cancelled", None, request, request_info)
+
+    async def _process_next_request(self, target_start: float):
         """
-
-        response = None
-        error: Optional[str] = None
-        status = ResolveStatus(
-            requested=False,
-            completed=False,
-            errored=False,
-            canceled=False,
-            request_start=-1,
-            request_end=-1,
-        )
+        Process a single request from queue to completion.

-
-
-            raise asyncio.TimeoutError(
-                "The timeout time has already passed."
-            )  # exit early
-
-        status.requested = True
-        request_func, request_kwargs = self._create_request_func_kwargs(request)
-
-        async def _runner():
-            # wrap function so we can enforce timeout and
-            # still return the latest state from the backend
-            async for resp in request_func(**request_kwargs):  # type: ignore[operator]
-                nonlocal response
-                response = resp
-
-        await asyncio.wait_for(
-            _runner(),
-            timeout=timeout_time - time.time() if timeout_time < math.inf else None,
-        )
+        Retrieves request from messaging queue, applies timing strategy, processes
+        through backend, and publishes status updates throughout the lifecycle.

-
-
-
-
-
-        if not isinstance(response, ResponseSummary):
-            raise ValueError(
-                f"Received no ResponseSummary for request: {request} "
-                f"and backend: {self.backend}, received: {response}"
-            )
+        :param target_start: Unix timestamp when request should begin processing
+        """
+        request: RequestT | MultiTurnRequestT[RequestT] | None = None
+        request_info: RequestInfo | None = None
+        response: ResponseT | None = None

-
-
-
-
-
+        try:
+            # Pull request from the queue, update state, and send "pending" update
+            request, request_info = await self._dequeue_next_request(target_start)
+
+            # Schedule the request and send "in_progress" update
+            await self._schedule_request(request, request_info, target_start)
+
+            async for resp, info in self.backend.resolve(  # type: ignore[attr-defined]
+                request, request_info, None
+            ):
+                response = resp
+                request_info = info
+            if request_info is None:
+                raise RuntimeError("Received invalid request info from backend")
+
+            # Complete the request
+            request_info.timings.resolve_end = time.time()
+            self._send_update("completed", response, request, request_info)
+
+            response = request = request_info = None
+        except asyncio.CancelledError:
+            # Handle cancellation
+            if request is not None and request_info is not None:
+                request_info.error = "Request was cancelled"
+                request_info.timings.resolve_end = time.time()
+                self._send_update("cancelled", response, request, request_info)
+            raise
         except Exception as exc:  # noqa: BLE001
-
-
-
-
-
-
-
-
-
-
+            if request is not None and request_info is not None:
+                request_info.error = repr(exc)
+                request_info.traceback = traceback.format_exc()
+                request_info.timings.resolve_end = time.time()
+                self._send_update("errored", response, request, request_info)
+        finally:
+            if request_info is not None:
+                self.strategy.request_completed(request_info)
+
+    async def _dequeue_next_request(
+        self, target_start: float
+    ) -> tuple[RequestT, RequestInfo]:
+        request, request_info = await self.messaging.get()
+        dequeued_time = time.time()  # Ensure accurate dequeue timing
+        if request is None or request_info is None:
+            raise RuntimeError("Received invalid request or request info")
+        if isinstance(request, list | tuple):
+            raise NotImplementedError("Multi-turn requests are not yet supported")
+
+        request_info.timings.dequeued = dequeued_time
+        request_info.scheduler_node_id = self.messaging.worker_index or -1
+        request_info.timings.targeted_start = target_start
+        self._send_update("pending", None, request, request_info)
+        return request, request_info
+
+    async def _schedule_request(
+        self, request: RequestT, request_info: RequestInfo, target_start: float
+    ):
+        request_info.timings.scheduled_at = request_info.timings.dequeued
+        if target_start > (current_time := time.time()):
+            await asyncio.sleep(target_start - current_time)
+            # Adapt delay so that scheduled at reflects the sleep time
+            request_info.timings.scheduled_at = target_start
+
+        # Process the request with the backend
+        request_info.timings.resolve_start = time.time()
+        self._send_update("in_progress", None, request, request_info)

-    def
+    def _send_update(
         self,
-
-
-
-
-
-
-
-
-
-
-        if request.request_type == "text_completions":
-            request_func = self.backend.text_completions  # type: ignore[assignment]
-            request_kwargs = {
-                "prompt": request.content,
-                "request_id": request.request_id,
-                "prompt_token_count": request.stats.get("prompt_tokens", None),
-                "output_token_count": request.constraints.get("output_tokens", None),
-                **request.params,
-            }
-        elif request.request_type == "chat_completions":
-            request_func = self.backend.chat_completions  # type: ignore[assignment]
-            request_kwargs = {
-                "content": request.content,
-                "request_id": request.request_id,
-                "prompt_token_count": request.stats.get("prompt_tokens", None),
-                "output_token_count": request.constraints.get("output_tokens", None),
-                **request.params,
-            }
-        else:
-            raise ValueError(
-                f"Invalid request type: {request.request_type} for {request}"
-            )
+        new_status: Literal[
+            "pending", "in_progress", "completed", "errored", "cancelled"
+        ],
+        response: ResponseT | None,
+        request: RequestT | MultiTurnRequestT[RequestT],
+        request_info: RequestInfo,
+    ):
+        """
+        Publish request status update through messaging system.

-
+        Updates request status and publishes to messaging queue for coordinator
+        consumption. Prevents duplicate status updates for the same state.

-
-
-
-
-
-
-
-    ) -> tuple[ResolveStatus, ResponseSummary]:
-        if response is None or not isinstance(
-            response, (ResponseSummary, StreamingTextResponse)
-        ):
-            # nothing received or invalid response, fill in defaults for error
-            if response:
-                error = str(
-                    ValueError(
-                        f"Invalid response: {type(response)} for request: {request}; "
-                    )
-                ) + (error or "")
-
-            response = ResponseSummary(
-                value="",
-                request_args=RequestArgs(
-                    target=self.backend.target,
-                    headers={},
-                    params={},
-                    payload={},
-                ),
-                start_time=resolve_start_time,
-                end_time=status.request_end,
-                first_iter_time=None,
-                last_iter_time=None,
-                request_id=request.request_id,
-                error=error or "Unknown error",
-            )
-        elif isinstance(response, StreamingTextResponse):
-            response = ResponseSummary(
-                value=response.value,
-                request_args=RequestArgs(
-                    target=self.backend.target,
-                    headers={},
-                    params={},
-                    payload={},
-                ),
-                start_time=response.start_time,
-                end_time=time.time(),
-                first_iter_time=response.first_iter_time,
-                last_iter_time=response.time if response.iter_count > 0 else None,
-                request_prompt_tokens=request.stats.get("prompt_tokens", None),
-                request_output_tokens=request.constraints.get("output_tokens", None),
-                response_prompt_tokens=None,
-                response_output_tokens=response.iter_count,
-                request_id=request.request_id,
-                error=error or "Unknown error",
-            )
+        :param new_status: New status for the request
+        :param response: Response object if available, None otherwise
+        :param request: Request object being processed
+        :param request_info: Request metadata and timing information
+        :raises Exception: If messaging system fails to publish the update
+        """
+        prev_status = request_info.status

-
-
-
+        if new_status == prev_status:
+            # already sent this update, don't send again
+            return

-
+        try:
+            request_info.status = new_status
+            request_info = (
+                request_info.model_copy()
+                if new_status not in {"completed", "errored", "cancelled"}
+                else request_info  # last update, don't need to copy
+            )
+            self.messaging.put_sync(
+                (response, request, request_info),
+                timeout=-1,
+            )
+            prev_status = new_status
+        except Exception as exc:
+            # Reset status to last one that succeeded or started function with
+            # Calling logic can retry after handling error, if possible
+            request_info.status = prev_status
+            raise exc

Note: lines shown as a bare "-" were removed from 0.3.1 but their content was not preserved in the rendered diff; truncated tokens ("from", "def", "]") are reproduced as rendered.