opengris-scaler 1.12.28 (cp313-cp313-musllinux_1_2_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of opengris-scaler has been flagged as possibly problematic.
- opengris_scaler-1.12.28.dist-info/METADATA +728 -0
- opengris_scaler-1.12.28.dist-info/RECORD +187 -0
- opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +210 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +658 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +115 -0
- scaler/cluster/combo.py +150 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/defaults.py +94 -0
- scaler/config/loader.py +96 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +55 -0
- scaler/config/section/ecs_worker_adapter.py +85 -0
- scaler/config/section/native_worker_adapter.py +43 -0
- scaler/config/section/object_storage_server.py +8 -0
- scaler/config/section/scheduler.py +54 -0
- scaler/config/section/symphony_worker_adapter.py +47 -0
- scaler/config/section/top.py +13 -0
- scaler/config/section/webui.py +21 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +62 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +133 -0
- scaler/entry_points/object_storage_server.py +45 -0
- scaler/entry_points/scheduler.py +144 -0
- scaler/entry_points/top.py +286 -0
- scaler/entry_points/webui.py +48 -0
- scaler/entry_points/worker_adapter_ecs.py +191 -0
- scaler/entry_points/worker_adapter_native.py +137 -0
- scaler/entry_points/worker_adapter_symphony.py +98 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +247 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/constants.py +9 -0
- scaler/ui/live_display.py +147 -0
- scaler/ui/memory_window.py +146 -0
- scaler/ui/setting_page.py +40 -0
- scaler/ui/task_graph.py +832 -0
- scaler/ui/task_log.py +107 -0
- scaler/ui/utility.py +66 -0
- scaler/ui/webui.py +147 -0
- scaler/ui/worker_processors.py +104 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +107 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +269 -0
- scaler/worker_adapter/native.py +155 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +139 -0
- src/scaler/io/ymq/_ymq.so +0 -0
- src/scaler/object_storage/object_storage_server.so +0 -0
scaler/worker/agent/processor_manager.py
@@ -0,0 +1,369 @@

import asyncio
import logging
from typing import Dict, List, Optional, Tuple

import tblib.pickling_support

from scaler.config.types.zmq import ZMQConfig

# from scaler.utility.logging.utility import setup_logger
from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
from scaler.protocol.python.common import ObjectMetadata, TaskResultType
from scaler.protocol.python.message import ObjectInstruction, ProcessorInitialized, Task, TaskResult
from scaler.utility.exceptions import ProcessorDiedError
from scaler.utility.identifiers import ObjectID, ProcessorID, TaskID, WorkerID
from scaler.utility.metadata.profile_result import ProfileResult
from scaler.utility.serialization import serialize_failure
from scaler.worker.agent.mixins import HeartbeatManager, ProcessorManager, ProfilingManager, TaskManager
from scaler.worker.agent.processor_holder import ProcessorHolder


class VanillaProcessorManager(ProcessorManager):
    def __init__(
        self,
        identity: WorkerID,
        event_loop: str,
        address_internal: ZMQConfig,
        scheduler_address: ZMQConfig,
        preload: Optional[str],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_processor_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        tblib.pickling_support.install()

        self._identity = identity
        self._event_loop = event_loop
        self._scheduler_address = scheduler_address
        self._preload = preload

        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._hard_processor_suspend = hard_processor_suspend
        self._logging_paths = logging_paths
        self._logging_level = logging_level

        self._heartbeat_manager: Optional[HeartbeatManager] = None
        self._task_manager: Optional[TaskManager] = None
        self._profiling_manager: Optional[ProfilingManager] = None
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None

        self._address_internal: ZMQConfig = address_internal

        self._current_holder: Optional[ProcessorHolder] = None
        self._suspended_holders_by_task_id: Dict[bytes, ProcessorHolder] = {}
        self._holders_by_processor_id: Dict[ProcessorID, ProcessorHolder] = {}

        self._can_accept_task_lock: asyncio.Lock = asyncio.Lock()

        self._binder_internal: Optional[AsyncBinder] = None

    def register(
        self,
        heartbeat_manager: HeartbeatManager,
        task_manager: TaskManager,
        profiling_manager: ProfilingManager,
        connector_external: AsyncConnector,
        binder_internal: AsyncBinder,
        connector_storage: AsyncObjectStorageConnector,
    ):
        self._heartbeat_manager = heartbeat_manager
        self._task_manager = task_manager
        self._profiling_manager = profiling_manager
        self._connector_external = connector_external
        self._binder_internal = binder_internal
        self._connector_storage = connector_storage

    async def initialize(self):
        await self._can_accept_task_lock.acquire()  # prevents any processor to accept task until initialized

        await self._connector_storage.wait_until_connected()

        self.__start_new_processor()  # we can start the processor now that we know the storage address.

    def can_accept_task(self) -> bool:
        return not self._can_accept_task_lock.locked()

    async def wait_until_can_accept_task(self):
        """
        Makes sure a processor is ready to start processing a new or suspended task.

        Must be called before any call to `on_task()` or `on_task_resume()`.
        """

        await self._can_accept_task_lock.acquire()

    async def on_processor_initialized(self, processor_id: ProcessorID, processor_initialized: ProcessorInitialized):
        assert self._current_holder is not None

        if self._current_holder.initialized():
            return

        self._holders_by_processor_id[processor_id] = self._current_holder
        self._current_holder.initialize(processor_id)

        self._can_accept_task_lock.release()

    async def on_task(self, task: Task) -> bool:
        assert self._can_accept_task_lock.locked()
        assert self.current_processor_is_initialized()

        holder = self._current_holder

        assert holder.task() is None
        holder.set_task(task)

        self._profiling_manager.on_task_start(holder.pid(), task.task_id)

        await self._binder_internal.send(holder.processor_id(), task)

        return True

    async def on_cancel_task(self, task_id: TaskID) -> Optional[Task]:
        assert self._current_holder is not None

        if self.current_task_id() == task_id:
            current_task = self.current_task()
            self.__restart_current_processor(f"cancel task_id={task_id.hex()}")
            return current_task

        if task_id in self._suspended_holders_by_task_id:
            suspended_holder = self._suspended_holders_by_task_id.pop(task_id)
            task = suspended_holder.task()
            self.__kill_processor(f"cancel suspended task_id={task_id.hex()}", suspended_holder)
            return task

        return None

    async def on_failing_processor(self, processor_id: ProcessorID, process_status: str):
        assert self._current_holder is not None

        holder = self._holders_by_processor_id.get(processor_id)

        if holder is None:
            return

        task = holder.task()
        if task is not None:
            profile_result = self.__end_task(holder)  # profiling the task should happen before killing the processor
        else:
            profile_result = None

        reason = f"process died {process_status=}"
        if holder == self._current_holder:
            self.__restart_current_processor(reason)
        else:
            self.__kill_processor(reason, holder)

        if task is not None:
            source = task.source
            task_id = task.task_id

            result_object_id = ObjectID.generate_object_id(source)
            result_object_bytes = serialize_failure(ProcessorDiedError(f"{process_status=}"))

            await self._connector_storage.set_object(result_object_id, result_object_bytes)
            await self._connector_external.send(
                ObjectInstruction.new_msg(
                    ObjectInstruction.ObjectInstructionType.Create,
                    source,
                    ObjectMetadata.new_msg((result_object_id,), (ObjectMetadata.ObjectContentType.Object,), (b"",)),
                )
            )

            await self._task_manager.on_task_result(
                TaskResult.new_msg(
                    task_id, TaskResultType.Failed, profile_result.serialize(), [bytes(result_object_id)]
                )
            )

    async def on_suspend_task(self, task_id: TaskID) -> bool:
        assert self._current_holder is not None
        holder = self._current_holder

        current_task = holder.task()

        if current_task is None or current_task.task_id != task_id:
            return False

        holder.suspend()
        self._suspended_holders_by_task_id[task_id] = holder

        logging.info(f"{self._identity!r}: suspend Processor[{holder.pid()}]")

        self.__start_new_processor()

        return True

    def on_resume_task(self, task_id: TaskID) -> bool:
        assert self._can_accept_task_lock.locked()
        assert self.current_processor_is_initialized()

        if self.current_task() is not None:
            return False

        suspended_holder = self._suspended_holders_by_task_id.pop(task_id, None)

        if suspended_holder is None:
            return False

        self.__kill_processor("replaced by suspended processor", self._current_holder)

        self._current_holder = suspended_holder
        suspended_holder.resume()

        logging.info(f"{self._identity!r}: resume Processor[{self._current_holder.pid()}]")

        return True

    async def on_task_result(self, processor_id: ProcessorID, task_result: TaskResult):
        assert self._current_holder is not None
        task_id = task_result.task_id

        if task_id == self.current_task_id():
            assert self._current_holder.processor_id() == processor_id

            profile_result = self.__end_task(self._current_holder)

            release_task_lock = True
        elif task_id in self._suspended_holders_by_task_id:
            # Receiving a task result from a suspended processor is possible as the message might have been queued while
            # we were suspending the process.

            holder = self._suspended_holders_by_task_id.pop(task_id)
            assert holder.processor_id() == processor_id

            profile_result = self.__end_task(holder)

            self.__kill_processor("task finished in suspended processor", holder)

            release_task_lock = False
        else:
            return

        await self._task_manager.on_task_result(
            TaskResult.new_msg(
                task_id=task_id,
                result_type=task_result.result_type,
                metadata=profile_result.serialize(),
                results=task_result.results,
            )
        )

        # task lock must be released after calling `TaskManager.on_task_result()`
        if release_task_lock:
            self._can_accept_task_lock.release()

    async def on_external_object_instruction(self, instruction: ObjectInstruction):
        for processor_id in self._holders_by_processor_id.keys():
            await self._binder_internal.send(processor_id, instruction)

    async def on_internal_object_instruction(self, processor_id: ProcessorID, instruction: ObjectInstruction):
        if not self.__processor_ready_to_process_object(processor_id):
            return

        await self._connector_external.send(instruction)

    def destroy(self, reason: str):
        if self._connector_storage is not None:
            self._connector_external.destroy()

        self.__kill_all_processors(reason)

    def current_processor_is_initialized(self) -> bool:
        return self._current_holder is not None and self._current_holder.initialized()

    def current_task(self) -> Optional[Task]:
        if self._current_holder is None:  # worker is not yet initialized
            return None

        return self._current_holder.task()

    def current_task_id(self) -> Optional[TaskID]:
        task = self.current_task()

        if task is None:
            return None
        else:
            return task.task_id

    def processors(self) -> List[ProcessorHolder]:
        return list(self._holders_by_processor_id.values())

    def num_suspended_processors(self) -> int:
        return len(self._suspended_holders_by_task_id)

    def __start_new_processor(self):
        object_storage_address = self._heartbeat_manager.get_object_storage_address()

        self._current_holder = ProcessorHolder(
            self._event_loop,
            self._address_internal,
            self._scheduler_address,
            object_storage_address,
            self._preload,
            self._garbage_collect_interval_seconds,
            self._trim_memory_threshold_bytes,
            self._hard_processor_suspend,
            self._logging_paths,
            self._logging_level,
        )

        processor_pid = self._current_holder.pid()

        self._profiling_manager.on_process_start(processor_pid)

        logging.info(f"{self._identity!r}: start Processor[{processor_pid}]")

    def __kill_processor(self, reason: str, holder: ProcessorHolder):
        processor_pid = holder.pid()

        self._profiling_manager.on_process_end(processor_pid)

        if holder.initialized():
            self._holders_by_processor_id.pop(holder.processor_id(), None)

        holder.kill()

        logging.info(f"{self._identity!r}: stop Processor[{processor_pid}], reason: {reason}")

    def __restart_current_processor(self, reason: str):
        assert self._current_holder is not None

        self.__kill_processor(reason, self._current_holder)
        self.__start_new_processor()

    def __kill_all_processors(self, reason: str):
        if self._current_holder is not None:
            self.__kill_processor(reason, self._current_holder)
            self._current_holder = None

        for processor_holder in self._suspended_holders_by_task_id.values():
            self.__kill_processor(reason, processor_holder)

        self._suspended_holders_by_task_id = {}
        self._holders_by_processor_id = {}

    def __end_task(self, processor_holder: ProcessorHolder) -> ProfileResult:
        profile_result = self._profiling_manager.on_task_end(processor_holder.pid(), processor_holder.task().task_id)
        processor_holder.set_task(None)

        return profile_result

    def __processor_ready_to_process_object(self, processor_id: ProcessorID) -> bool:
        holder = self._holders_by_processor_id.get(processor_id)

        if holder is None:
            return False

        assert holder.initialized()

        if holder.task() is None:
            return False

        # TODO: check if the objects belong to the task

        return True
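The manager above uses `_can_accept_task_lock` as a readiness gate rather than a conventional mutex: `initialize()` and `wait_until_can_accept_task()` acquire it without releasing it, while `on_processor_initialized()` and `on_task_result()` release it from another coroutine once the next task can be accepted. Below is a minimal self-contained sketch of that pattern; the `ReadinessGate` class and its method names are illustrative, not Scaler API.

import asyncio


class ReadinessGate:
    def __init__(self):
        self._lock = asyncio.Lock()

    async def wait_until_ready(self):
        # Blocks until the gate opens, and leaves it closed again so the
        # next caller waits for the next "ready" event.
        await self._lock.acquire()

    def mark_ready(self):
        # Called from a different coroutine or event handler (compare
        # on_processor_initialized() and on_task_result() above).
        self._lock.release()

    def is_ready(self) -> bool:
        return not self._lock.locked()


async def main():
    gate = ReadinessGate()
    await gate.wait_until_ready()  # first acquire succeeds immediately
    asyncio.get_running_loop().call_later(0.1, gate.mark_ready)
    await gate.wait_until_ready()  # blocks until mark_ready() fires
    print("gate opened twice, is_ready:", gate.is_ready())


asyncio.run(main())

Holding the lock thus encodes "a processor is busy or not yet initialized", and `can_accept_task()` falls out as a simple `locked()` check.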
scaler/worker/agent/profiling_manager.py
@@ -0,0 +1,109 @@

import dataclasses
import logging
import time
from typing import Dict, Optional

import psutil

from scaler.utility.identifiers import TaskID
from scaler.utility.metadata.profile_result import ProfileResult
from scaler.utility.mixins import Looper
from scaler.worker.agent.mixins import ProfilingManager


@dataclasses.dataclass
class _ProcessProfiler:
    process: psutil.Process

    current_task_id: Optional[TaskID] = None

    start_time: Optional[float] = None
    start_cpu_time: Optional[float] = None
    init_memory_rss: Optional[int] = None
    peak_memory_rss: Optional[int] = None


class VanillaProfilingManager(ProfilingManager, Looper):
    def __init__(self):
        self._process_profiler_by_pid: Dict[int, _ProcessProfiler] = {}

    def on_process_start(self, pid: int):
        if pid in self._process_profiler_by_pid:
            raise ValueError(f"process {pid=} is already registered.")

        self._process_profiler_by_pid[pid] = _ProcessProfiler(psutil.Process(pid))

    def on_process_end(self, pid: int):
        if pid not in self._process_profiler_by_pid:
            raise ValueError(f"process {pid=} is not registered.")

        self._process_profiler_by_pid.pop(pid)

    def on_task_start(self, pid: int, task_id: TaskID):
        process_profiler = self._process_profiler_by_pid.get(pid)

        if process_profiler is None:
            raise ValueError(f"process {pid=} is not registered.")

        process_profiler.current_task_id = task_id

        process = process_profiler.process

        process_profiler.start_time = self.__process_time()
        process_profiler.start_cpu_time = self.__process_cpu_time(process)
        process_profiler.init_memory_rss = self.__process_memory_rss(process)
        process_profiler.peak_memory_rss = process_profiler.init_memory_rss

    def on_task_end(self, pid: int, task_id: TaskID) -> ProfileResult:
        process_profiler = self._process_profiler_by_pid.get(pid)

        if process_profiler is None:
            raise ValueError(f"process {pid=} is not registered.")

        if task_id != process_profiler.current_task_id:
            raise ValueError(f"task {task_id=!r} is not the current task task_id={process_profiler.current_task_id!r}.")

        assert process_profiler.start_time is not None
        assert process_profiler.init_memory_rss is not None
        assert process_profiler.peak_memory_rss is not None

        process = process_profiler.process

        time_delta = self.__process_time() - process_profiler.start_time

        try:
            cpu_time_delta = self.__process_cpu_time(process) - process_profiler.start_cpu_time
        except psutil.ZombieProcess:
            logging.warning(f"profiling zombie process: {pid=}")
            cpu_time_delta = 0

        memory_delta = process_profiler.peak_memory_rss - process_profiler.init_memory_rss

        process_profiler.current_task_id = None
        process_profiler.init_memory_rss = None
        process_profiler.peak_memory_rss = None

        return ProfileResult(time_delta, memory_delta, cpu_time_delta)

    async def routine(self):
        for process_profiler in self._process_profiler_by_pid.values():
            if process_profiler.current_task_id is not None:
                try:
                    process_profiler.peak_memory_rss = max(
                        process_profiler.peak_memory_rss, self.__process_memory_rss(process_profiler.process)
                    )
                except psutil.ZombieProcess:
                    logging.warning(f"profiling zombie process: pid={process_profiler.process.pid}")

    @staticmethod
    def __process_time():
        return time.monotonic()

    @staticmethod
    def __process_cpu_time(process: psutil.Process) -> float:
        cpu_times = process.cpu_times()
        return cpu_times.user + cpu_times.system

    @staticmethod
    def __process_memory_rss(process: psutil.Process) -> int:
        return process.memory_info().rss
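The profiler above reports per-task deltas: wall time from `time.monotonic()`, CPU time as `user + system` from `psutil.Process.cpu_times()`, and peak RSS growth sampled by the periodic `routine()`. A standalone sketch of the same measurement approach follows, with an inline polling loop standing in for the agent-driven `routine()`; the `profile_once` helper and its return dict are illustrative, not Scaler API.

import os
import time

import psutil


def profile_once(pid: int, duration_seconds: float = 0.5):
    process = psutil.Process(pid)

    # Record baselines, as on_task_start() does.
    start_time = time.monotonic()
    start_cpu = process.cpu_times()
    start_rss = peak_rss = process.memory_info().rss

    # The agent's periodic routine() performs this RSS sampling; here we
    # simply poll inline for the duration of the "task".
    deadline = time.monotonic() + duration_seconds
    while time.monotonic() < deadline:
        peak_rss = max(peak_rss, process.memory_info().rss)
        time.sleep(0.05)

    # Compute deltas, as on_task_end() does.
    end_cpu = process.cpu_times()
    return {
        "wall_seconds": time.monotonic() - start_time,
        "cpu_seconds": (end_cpu.user + end_cpu.system) - (start_cpu.user + start_cpu.system),
        "peak_rss_increase_bytes": peak_rss - start_rss,
    }


if __name__ == "__main__":
    print(profile_once(os.getpid()))

Sampling peak RSS from a separate loop is inherently approximate (short allocation spikes between samples are missed), which is why the real manager keeps only a running maximum rather than a full memory profile.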
scaler/worker/agent/task_manager.py
@@ -0,0 +1,150 @@

from typing import Dict, Optional, Set

from scaler.io.mixins import AsyncConnector
from scaler.protocol.python.common import TaskCancelConfirmType
from scaler.protocol.python.message import Task, TaskCancel, TaskCancelConfirm, TaskResult
from scaler.utility.identifiers import TaskID
from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
from scaler.utility.mixins import Looper
from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
from scaler.worker.agent.mixins import ProcessorManager, TaskManager

_SUSPENDED_TASKS_PRIORITY = 1
_QUEUED_TASKS_PRIORITY = 2


class VanillaTaskManager(Looper, TaskManager):
    def __init__(self, task_timeout_seconds: int):
        self._task_timeout_seconds = task_timeout_seconds

        self._queued_task_id_to_task: Dict[TaskID, Task] = dict()

        # Queued tasks are sorted first by task's priorities, then suspended tasks are prioritized over non yet started
        # tasks, finally the sorted queue ensure we execute the oldest tasks first.
        #
        # For example, if we receive these tasks in this order:
        # 1. Task(priority=0) [suspended]
        # 2. Task(priority=3) [suspended]
        # 3. Task(priority=3)
        # 4. Task(priority=0)
        #
        # We want to execute the tasks in this order: 2-3-1-4.
        self._queued_task_ids = AsyncSortedPriorityQueue()

        self._processing_task_ids: Set[TaskID] = set()  # Tasks associated with a processor, including suspended tasks

        self._connector_external: Optional[AsyncConnector] = None
        self._processor_manager: Optional[ProcessorManager] = None

    def register(self, connector: AsyncConnector, processor_manager: ProcessorManager):
        self._connector_external = connector
        self._processor_manager = processor_manager

    async def on_task_new(self, task: Task):
        self.__enqueue_task(task, is_suspended=False)

        await self.__suspend_if_priority_is_higher(task)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        task_not_found = (
            task_cancel.task_id not in self._processing_task_ids
            and task_cancel.task_id not in self._queued_task_id_to_task
        )
        if task_not_found:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
                )
            )
            return

        if task_cancel.task_id in self._processing_task_ids and not task_cancel.flags.force:
            # ignore cancel task while in processing if is not force cancel
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
                )
            )
            return

        # A suspended task will be both processing AND queued

        if task_cancel.task_id in self._processing_task_ids:
            # if task is in processing
            self._processing_task_ids.remove(task_cancel.task_id)
            _ = await self._processor_manager.on_cancel_task(task_cancel.task_id)
        else:
            # if task is queued
            assert task_cancel.task_id in self._queued_task_id_to_task
            self._queued_task_ids.remove(task_cancel.task_id)
            _ = self._queued_task_id_to_task.pop(task_cancel.task_id)

        await self._connector_external.send(
            TaskCancelConfirm.new_msg(task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled)
        )

    async def on_task_result(self, result: TaskResult):
        if result.task_id in self._queued_task_id_to_task:
            # Finishing a queued task might happen if a task ended during the suspension process.
            self._queued_task_id_to_task.pop(result.task_id)
            self._queued_task_ids.remove(result.task_id)

        self._processing_task_ids.remove(result.task_id)

        await self._connector_external.send(result)

    async def routine(self):
        await self.__processing_task()

    def get_queued_size(self):
        return self._queued_task_ids.qsize()

    async def __processing_task(self):
        await self._processor_manager.wait_until_can_accept_task()

        _, task_id = await self._queued_task_ids.get()
        task = self._queued_task_id_to_task.pop(task_id)

        if task_id not in self._processing_task_ids:
            self._processing_task_ids.add(task_id)
            await self._processor_manager.on_task(task)
        else:
            self._processor_manager.on_resume_task(task_id)

    async def __suspend_if_priority_is_higher(self, new_task: Task):
        current_task = self._processor_manager.current_task()

        if current_task is None:
            return

        new_task_priority = self.__get_task_priority(new_task)
        current_task_priority = self.__get_task_priority(current_task)

        if new_task_priority <= current_task_priority:
            return

        self.__enqueue_task(current_task, is_suspended=True)

        await self._processor_manager.on_suspend_task(current_task.task_id)

    def __enqueue_task(self, task: Task, is_suspended: bool):
        task_priority = self.__get_task_priority(task)

        # Higher-priority tasks have a higher priority value. But as the queue is sorted by increasing order, we negate
        # the inserted value that it will be at the head of the queue.
        if is_suspended:
            queue_priority = (-task_priority, _SUSPENDED_TASKS_PRIORITY)
        else:
            queue_priority = (-task_priority, _QUEUED_TASKS_PRIORITY)

        self._queued_task_ids.put_nowait((queue_priority, task.task_id))
        self._queued_task_id_to_task[task.task_id] = task

    @staticmethod
    def __get_task_priority(task: Task) -> int:
        priority = retrieve_task_flags_from_task(task).priority

        if priority < 0:
            raise ValueError(f"invalid task priority, must be positive or zero, got {priority}")

        return priority
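The ordering described in the constructor comment above can be checked with plain `heapq` tuples. In the sketch below, a sequence number stands in for `AsyncSortedPriorityQueue`'s oldest-first tie-breaking, and the task names are illustrative.

import heapq

_SUSPENDED, _QUEUED = 1, 2

# Arrival order from the comment: task-1 (p=0, suspended),
# task-2 (p=3, suspended), task-3 (p=3), task-4 (p=0).
arrivals = [
    ("task-1", 0, _SUSPENDED),
    ("task-2", 3, _SUSPENDED),
    ("task-3", 3, _QUEUED),
    ("task-4", 0, _QUEUED),
]

heap = []
for seq, (name, priority, kind) in enumerate(arrivals):
    # Negate the task priority so higher-priority tasks sort first; on
    # ties, suspended tasks (1) sort before newly queued tasks (2), and
    # the sequence number keeps equal entries in arrival order.
    heapq.heappush(heap, ((-priority, kind), seq, name))

order = [heapq.heappop(heap)[2] for _ in range(len(heap))]
print(order)  # ['task-2', 'task-3', 'task-1', 'task-4']

This reproduces the 2-3-1-4 execution order the comment promises, which is why `__enqueue_task()` can get away with a single tuple insert instead of separate queues for suspended and fresh tasks.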
scaler/worker/agent/timeout_manager.py
@@ -0,0 +1,19 @@

import time

from scaler.utility.mixins import Looper
from scaler.worker.agent.mixins import TimeoutManager


class VanillaTimeoutManager(Looper, TimeoutManager):
    def __init__(self, death_timeout_seconds: int):
        self._death_timeout_seconds = death_timeout_seconds
        self._last_seen_time = time.time()

    def update_last_seen_time(self):
        self._last_seen_time = time.time()

    async def routine(self):
        if (time.time() - self._last_seen_time) < self._death_timeout_seconds:
            return

        raise TimeoutError("timeout when connect to scheduler, quitting")
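As with the other `Looper` classes in this diff, `routine()` is presumably called on a fixed interval by the worker agent, so the raised `TimeoutError` propagates out of the loop and brings the worker down when the scheduler goes silent. A minimal sketch of that interaction; the `run_looper` driver, its cadence, and the `DeadTimeout` name are illustrative, not Scaler's actual loop.

import asyncio
import time


class DeadTimeout:
    def __init__(self, death_timeout_seconds: float):
        self._death_timeout_seconds = death_timeout_seconds
        self._last_seen_time = time.time()

    def update_last_seen_time(self):
        # Would be called on each scheduler heartbeat.
        self._last_seen_time = time.time()

    async def routine(self):
        if (time.time() - self._last_seen_time) >= self._death_timeout_seconds:
            raise TimeoutError("no scheduler heartbeat, quitting")


async def run_looper(looper: DeadTimeout, interval_seconds: float = 0.1):
    while True:  # exits when routine() raises
        await looper.routine()
        await asyncio.sleep(interval_seconds)


try:
    # No heartbeats arrive, so the 0.3s death timeout fires.
    asyncio.run(run_looper(DeadTimeout(death_timeout_seconds=0.3)))
except TimeoutError as e:
    print(f"worker stopped: {e}")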