opengris-scaler 1.12.28__cp313-cp313-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opengris-scaler might be problematic. Click here for more details.
- opengris_scaler-1.12.28.dist-info/METADATA +728 -0
- opengris_scaler-1.12.28.dist-info/RECORD +187 -0
- opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +210 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +658 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +115 -0
- scaler/cluster/combo.py +150 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/defaults.py +94 -0
- scaler/config/loader.py +96 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +55 -0
- scaler/config/section/ecs_worker_adapter.py +85 -0
- scaler/config/section/native_worker_adapter.py +43 -0
- scaler/config/section/object_storage_server.py +8 -0
- scaler/config/section/scheduler.py +54 -0
- scaler/config/section/symphony_worker_adapter.py +47 -0
- scaler/config/section/top.py +13 -0
- scaler/config/section/webui.py +21 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +62 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +133 -0
- scaler/entry_points/object_storage_server.py +45 -0
- scaler/entry_points/scheduler.py +144 -0
- scaler/entry_points/top.py +286 -0
- scaler/entry_points/webui.py +48 -0
- scaler/entry_points/worker_adapter_ecs.py +191 -0
- scaler/entry_points/worker_adapter_native.py +137 -0
- scaler/entry_points/worker_adapter_symphony.py +98 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +247 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/constants.py +9 -0
- scaler/ui/live_display.py +147 -0
- scaler/ui/memory_window.py +146 -0
- scaler/ui/setting_page.py +40 -0
- scaler/ui/task_graph.py +832 -0
- scaler/ui/task_log.py +107 -0
- scaler/ui/utility.py +66 -0
- scaler/ui/webui.py +147 -0
- scaler/ui/worker_processors.py +104 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +107 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +269 -0
- scaler/worker_adapter/native.py +155 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +139 -0
- src/scaler/io/ymq/_ymq.so +0 -0
- src/scaler/object_storage/object_storage_server.so +0 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import functools
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import zmq.asyncio
|
|
6
|
+
|
|
7
|
+
from scaler.config.defaults import CLEANUP_INTERVAL_SECONDS, STATUS_REPORT_INTERVAL_SECONDS
|
|
8
|
+
from scaler.config.section.scheduler import SchedulerConfig
|
|
9
|
+
from scaler.config.types.zmq import ZMQConfig, ZMQType
|
|
10
|
+
from scaler.io.async_binder import ZMQAsyncBinder
|
|
11
|
+
from scaler.io.async_connector import ZMQAsyncConnector
|
|
12
|
+
from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
|
|
13
|
+
from scaler.io.utility import create_async_object_storage_connector
|
|
14
|
+
from scaler.protocol.python.common import ObjectStorageAddress
|
|
15
|
+
from scaler.protocol.python.message import (
|
|
16
|
+
ClientDisconnect,
|
|
17
|
+
ClientHeartbeat,
|
|
18
|
+
DisconnectRequest,
|
|
19
|
+
GraphTask,
|
|
20
|
+
InformationRequest,
|
|
21
|
+
ObjectInstruction,
|
|
22
|
+
Task,
|
|
23
|
+
TaskCancel,
|
|
24
|
+
TaskCancelConfirm,
|
|
25
|
+
TaskLog,
|
|
26
|
+
TaskResult,
|
|
27
|
+
WorkerHeartbeat,
|
|
28
|
+
)
|
|
29
|
+
from scaler.protocol.python.mixins import Message
|
|
30
|
+
from scaler.scheduler.controllers.balance_controller import VanillaBalanceController
|
|
31
|
+
from scaler.scheduler.controllers.client_controller import VanillaClientController
|
|
32
|
+
from scaler.scheduler.controllers.config_controller import VanillaConfigController
|
|
33
|
+
from scaler.scheduler.controllers.graph_controller import VanillaGraphTaskController
|
|
34
|
+
from scaler.scheduler.controllers.information_controller import VanillaInformationController
|
|
35
|
+
from scaler.scheduler.controllers.object_controller import VanillaObjectController
|
|
36
|
+
from scaler.scheduler.controllers.scaling_policies.utility import create_scaling_controller
|
|
37
|
+
from scaler.scheduler.controllers.task_controller import VanillaTaskController
|
|
38
|
+
from scaler.scheduler.controllers.worker_controller import VanillaWorkerController
|
|
39
|
+
from scaler.utility.event_loop import create_async_loop_routine
|
|
40
|
+
from scaler.utility.exceptions import ClientShutdownException
|
|
41
|
+
from scaler.utility.identifiers import ClientID, WorkerID
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Scheduler:
    """Central scheduler process.

    Binds the ZMQ frontend, connects to the object storage server, constructs
    all controllers, and routes every incoming message to the controller that
    owns it.  Run via ``get_loops()``, which drives all controller routines
    until cancellation or client shutdown.
    """

    def __init__(self, config: SchedulerConfig):
        self._config_controller = VanillaConfigController(config)

        if config.scheduler_address.type != ZMQType.tcp:
            # The port+1 / port+2 derivations below only make sense for tcp addresses.
            # FIX: the original built this message with a backslash continuation
            # inside the f-string, embedding a run of indentation spaces.
            raise TypeError(
                f"{self.__class__.__name__}: scheduler address must be tcp type: "
                f"{config.scheduler_address.to_address()}"
            )

        if config.object_storage_address is None:
            # default: object storage sits next to the scheduler on port + 1
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.scheduler_address.host, port=config.scheduler_address.port + 1
            )
        else:
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.object_storage_address.host, port=config.object_storage_address.port
            )
        self._config_controller.update_config("object_storage_address", object_storage_address)

        if config.monitor_address is None:
            # default: monitor endpoint sits next to the scheduler on port + 2
            monitor_address = ZMQConfig(
                type=ZMQType.tcp, host=config.scheduler_address.host, port=config.scheduler_address.port + 2
            )
        else:
            monitor_address = config.monitor_address
        self._config_controller.update_config("monitor_address", monitor_address)

        self._context = zmq.asyncio.Context(io_threads=config.io_threads)

        # frontend binder: clients and workers connect here
        self._binder: AsyncBinder = ZMQAsyncBinder(
            context=self._context, name="scheduler", address=config.scheduler_address
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler address {config.scheduler_address}")

        # connection is deferred to connect_to_storage(); only constructed here
        self._connector_storage: AsyncObjectStorageConnector = create_async_object_storage_connector()
        logging.info(f"{self.__class__.__name__}: connect to object storage server {object_storage_address!r}")

        # PUB socket broadcasting status/monitoring messages
        self._binder_monitor: AsyncConnector = ZMQAsyncConnector(
            context=self._context,
            name="scheduler_monitor",
            socket_type=zmq.PUB,
            address=monitor_address,
            bind_or_connect="bind",
            callback=None,
            identity=None,
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler monitor address {monitor_address.to_address()}")

        self._task_allocate_policy = config.allocate_policy.value()

        self._client_manager = VanillaClientController(config_controller=self._config_controller)
        self._object_controller = VanillaObjectController(config_controller=self._config_controller)
        self._graph_controller = VanillaGraphTaskController(config_controller=self._config_controller)
        self._task_controller = VanillaTaskController(config_controller=self._config_controller)
        self._worker_controller = VanillaWorkerController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._balance_controller = VanillaBalanceController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._information_controller = VanillaInformationController(config_controller=self._config_controller)
        self._scaling_controller = create_scaling_controller(
            config.scaling_controller_strategy, config.adapter_webhook_urls
        )

        # register: wire each controller to the peers it needs
        self._binder.register(self.on_receive_message)
        self._client_manager.register(
            self._binder, self._binder_monitor, self._object_controller, self._task_controller, self._worker_controller
        )
        self._object_controller.register(
            self._binder, self._binder_monitor, self._connector_storage, self._client_manager, self._worker_controller
        )
        self._graph_controller.register(
            self._binder,
            self._binder_monitor,
            self._connector_storage,
            self._client_manager,
            self._task_controller,
            self._object_controller,
        )
        self._task_controller.register(
            self._binder,
            self._binder_monitor,
            self._client_manager,
            self._object_controller,
            self._worker_controller,
            self._graph_controller,
        )
        self._worker_controller.register(self._binder, self._binder_monitor, self._task_controller)
        self._balance_controller.register(self._binder, self._binder_monitor, self._task_controller)

        self._information_controller.register_managers(
            self._binder_monitor,
            self._binder,
            self._client_manager,
            self._object_controller,
            self._task_controller,
            self._worker_controller,
            self._scaling_controller,
        )

    async def connect_to_storage(self):
        """Connect the storage connector to the configured object storage address."""
        object_storage_address = self._config_controller.get_config("object_storage_address")
        await self._connector_storage.connect(object_storage_address.host, object_storage_address.port)

    async def on_receive_message(self, source: bytes, message: Message):
        """Dispatch *message* from peer *source* to the owning controller.

        Each handled message type returns immediately; anything that falls
        through every branch is logged as unknown.
        """
        # =====================================================================================
        # client manager
        if isinstance(message, ClientHeartbeat):
            await self._client_manager.on_heartbeat(ClientID(source), message)
            return

        # scheduler receives client shutdown request from upstream
        if isinstance(message, ClientDisconnect):
            await self._client_manager.on_client_disconnect(ClientID(source), message)
            return

        # =====================================================================================
        # graph manager
        if isinstance(message, GraphTask):
            await self._graph_controller.on_graph_task(ClientID(source), message)
            return

        # =====================================================================================
        # task manager
        if isinstance(message, Task):
            await self._task_controller.on_task_new(message)
            return

        if isinstance(message, TaskCancel):
            # graph subtasks are canceled through the graph controller so the
            # whole graph's bookkeeping stays consistent
            if self._graph_controller.is_graph_subtask(message.task_id):
                await self._graph_controller.on_graph_task_cancel(message)
            else:
                await self._task_controller.on_task_cancel(ClientID(source), message)
            return

        if isinstance(message, TaskCancelConfirm):
            await self._task_controller.on_task_cancel_confirm(message)
            return

        if isinstance(message, TaskResult):
            await self._task_controller.on_task_result(message)
            return

        if isinstance(message, TaskLog):
            # forward task logs straight to the owning client, if still known
            client = self._client_manager.get_client_id(message.task_id)
            if client is not None:
                await self._binder.send(client, message)
            return

        # =====================================================================================
        # worker manager
        if isinstance(message, WorkerHeartbeat):
            await self._worker_controller.on_heartbeat(WorkerID(source), message)
            return

        # scheduler receives worker disconnect request from downstream
        if isinstance(message, DisconnectRequest):
            await self._worker_controller.on_disconnect(WorkerID(source), message)
            return

        # =====================================================================================
        # object manager
        if isinstance(message, ObjectInstruction):
            await self._object_controller.on_object_instruction(source, message)
            return

        # =====================================================================================
        # information manager
        if isinstance(message, InformationRequest):
            await self._information_controller.on_request(message)
            # FIX: the original fell through here, logging a spurious
            # "unknown message" error for every InformationRequest
            return

        logging.error(f"{self.__class__.__name__}: unknown message from {source=}: {message}")

    async def get_loops(self):
        """Run all controller routines concurrently until cancellation or shutdown."""
        await self.connect_to_storage()

        loops = [
            create_async_loop_routine(self._binder.routine, 0),
            create_async_loop_routine(self._connector_storage.routine, 0),
            create_async_loop_routine(self._graph_controller.routine, 0),
            create_async_loop_routine(
                self._balance_controller.routine, self._config_controller.get_config("load_balance_seconds")
            ),
            create_async_loop_routine(self._client_manager.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._object_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._worker_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._information_controller.routine, STATUS_REPORT_INTERVAL_SECONDS),
        ]

        try:
            await asyncio.gather(*loops)
        except asyncio.CancelledError:
            # normal cancellation path; fall through to cleanup
            pass
        except ClientShutdownException as e:
            logging.info(f"{self.__class__.__name__}: {e}")

        self._binder.destroy()
        self._binder_monitor.destroy()
+
|
|
248
|
+
@functools.wraps(Scheduler)
async def scheduler_main(*args, **kwargs):
    """Entry-point coroutine: build a Scheduler and drive its loops to completion.

    ``functools.wraps(Scheduler)`` copies the class's name/docstring metadata
    onto this coroutine so CLI help and logs refer to ``Scheduler``.
    All arguments are forwarded verbatim to ``Scheduler.__init__``.
    """
    scheduler = Scheduler(*args, **kwargs)
    await scheduler.get_loops()
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
from scaler.protocol.python.common import TaskState, TaskTransition
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TaskStateMachine:
    """Finite state machine tracking one task's lifecycle.

    Valid transitions are encoded in ``TRANSITION_MAP``; anything not listed
    there is rejected by :meth:`on_transition`.

    see https://github.com/finos/opengris-scaler/issues/56
    """

    TRANSITION_MAP: Dict[TaskState, Dict[TaskTransition, TaskState]] = {
        TaskState.Inactive: {
            TaskTransition.HasCapacity: TaskState.Running,
            TaskTransition.TaskCancel: TaskState.Canceled,
        },
        TaskState.Canceling: {
            TaskTransition.TaskCancelConfirmCanceled: TaskState.Canceled,
            TaskTransition.WorkerDisconnect: TaskState.Canceled,
            TaskTransition.TaskCancelConfirmFailed: TaskState.Running,
            TaskTransition.TaskCancelConfirmNotFound: TaskState.CanceledNotFound,
        },
        TaskState.Running: {
            TaskTransition.TaskResultSuccess: TaskState.Success,
            TaskTransition.TaskResultFailed: TaskState.Failed,
            TaskTransition.TaskResultWorkerDied: TaskState.FailedWorkerDied,
            TaskTransition.TaskCancel: TaskState.Canceling,
            TaskTransition.BalanceTaskCancel: TaskState.BalanceCanceling,
            TaskTransition.WorkerDisconnect: TaskState.WorkerDisconnecting,
        },
        TaskState.BalanceCanceling: {
            TaskTransition.TaskResultSuccess: TaskState.Success,
            TaskTransition.TaskResultFailed: TaskState.Failed,
            TaskTransition.TaskResultWorkerDied: TaskState.FailedWorkerDied,
            TaskTransition.TaskCancel: TaskState.Canceling,
            TaskTransition.TaskCancelConfirmCanceled: TaskState.Inactive,
            TaskTransition.TaskCancelConfirmFailed: TaskState.Running,
            TaskTransition.WorkerDisconnect: TaskState.WorkerDisconnecting,
        },
        TaskState.WorkerDisconnecting: {
            TaskTransition.SchedulerHasTask: TaskState.Inactive,
            TaskTransition.SchedulerHasNoTask: TaskState.FailedWorkerDied,
        },
    }

    def __init__(self, debug):
        # debug=True records every accepted transition for get_path()
        self._debug = debug
        self._paths = []

        self._previous_state = None
        self._state = TaskState.Inactive

    def __repr__(self):
        return f"TaskStateMachine(previous_state={self._previous_state}, state={self._state})"

    def get_path(self):
        """Render the recorded transition history, e.g. ``[Inactive] -HasCapacity-> [Running]``."""
        segments = [f"[{src.name}] -{step.name}->" for src, step in self._paths]
        return " ".join(segments) + f" [{self._state.name}]"

    def previous_state(self) -> Optional[TaskState]:
        """State before the last accepted transition, or None if none occurred yet."""
        return self._previous_state

    def current_state(self) -> TaskState:
        """The state the machine is currently in."""
        return self._state

    def is_running(self) -> bool:
        return self._state == TaskState.Running

    def is_canceling(self) -> bool:
        return self._state == TaskState.Canceling

    def is_finished(self) -> bool:
        """True once the task reached a terminal success/failure state."""
        return self._state in (TaskState.Success, TaskState.Failed, TaskState.FailedWorkerDied)

    def is_canceled(self) -> bool:
        """True once the task reached a terminal canceled state."""
        return self._state in (TaskState.Canceled, TaskState.CanceledNotFound)

    def is_done(self) -> bool:
        """True in any terminal state (finished or canceled)."""
        return self.is_finished() or self.is_canceled()

    def on_transition(self, transition: TaskTransition) -> bool:
        """Apply *transition* if legal from the current state.

        Returns True and advances the machine on success; returns False and
        leaves the state untouched when the transition is not allowed.
        """
        legal_moves = TaskStateMachine.TRANSITION_MAP.get(self._state)
        if legal_moves is None:
            # terminal states have no outgoing edges at all
            return False

        next_state = legal_moves.get(transition)
        if next_state is None:
            return False

        if self._debug:
            self._paths.append((self._state, transition))

        self._previous_state, self._state = self._state, next_state
        return True
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
from scaler.protocol.python.common import TaskState, TaskTransition
|
|
5
|
+
from scaler.scheduler.task.task_state_machine import TaskStateMachine
|
|
6
|
+
from scaler.utility.identifiers import TaskID
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TaskStateManager:
    """Owns one TaskStateMachine per task id and keeps a per-state task count."""

    def __init__(self, debug: bool):
        # debug=True makes each state machine record its transition path
        self._debug = debug
        self._task_id_to_state_machine: Dict[TaskID, TaskStateMachine] = dict()
        # running count of tasks currently in each state
        self._statistics: Dict[TaskState, int] = {state: 0 for state in TaskState}

    def add_state_machine(self, task_id: TaskID) -> TaskStateMachine:
        """Create, register and return a fresh state machine for *task_id*.

        The task id must not already be registered.
        """
        # NOTE(review): assert is stripped under -O; callers must guarantee uniqueness
        assert task_id not in self._task_id_to_state_machine

        state_machine = TaskStateMachine(self._debug)
        self._task_id_to_state_machine[task_id] = state_machine
        # a new machine starts in its initial state; count it immediately
        self._statistics[state_machine.current_state()] += 1
        return state_machine

    def remove_state_machine(self, task_id: TaskID):
        """Drop the state machine for *task_id*; raises KeyError if unknown.

        NOTE(review): the removed task's state is NOT decremented from
        ``_statistics`` — confirm whether the counters are meant to be
        cumulative or live.
        """
        self._task_id_to_state_machine.pop(task_id)

    def get_state_machine(self, task_id: TaskID) -> Optional[TaskStateMachine]:
        """Return the state machine for *task_id*, or None if not registered."""
        return self._task_id_to_state_machine.get(task_id, None)

    def on_transition(self, task_id: TaskID, transition: TaskTransition) -> Optional[TaskStateMachine]:
        """Apply *transition* to the task's state machine.

        Returns the TaskStateMachine associated with *task_id* if the
        transition was accepted, None otherwise (unknown task id or illegal
        transition; both cases are logged as errors).

        This should be a central place to synchronize task state machines: if
        any unexpected event happened, it will not return the state machine.
        """

        task_state_machine = self._task_id_to_state_machine.get(task_id, None)
        if task_state_machine is None:
            logging.error(f"{task_id!r}: unknown {transition=} for non-existed state machine")
            return None

        transit_success = task_state_machine.on_transition(transition)
        if transit_success:
            # move one count from the old state bucket to the new one
            self._statistics[task_state_machine.previous_state()] -= 1
            self._statistics[task_state_machine.current_state()] += 1
        else:
            logging.error(
                f"{task_id!r}: cannot apply {transition} to current state" f" {task_state_machine.current_state()}"
            )

        return task_state_machine if transit_success else None

    def get_statistics(self) -> Dict[TaskState, int]:
        """Return the live per-state task counts (the internal dict, not a copy)."""
        return self._statistics

    def get_debug_paths(self) -> str:
        """One line per tracked task: its id and recorded transition path."""
        return "\n".join(
            f"{task_id!r}: {state_machine.get_path()}"
            for task_id, state_machine in self._task_id_to_state_machine.items()
        )
|
scaler/ui/__init__.py
ADDED
|
File without changes
|
scaler/ui/constants.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from typing import Dict, List, Optional, Set
|
|
4
|
+
|
|
5
|
+
from nicegui import ui
|
|
6
|
+
from nicegui.element import Element
|
|
7
|
+
|
|
8
|
+
from scaler.protocol.python.common import WorkerState
|
|
9
|
+
from scaler.protocol.python.message import StateTask, StateWorker
|
|
10
|
+
from scaler.protocol.python.status import WorkerStatus
|
|
11
|
+
from scaler.ui.utility import display_capabilities, format_worker_name
|
|
12
|
+
from scaler.utility.formatter import format_microseconds, format_seconds
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclasses.dataclass
class SchedulerSection:
    """WebUI card showing the scheduler's own CPU/RSS figures.

    The string fields are updated elsewhere; labels created in
    ``draw_section`` pick up changes through NiceGUI's binding.
    """

    cpu: str = dataclasses.field(default="")
    rss: str = dataclasses.field(default="")
    rss_free: str = dataclasses.field(default="")

    # root row element of the rendered card; set by draw_section()
    handler: Optional[Element] = dataclasses.field(default=None)

    def handle_task_state(self, _: StateTask):
        # scheduler card has no per-task display; intentionally a no-op
        return

    def handle_worker_state(self, _: StateWorker):
        # scheduler card has no per-worker display; intentionally a no-op
        return

    def draw_section(self):
        """Render the card; empty labels act as spacers between the pairs."""
        with ui.card().classes("w-full"), ui.row() as handler:
            self.handler = handler
            ui.label("Scheduler")
            ui.label()
            ui.label("CPU:")
            ui.label().bind_text_from(self, "cpu")
            ui.label()
            ui.label("RSS:")
            ui.label().bind_text_from(self, "rss")
            ui.label()
            ui.label("RSS Free:")
            ui.label().bind_text_from(self, "rss_free")

    def delete_section(self):
        """Remove the rendered card (children first, then the row itself)."""
        self.handler.clear()
        self.handler.delete()
|
47
|
+
|
|
48
|
+
@dataclasses.dataclass
class WorkerRow:
    """One worker's row in the WebUI workers table.

    Fields are refreshed by :meth:`populate` from a WorkerStatus message;
    the NiceGUI elements created in :meth:`draw_row` stay bound to them.
    """

    worker: str = dataclasses.field(default="")
    agt_cpu: float = dataclasses.field(default=0)
    agt_rss: int = dataclasses.field(default=0)
    cpu: float = dataclasses.field(default=0)
    rss: int = dataclasses.field(default=0)
    rss_free: int = dataclasses.field(default=0)
    free: int = dataclasses.field(default=0)
    sent: int = dataclasses.field(default=0)
    queued: int = dataclasses.field(default=0)
    suspended: int = dataclasses.field(default=0)
    lag: str = dataclasses.field(default="")
    itl: str = dataclasses.field(default="")
    last_seen: str = dataclasses.field(default="")
    capabilities: Set[str] = dataclasses.field(default_factory=set)
    display_capabilities: str = dataclasses.field(default="")

    # elements created by draw_row(), so delete_row() can remove them
    handlers: List[Element] = dataclasses.field(default_factory=list)

    def populate(self, data: WorkerStatus):
        """Refresh the displayed figures from a WorkerStatus message."""
        self.worker = data.worker_id.decode()
        # NOTE(review): cpu appears to be permille (divided by 10 to get %),
        # rss in bytes (divided by 1e6 to get MB) — confirm against producer
        self.agt_cpu = data.agent.cpu / 10
        self.agt_rss = int(data.agent.rss / 1e6)
        self.cpu = sum(p.resource.cpu for p in data.processor_statuses) / 10
        self.rss = int(sum(p.resource.rss for p in data.processor_statuses) / 1e6)
        self.rss_free = int(data.rss_free / 1e6)
        self.free = data.free
        self.sent = data.sent
        self.queued = data.queued
        self.suspended = data.suspended
        self.lag = format_microseconds(data.lag_us)
        self.itl = data.itl
        self.last_seen = format_seconds(data.last_s)

    def set_capabilities(self, capabilities: Set[str]):
        """Store the worker's capability set and its rendered display string."""
        self.capabilities = capabilities
        self.display_capabilities = display_capabilities(self.capabilities)

    def draw_row(self):
        """Render this worker's cells, recording every created element in
        ``self.handlers`` so :meth:`delete_row` can remove them later.

        FIX: the original never populated ``handlers``, leaving delete_row a
        no-op, and bound the ITL label to the nonexistent attribute "ITL"
        (the field is ``itl``).
        """
        total_rss = self.rss + self.rss_free

        self.handlers = [
            ui.label(format_worker_name(self.worker)),
            ui.knob(track_color="grey-2", show_value=True, min=0, max=100).bind_value_from(self, "agt_cpu"),
            ui.knob(track_color="grey-2", show_value=True, min=0, max=total_rss).bind_value_from(self, "agt_rss"),
            ui.knob(track_color="grey-2", show_value=True, min=0, max=100).bind_value_from(self, "cpu"),
            ui.knob(track_color="grey-2", show_value=True, min=0, max=total_rss).bind_value_from(self, "rss"),
            ui.label().bind_text_from(self, "free"),
            ui.label().bind_text_from(self, "sent"),
            ui.label().bind_text_from(self, "queued"),
            ui.label().bind_text_from(self, "suspended"),
            ui.label().bind_text_from(self, "lag"),
            ui.label().bind_text_from(self, "itl"),
            ui.label().bind_text_from(self, "last_seen"),
            ui.label().bind_text_from(self, "display_capabilities"),
        ]

    def delete_row(self):
        """Delete every element this row created; safe to call repeatedly."""
        for element in self.handlers:
            element.delete()
        self.handlers.clear()
|
+
|
|
108
|
+
|
|
109
|
+
@dataclasses.dataclass
class WorkersSection:
    """WebUI grid listing every connected worker as a WorkerRow."""

    # defaultdict: accessing an unknown worker id creates an empty row
    workers: Dict[str, WorkerRow] = dataclasses.field(default_factory=lambda: defaultdict(WorkerRow))

    @ui.refreshable
    def draw_section(self):
        """Render the whole grid; re-entered on draw_section.refresh()."""
        with ui.row().classes("h-max"), ui.card().classes("w-full"), ui.grid(columns=13):
            self.__draw_titles()
            for worker_row in self.workers.values():
                worker_row.draw_row()

    def handle_task_state(self, _: StateTask):
        # workers grid has no per-task display; intentionally a no-op
        return

    def handle_worker_state(self, state_worker: StateWorker):
        """Track worker connect/disconnect events.

        NOTE(review): the grid is only refreshed on Disconnected; newly
        Connected workers presumably appear via a periodic refresh elsewhere
        — confirm against the page's update loop.
        """
        worker_id = state_worker.worker_id.decode()
        state = state_worker.state

        if state == WorkerState.Connected:
            # capability names only; capability values are not displayed here
            self.workers[worker_id].set_capabilities(set(state_worker.capabilities.keys()))
        if state == WorkerState.Disconnected:
            self.workers.pop(worker_id, None)
            self.draw_section.refresh()

    @staticmethod
    def __draw_titles():
        # header cells; must stay in sync with WorkerRow.draw_row's 13 columns
        ui.label("Worker")
        ui.label("Agt CPU %")
        ui.label("Agt RSS (in MB)")
        ui.label("Processors CPU %")
        ui.label("Processors RSS (in MB)")
        ui.label("Queue Capacity")
        ui.label("Tasks Sent")
        ui.label("Tasks Queued")
        ui.label("Tasks Suspended")
        ui.label("Lag")
        ui.label("ITL")
        ui.label("Last Seen")
        ui.label("Capabilities")
|