opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,251 @@
1
+ import asyncio
2
+ import functools
3
+ import logging
4
+
5
+ import zmq.asyncio
6
+
7
+ from scaler.config.defaults import CLEANUP_INTERVAL_SECONDS, STATUS_REPORT_INTERVAL_SECONDS
8
+ from scaler.config.section.scheduler import SchedulerConfig
9
+ from scaler.config.types.zmq import ZMQConfig, ZMQType
10
+ from scaler.io.async_binder import ZMQAsyncBinder
11
+ from scaler.io.async_connector import ZMQAsyncConnector
12
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
13
+ from scaler.io.utility import create_async_object_storage_connector
14
+ from scaler.protocol.python.common import ObjectStorageAddress
15
+ from scaler.protocol.python.message import (
16
+ ClientDisconnect,
17
+ ClientHeartbeat,
18
+ DisconnectRequest,
19
+ GraphTask,
20
+ InformationRequest,
21
+ ObjectInstruction,
22
+ Task,
23
+ TaskCancel,
24
+ TaskCancelConfirm,
25
+ TaskLog,
26
+ TaskResult,
27
+ WorkerHeartbeat,
28
+ )
29
+ from scaler.protocol.python.mixins import Message
30
+ from scaler.scheduler.controllers.balance_controller import VanillaBalanceController
31
+ from scaler.scheduler.controllers.client_controller import VanillaClientController
32
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
33
+ from scaler.scheduler.controllers.graph_controller import VanillaGraphTaskController
34
+ from scaler.scheduler.controllers.information_controller import VanillaInformationController
35
+ from scaler.scheduler.controllers.object_controller import VanillaObjectController
36
+ from scaler.scheduler.controllers.scaling_policies.utility import create_scaling_controller
37
+ from scaler.scheduler.controllers.task_controller import VanillaTaskController
38
+ from scaler.scheduler.controllers.worker_controller import VanillaWorkerController
39
+ from scaler.utility.event_loop import create_async_loop_routine
40
+ from scaler.utility.exceptions import ClientShutdownException
41
+ from scaler.utility.identifiers import ClientID, WorkerID
42
+
43
+
44
class Scheduler:
    """Owns the scheduler's sockets and controllers and dispatches incoming messages to them."""

    def __init__(self, config: SchedulerConfig):
        """Create the sockets and controllers and register them with each other.

        When ``config.object_storage_address`` is None the object storage server is expected on the scheduler host at
        ``scheduler port + 1``; when ``config.monitor_address`` is None the monitor socket is bound on the scheduler
        host at ``scheduler port + 2``. Both resolved addresses are written back through the config controller.

        :param config: scheduler configuration
        :raises TypeError: if ``config.scheduler_address`` is not a tcp address
        """
        self._config_controller = VanillaConfigController(config)

        if config.scheduler_address.type != ZMQType.tcp:
            # adjacent f-strings instead of a backslash continuation inside the literal, which would embed the
            # source indentation into the error message
            raise TypeError(
                f"{self.__class__.__name__}: scheduler address must be tcp type: "
                f"{config.scheduler_address.to_address()}"
            )

        if config.object_storage_address is None:
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.scheduler_address.host, port=config.scheduler_address.port + 1
            )
        else:
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.object_storage_address.host, port=config.object_storage_address.port
            )
        self._config_controller.update_config("object_storage_address", object_storage_address)

        if config.monitor_address is None:
            monitor_address = ZMQConfig(
                type=ZMQType.tcp, host=config.scheduler_address.host, port=config.scheduler_address.port + 2
            )
        else:
            monitor_address = config.monitor_address
        self._config_controller.update_config("monitor_address", monitor_address)

        self._context = zmq.asyncio.Context(io_threads=config.worker_io_threads)

        self._binder: AsyncBinder = ZMQAsyncBinder(
            context=self._context, name="scheduler", address=config.scheduler_address
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler address {config.scheduler_address}")

        # the storage connector is only created here; the actual connection happens in connect_to_storage()
        self._connector_storage: AsyncObjectStorageConnector = create_async_object_storage_connector()
        logging.info(f"{self.__class__.__name__}: connect to object storage server {object_storage_address!r}")

        # the monitor endpoint is a PUB socket that binds (rather than connects) to the monitor address
        self._binder_monitor: AsyncConnector = ZMQAsyncConnector(
            context=self._context,
            name="scheduler_monitor",
            socket_type=zmq.PUB,
            address=monitor_address,
            bind_or_connect="bind",
            callback=None,
            identity=None,
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler monitor address {monitor_address.to_address()}")

        self._task_allocate_policy = config.allocate_policy.value()

        self._client_manager = VanillaClientController(config_controller=self._config_controller)
        self._object_controller = VanillaObjectController(config_controller=self._config_controller)
        self._graph_controller = VanillaGraphTaskController(config_controller=self._config_controller)
        self._task_controller = VanillaTaskController(config_controller=self._config_controller)
        # the worker and balance controllers share the same allocate policy instance
        self._worker_controller = VanillaWorkerController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._balance_controller = VanillaBalanceController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._information_controller = VanillaInformationController(config_controller=self._config_controller)
        self._scaling_controller = create_scaling_controller(
            config.scaling_controller_strategy, config.adapter_webhook_urls
        )

        # register
        self._binder.register(self.on_receive_message)
        self._client_manager.register(
            self._binder, self._binder_monitor, self._object_controller, self._task_controller, self._worker_controller
        )
        self._object_controller.register(
            self._binder, self._binder_monitor, self._connector_storage, self._client_manager, self._worker_controller
        )
        self._graph_controller.register(
            self._binder,
            self._binder_monitor,
            self._connector_storage,
            self._client_manager,
            self._task_controller,
            self._object_controller,
        )
        self._task_controller.register(
            self._binder,
            self._binder_monitor,
            self._client_manager,
            self._object_controller,
            self._worker_controller,
            self._graph_controller,
        )
        self._worker_controller.register(self._binder, self._binder_monitor, self._task_controller)
        self._balance_controller.register(self._binder, self._binder_monitor, self._task_controller)

        self._information_controller.register_managers(
            self._binder_monitor,
            self._binder,
            self._client_manager,
            self._object_controller,
            self._task_controller,
            self._worker_controller,
            self._scaling_controller,
        )

    async def connect_to_storage(self):
        """Connect the storage connector to the object storage address held by the config controller."""
        object_storage_address = self._config_controller.get_config("object_storage_address")
        await self._connector_storage.connect(object_storage_address.host, object_storage_address.port)

    async def on_receive_message(self, source: bytes, message: Message):
        """Dispatch one incoming message to the controller responsible for its type.

        Messages whose type matches none of the branches below are logged as errors.

        :param source: identity of the sender as received from the binder
        :param message: decoded protocol message
        """
        # =====================================================================================
        # client manager
        if isinstance(message, ClientHeartbeat):
            await self._client_manager.on_heartbeat(ClientID(source), message)
            return

        # scheduler receives client shutdown request from upstream
        if isinstance(message, ClientDisconnect):
            await self._client_manager.on_client_disconnect(ClientID(source), message)
            return

        # =====================================================================================
        # graph manager
        if isinstance(message, GraphTask):
            await self._graph_controller.on_graph_task(ClientID(source), message)
            return

        # =====================================================================================
        # task manager
        if isinstance(message, Task):
            await self._task_controller.on_task_new(message)
            return

        if isinstance(message, TaskCancel):
            # cancels of graph sub-tasks go through the graph controller, all others to the task controller
            if self._graph_controller.is_graph_subtask(message.task_id):
                await self._graph_controller.on_graph_task_cancel(message)
            else:
                await self._task_controller.on_task_cancel(ClientID(source), message)
            return

        if isinstance(message, TaskCancelConfirm):
            await self._task_controller.on_task_cancel_confirm(message)
            return

        if isinstance(message, TaskResult):
            await self._task_controller.on_task_result(message)
            return

        if isinstance(message, TaskLog):
            # forward the log to the client owning the task; dropped when no client is found
            client = self._client_manager.get_client_id(message.task_id)
            if client is not None:
                await self._binder.send(client, message)
            return

        # =====================================================================================
        # worker manager
        if isinstance(message, WorkerHeartbeat):
            await self._worker_controller.on_heartbeat(WorkerID(source), message)
            return

        # scheduler receives worker disconnect request from downstream
        if isinstance(message, DisconnectRequest):
            await self._worker_controller.on_disconnect(WorkerID(source), message)
            return

        # =====================================================================================
        # object manager
        if isinstance(message, ObjectInstruction):
            await self._object_controller.on_object_instruction(source, message)
            return

        # =====================================================================================
        # information manager
        if isinstance(message, InformationRequest):
            await self._information_controller.on_request(message)
            # without this return a handled request would also be reported as an unknown message below
            return

        logging.error(f"{self.__class__.__name__}: unknown message from {source=}: {message}")

    async def get_loops(self):
        """Connect to object storage, then run every controller routine until cancelled or shut down.

        ``asyncio.CancelledError`` and ``ClientShutdownException`` end the run quietly; afterwards the scheduler
        binder and the monitor socket are destroyed. Any other exception propagates to the caller.
        """
        await self.connect_to_storage()

        loops = [
            create_async_loop_routine(self._binder.routine, 0),
            create_async_loop_routine(self._connector_storage.routine, 0),
            create_async_loop_routine(self._graph_controller.routine, 0),
            create_async_loop_routine(
                self._balance_controller.routine, self._config_controller.get_config("load_balance_seconds")
            ),
            create_async_loop_routine(self._client_manager.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._object_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._worker_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._information_controller.routine, STATUS_REPORT_INTERVAL_SECONDS),
        ]

        try:
            await asyncio.gather(*loops)
        except asyncio.CancelledError:
            pass
        except ClientShutdownException as e:
            logging.info(f"{self.__class__.__name__}: {e}")

        self._binder.destroy()
        self._binder_monitor.destroy()
246
+
247
+
248
# Coroutine entry point: builds a Scheduler from the given arguments and runs its loops to completion.
# functools.wraps copies Scheduler's metadata (name, doc, ...) onto this function, so a docstring written
# here would be overwritten; the description therefore lives in this comment.
@functools.wraps(Scheduler)
async def scheduler_main(*args, **kwargs):
    await Scheduler(*args, **kwargs).get_loops()
File without changes
@@ -0,0 +1,92 @@
1
+ from typing import Dict, Optional
2
+
3
+ from scaler.protocol.python.common import TaskState, TaskTransition
4
+
5
+
6
class TaskStateMachine:
    """State machine for a single task, driven by ``TaskTransition`` events.

    A new machine starts in ``TaskState.Inactive``. States that do not appear as a key of ``TRANSITION_MAP``
    accept no further transition.
    """

    # see https://github.com/finos/opengris-scaler/issues/56
    # current state -> {transition -> next state}
    TRANSITION_MAP: Dict[TaskState, Dict[TaskTransition, TaskState]] = {
        TaskState.Inactive: {
            TaskTransition.HasCapacity: TaskState.Running,
            TaskTransition.TaskCancel: TaskState.Canceled,
        },
        TaskState.Canceling: {
            TaskTransition.TaskCancelConfirmCanceled: TaskState.Canceled,
            TaskTransition.WorkerDisconnect: TaskState.Canceled,
            TaskTransition.TaskCancelConfirmFailed: TaskState.Running,
            TaskTransition.TaskCancelConfirmNotFound: TaskState.CanceledNotFound,
        },
        TaskState.Running: {
            TaskTransition.TaskResultSuccess: TaskState.Success,
            TaskTransition.TaskResultFailed: TaskState.Failed,
            TaskTransition.TaskResultWorkerDied: TaskState.FailedWorkerDied,
            TaskTransition.TaskCancel: TaskState.Canceling,
            TaskTransition.BalanceTaskCancel: TaskState.BalanceCanceling,
            TaskTransition.WorkerDisconnect: TaskState.WorkerDisconnecting,
        },
        TaskState.BalanceCanceling: {
            TaskTransition.TaskResultSuccess: TaskState.Success,
            TaskTransition.TaskResultFailed: TaskState.Failed,
            TaskTransition.TaskResultWorkerDied: TaskState.FailedWorkerDied,
            TaskTransition.TaskCancel: TaskState.Canceling,
            TaskTransition.TaskCancelConfirmCanceled: TaskState.Inactive,
            TaskTransition.TaskCancelConfirmFailed: TaskState.Running,
            TaskTransition.WorkerDisconnect: TaskState.WorkerDisconnecting,
        },
        TaskState.WorkerDisconnecting: {
            TaskTransition.SchedulerHasTask: TaskState.Inactive,
            TaskTransition.SchedulerHasNoTask: TaskState.FailedWorkerDied,
        },
    }

    def __init__(self, debug):
        """Start in ``Inactive``; when ``debug`` is truthy every applied transition is recorded for get_path()."""
        self._debug = debug
        self._paths = []

        self._previous_state = None
        self._state = TaskState.Inactive

    def __repr__(self):
        return f"TaskStateMachine(previous_state={self._previous_state}, state={self._state})"

    def get_path(self):
        """Render the recorded ``(state, transition)`` history followed by the current state as one string."""
        steps = [f"[{state.name}] -{transition.name}->" for state, transition in self._paths]
        return " ".join(steps) + f" [{self._state.name}]"

    def previous_state(self) -> Optional[TaskState]:
        """Return the state held before the last applied transition, or None if none was applied yet."""
        return self._previous_state

    def current_state(self) -> TaskState:
        """Return the state the task is currently in."""
        return self._state

    def is_running(self) -> bool:
        return self._state == TaskState.Running

    def is_canceling(self) -> bool:
        return self._state == TaskState.Canceling

    def is_finished(self) -> bool:
        """True when the task ended with a result: success, failure, or failure because the worker died."""
        return self._state in {TaskState.Success, TaskState.Failed, TaskState.FailedWorkerDied}

    def is_canceled(self) -> bool:
        """True when the task ended through cancellation (including the not-found variant)."""
        return self._state in {TaskState.Canceled, TaskState.CanceledNotFound}

    def is_done(self) -> bool:
        """True when the task is either finished or canceled."""
        if self.is_finished():
            return True
        return self.is_canceled()

    def on_transition(self, transition: TaskTransition) -> bool:
        """Apply ``transition`` to the current state.

        :return: True if the transition is allowed from the current state and was applied, False otherwise (the
            machine is left untouched in that case)
        """
        allowed = TaskStateMachine.TRANSITION_MAP.get(self._state)
        if allowed is None or transition not in allowed:
            return False

        if self._debug:
            self._paths.append((self._state, transition))

        self._previous_state, self._state = self._state, allowed[transition]
        return True
@@ -0,0 +1,61 @@
1
+ import logging
2
+ from typing import Dict, Optional
3
+
4
+ from scaler.protocol.python.common import TaskState, TaskTransition
5
+ from scaler.scheduler.task.task_state_machine import TaskStateMachine
6
+ from scaler.utility.identifiers import TaskID
7
+
8
+
9
class TaskStateManager:
    """Keeps one ``TaskStateMachine`` per task id plus a per-state counter."""

    def __init__(self, debug: bool):
        """:param debug: forwarded to every state machine created by this manager"""
        self._debug = debug
        self._task_id_to_state_machine: Dict[TaskID, TaskStateMachine] = {}
        self._statistics: Dict[TaskState, int] = dict.fromkeys(TaskState, 0)

    def add_state_machine(self, task_id: TaskID) -> TaskStateMachine:
        """Create, register and return a new state machine for ``task_id``.

        The task id must not be registered yet (checked with an assertion). The counter of the machine's initial
        state is incremented.
        """
        assert task_id not in self._task_id_to_state_machine

        machine = TaskStateMachine(self._debug)
        self._task_id_to_state_machine[task_id] = machine
        self._statistics[machine.current_state()] += 1
        return machine

    def remove_state_machine(self, task_id: TaskID):
        """Forget the state machine of ``task_id``; raises KeyError if it is unknown.

        The per-state counters are not touched by the removal.
        """
        del self._task_id_to_state_machine[task_id]

    def get_state_machine(self, task_id: TaskID) -> Optional[TaskStateMachine]:
        """Return the state machine of ``task_id``, or None if the task is unknown."""
        return self._task_id_to_state_machine.get(task_id)

    def on_transition(self, task_id: TaskID, transition: TaskTransition) -> Optional[TaskStateMachine]:
        """Apply ``transition`` to the state machine of ``task_id`` and keep the counters in sync.

        This is meant to be the single place through which task state machines are advanced.

        :return: the task's state machine if the transition was applied; None if the task is unknown or the
            transition is not allowed from its current state (both cases are logged as errors)
        """
        machine = self._task_id_to_state_machine.get(task_id)
        if machine is None:
            logging.error(f"{task_id!r}: unknown {transition=} for non-existed state machine")
            return None

        if not machine.on_transition(transition):
            logging.error(f"{task_id!r}: cannot apply {transition} to current state {machine.current_state()}")
            return None

        # move one count from the state that was left to the state that was entered
        self._statistics[machine.previous_state()] -= 1
        self._statistics[machine.current_state()] += 1
        return machine

    def get_statistics(self) -> Dict[TaskState, int]:
        """Return the live per-state counter dictionary (not a copy)."""
        return self._statistics

    def get_debug_paths(self):
        """Return one line per registered task: its id followed by the state machine's transition path."""
        lines = [
            f"{task_id!r}: {state_machine.get_path()}"
            for task_id, state_machine in self._task_id_to_state_machine.items()
        ]
        return "\n".join(lines)
scaler/ui/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,9 @@
1
# Refresh / update intervals used by the UI. All values are expressed in seconds.

# refresh interval for the task log
TASK_LOG_REFRESH_INTERVAL = 0.5

# refresh interval for the worker processors view
WORKER_PROCESSORS_REFRESH_INTERVAL = 2

# update interval for the task stream
TASK_STREAM_UPDATE_INTERVAL = 0.1

# update interval for the memory usage display
MEMORY_USAGE_UPDATE_INTERVAL = 0.1
@@ -0,0 +1,147 @@
1
+ import dataclasses
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Optional, Set
4
+
5
+ from nicegui import ui
6
+ from nicegui.element import Element
7
+
8
+ from scaler.protocol.python.common import WorkerState
9
+ from scaler.protocol.python.message import StateTask, StateWorker
10
+ from scaler.protocol.python.status import WorkerStatus
11
+ from scaler.ui.common.utility import display_capabilities, format_worker_name
12
+ from scaler.utility.formatter import format_microseconds, format_seconds
13
+
14
+
15
@dataclasses.dataclass
class SchedulerSection:
    """UI section showing the scheduler's CPU / RSS / free RSS as text labels bound to this object's fields."""

    # pre-formatted strings displayed by the bound labels
    cpu: str = ""
    rss: str = ""
    rss_free: str = ""

    # row element created by draw_section(); None while the section is not drawn
    handler: Optional[Element] = None

    def handle_task_state(self, _: StateTask):
        """Task state events are ignored by this section."""
        return

    def handle_worker_state(self, _: StateWorker):
        """Worker state events are ignored by this section."""
        return

    def draw_section(self):
        """Create the card with the scheduler labels and remember the row element for delete_section()."""
        with ui.card().classes("w-full"), ui.row() as handler:
            self.handler = handler
            ui.label("Scheduler")
            # the empty labels act as spacers between the label groups
            ui.label()
            ui.label("CPU:")
            ui.label().bind_text_from(self, "cpu")
            ui.label()
            ui.label("RSS:")
            ui.label().bind_text_from(self, "rss")
            ui.label()
            ui.label("RSS Free:")
            ui.label().bind_text_from(self, "rss_free")

    def delete_section(self):
        """Remove the drawn row; does nothing if the section is not currently drawn."""
        if self.handler is None:
            # draw_section() was never called (or the row was already deleted): nothing to remove
            return

        self.handler.clear()
        self.handler.delete()
        # forget the deleted element so that a repeated call does not operate on it again
        self.handler = None
46
+
47
+
48
@dataclasses.dataclass
class WorkerRow:
    """Display state of one worker; the UI elements created by draw_row() are bound to these fields."""

    worker: str = ""
    agt_cpu: float = 0
    agt_rss: int = 0
    cpu: float = 0
    rss: int = 0
    rss_free: int = 0
    free: int = 0
    sent: int = 0
    queued: int = 0
    suspended: int = 0
    lag: str = ""
    itl: str = ""
    last_seen: str = ""
    capabilities: Set[str] = dataclasses.field(default_factory=set)
    # this field shares its name with the imported display_capabilities() helper; methods still resolve the
    # helper because method bodies look names up in module scope, not in class scope
    display_capabilities: str = ""

    # NOTE(review): nothing in this class adds elements to this list, so delete_row() has nothing to delete
    # unless a caller fills it - confirm whether draw_row() was meant to record its elements here
    handlers: List[Element] = dataclasses.field(default_factory=list)

    def populate(self, data: WorkerStatus):
        """Copy the values of a ``WorkerStatus`` report into the display fields.

        CPU values are divided by 10 and RSS values by 1e6 (presumably tenths of a percent -> percent and
        bytes -> MB, matching the column titles - verify against the status producer).
        """
        self.worker = data.worker_id.decode()

        # agent process resources
        self.agt_cpu = data.agent.cpu / 10
        self.agt_rss = int(data.agent.rss / 1e6)

        # resources summed over all processors of the worker
        self.cpu = sum(processor.resource.cpu for processor in data.processor_statuses) / 10
        self.rss = int(sum(processor.resource.rss for processor in data.processor_statuses) / 1e6)
        self.rss_free = int(data.rss_free / 1e6)

        # task counters
        self.free = data.free
        self.sent = data.sent
        self.queued = data.queued
        self.suspended = data.suspended

        # timing information
        self.lag = format_microseconds(data.lag_us)
        self.itl = data.itl
        self.last_seen = format_seconds(data.last_s)

    def set_capabilities(self, capabilities: Set[str]):
        """Store the capability names and refresh their display string."""
        self.capabilities = capabilities
        self.display_capabilities = display_capabilities(self.capabilities)

    def draw_row(self):
        """Create the 13 elements of this row: the name label, four knobs and eight text labels."""
        # upper bound of the RSS knobs, computed once from the values known at draw time
        total_rss = self.rss + self.rss_free

        ui.label(format_worker_name(self.worker))

        knobs = (("agt_cpu", 100), ("agt_rss", total_rss), ("cpu", 100), ("rss", total_rss))
        for attribute, upper_bound in knobs:
            ui.knob(track_color="grey-2", show_value=True, min=0, max=upper_bound).bind_value_from(self, attribute)

        text_columns = ("free", "sent", "queued", "suspended", "lag", "itl", "last_seen", "display_capabilities")
        for attribute in text_columns:
            ui.label().bind_text_from(self, attribute)

    def delete_row(self):
        """Delete every element recorded in ``handlers``."""
        for element in self.handlers:
            element.delete()
107
+
108
+
109
@dataclasses.dataclass
class WorkersSection:
    """UI section drawing one ``WorkerRow`` per known worker inside a 13-column grid."""

    # worker id -> row; rows are created on first access because the mapping is a defaultdict
    workers: Dict[str, WorkerRow] = dataclasses.field(default_factory=lambda: defaultdict(WorkerRow))

    @ui.refreshable
    def draw_section(self):
        """Draw the title row followed by one row per worker; redrawn through ``draw_section.refresh()``."""
        with ui.row().classes("h-max"), ui.card().classes("w-full"), ui.grid(columns=13):
            self.__draw_titles()
            for row in self.workers.values():
                row.draw_row()

    def handle_task_state(self, _: StateTask):
        """Task state events are ignored by this section."""
        return

    def handle_worker_state(self, state_worker: StateWorker):
        """Track worker connects and disconnects, then redraw the section."""
        worker_id = state_worker.worker_id.decode()
        state = state_worker.state

        if state == WorkerState.Connected:
            # indexing the defaultdict creates the row for a worker seen for the first time
            self.workers[worker_id].set_capabilities(set(state_worker.capabilities.keys()))
        elif state == WorkerState.Disconnected:
            self.workers.pop(worker_id, None)

        self.draw_section.refresh()

    @staticmethod
    def __draw_titles():
        """Create the 13 column title labels, in the same order as the elements of ``WorkerRow.draw_row``."""
        titles = (
            "Worker",
            "Agt CPU %",
            "Agt RSS (in MB)",
            "Processors CPU %",
            "Processors RSS (in MB)",
            "Queue Capacity",
            "Tasks Sent",
            "Tasks Queued",
            "Tasks Suspended",
            "Lag",
            "ITL",
            "Last Seen",
            "Capabilities",
        )
        for title in titles:
            ui.label(title)