opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,376 @@
1
+ import asyncio
2
+ import logging
3
+ from collections import deque
4
+ from typing import Any, Awaitable, Callable, Deque, Dict, List, Optional, Tuple
5
+
6
+ from scaler.io.mixins import AsyncBinder, AsyncConnector
7
+ from scaler.protocol.python.common import TaskCancelConfirmType, TaskResultType, TaskState, TaskTransition
8
+ from scaler.protocol.python.message import StateTask, Task, TaskCancel, TaskCancelConfirm, TaskResult
9
+ from scaler.protocol.python.status import TaskManagerStatus
10
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
11
+ from scaler.scheduler.controllers.mixins import (
12
+ ClientController,
13
+ GraphTaskController,
14
+ ObjectController,
15
+ TaskController,
16
+ WorkerController,
17
+ )
18
+ from scaler.scheduler.task.task_state_machine import TaskStateMachine
19
+ from scaler.scheduler.task.task_state_manager import TaskStateManager
20
+ from scaler.utility.identifiers import ClientID, TaskID, WorkerID
21
+ from scaler.utility.mixins import Looper, Reporter
22
+
23
+
24
class VanillaTaskController(TaskController, Looper, Reporter):
    """Scheduler-side controller that walks every task through its state machine.

    Incoming events (new task, cancel, cancel confirm, result, worker disconnect) are
    mapped to a ``TaskTransition`` and handed to ``__routing``. ``__routing`` advances the
    task's state machine and then awaits the handler registered in ``_state_functions``
    for the state the task has just entered.
    """

    def __init__(self, config_controller: VanillaConfigController):
        """Create the controller; the collaborators are injected later via ``register``."""
        self._config_controller = config_controller
        self._binder: Optional[AsyncBinder] = None
        self._binder_monitor: Optional[AsyncConnector] = None

        self._client_controller: Optional[ClientController] = None
        self._object_controller: Optional[ObjectController] = None
        self._worker_controller: Optional[WorkerController] = None

        self._graph_controller: Optional[GraphTaskController] = None

        self._task_id_to_task: Dict[TaskID, Task] = dict()
        self._task_state_manager: TaskStateManager = TaskStateManager(debug=True)

        # tasks that could not obtain a worker yet, kept in arrival order
        self._unassigned: Deque[TaskID] = deque()  # type: ignore[misc]

        # handler awaited by __routing once a task has entered the given state
        self._state_functions: Dict[TaskState, Callable[[*Tuple[Any, ...]], Awaitable[None]]] = {
            TaskState.Inactive: self.__state_inactive,  # type: ignore[dict-item]
            TaskState.Running: self.__state_running,  # type: ignore[dict-item]
            TaskState.Canceling: self.__state_canceling,  # type: ignore[dict-item]
            TaskState.BalanceCanceling: self.__state_balance_canceling,  # type: ignore[dict-item]
            TaskState.WorkerDisconnecting: self.__state_worker_disconnecting,  # type: ignore[dict-item]
            TaskState.Canceled: self.__state_canceled,  # type: ignore[dict-item]
            TaskState.CanceledNotFound: self.__state_canceled_not_found,  # type: ignore[dict-item]
            TaskState.Success: self.__state_success,  # type: ignore[dict-item]
            TaskState.Failed: self.__state_failed,  # type: ignore[dict-item]
            TaskState.FailedWorkerDied: self.__state_failed_worker_died,  # type: ignore[dict-item]
        }
        # message type -> state machine transition lookups
        self._task_result_transition_map = {
            TaskResultType.Success: TaskTransition.TaskResultSuccess,
            TaskResultType.Failed: TaskTransition.TaskResultFailed,
            TaskResultType.FailedWorkerDied: TaskTransition.TaskResultWorkerDied,
        }
        self._task_cancel_confirm_transition_map = {
            TaskCancelConfirmType.Canceled: TaskTransition.TaskCancelConfirmCanceled,
            TaskCancelConfirmType.CancelNotFound: TaskTransition.TaskCancelConfirmNotFound,
            TaskCancelConfirmType.CancelFailed: TaskTransition.TaskCancelConfirmFailed,
        }

    def register(
        self,
        binder: AsyncBinder,
        binder_monitor: AsyncConnector,
        client_controller: ClientController,
        object_controller: ObjectController,
        worker_controller: WorkerController,
        graph_controller: GraphTaskController,
    ):
        """Store the binder, the monitor connector and the sibling controllers used by the handlers."""
        self._binder = binder
        self._binder_monitor = binder_monitor

        self._client_controller = client_controller
        self._object_controller = object_controller
        self._worker_controller = worker_controller
        self._graph_controller = graph_controller

    async def routine(self):
        """Periodic hook; intentionally a no-op."""
        # TODO: we don't need loop task anymore, but I will leave this routine API here in case we need in the future
        pass

    async def on_task_new(self, task: Task):
        """Register a state machine for ``task`` and run the Inactive handler on it.

        A task whose id already has a state machine is logged and ignored.
        """
        existing = self._task_state_manager.get_state_machine(task.task_id)
        if existing is not None:
            logging.error(f"{task.task_id!r}: state machine already exists: {existing}")
            return

        state_machine = self._task_state_manager.add_state_machine(task.task_id)
        await self.__state_inactive(task_id=task.task_id, state_machine=state_machine, task=task)

    async def on_task_cancel(self, client_id: ClientID, task_cancel: TaskCancel):
        """Handle a client's TaskCancel request.

        * unknown task: reply ``CancelNotFound`` straight to ``client_id`` (and notify the
          graph controller when the id is a graph sub task);
        * task still Inactive: route ``TaskCancel`` with a ready-made ``Canceled`` confirm;
        * otherwise: route ``TaskCancel`` with the client and the original request.
        """
        task_id = task_cancel.task_id
        state_machine = self._task_state_manager.get_state_machine(task_id)
        if state_machine is None:
            logging.error(f"{task_id!r}: task not exists while received TaskCancel, send TaskCancelConfirm")

            task_cancel_confirm = TaskCancelConfirm.new_msg(task_id, TaskCancelConfirmType.CancelNotFound)

            if self._graph_controller.is_graph_subtask(task_id):
                await self._graph_controller.on_graph_sub_task_cancel_confirm(task_cancel_confirm)

            await self._binder.send(client_id, task_cancel_confirm)
            return

        if state_machine.current_state() == TaskState.Inactive:
            await self.__routing(
                task_id,
                TaskTransition.TaskCancel,
                task_cancel_confirm=TaskCancelConfirm.new_msg(
                    task_id=task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled
                ),
            )
            return

        await self.__routing(task_id, TaskTransition.TaskCancel, client=client_id, task_cancel=task_cancel)

    async def on_task_balance_cancel(self, task_id: TaskID):
        """Route a ``BalanceTaskCancel`` transition for ``task_id``."""
        await self.__routing(task_id, TaskTransition.BalanceTaskCancel)

    async def on_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
        """Handle a TaskCancelConfirm message.

        The keyword arguments passed on to the next state handler depend on whether the
        task is currently in ``BalanceCanceling`` and on the confirm type.

        Raises:
            ValueError: the confirm type has no entry in the transition map.
        """
        confirm_type = task_cancel_confirm.cancel_confirm_type
        task_id = task_cancel_confirm.task_id

        transition = self._task_cancel_confirm_transition_map.get(confirm_type, None)
        if transition is None:
            raise ValueError(f"unknown TaskCancelConfirmType: {confirm_type}")

        state_machine = self._task_state_manager.get_state_machine(task_id)
        if state_machine is None:
            logging.error(f"{task_id!r}: task not exists while received TaskCancelTaskCancelConfirm, ignore")
            return

        if state_machine.current_state() == TaskState.BalanceCanceling:
            if confirm_type == TaskCancelConfirmType.Canceled:
                # balance cancel succeeded: the next handler needs the stored task
                task = self._task_id_to_task[task_id]
                await self.__routing(task_id, transition, task=task)
                return

            if confirm_type == TaskCancelConfirmType.CancelFailed:
                # balance cancel failed: the next handler needs the worker holding the task
                worker_id = self._worker_controller.get_worker_by_task_id(task_id)
                await self.__routing(task_id, transition, worker_id=worker_id)
                return

        await self.__routing(task_id, transition, task_cancel_confirm=task_cancel_confirm)

    async def on_task_result(self, task_result: TaskResult):
        """Route the transition matching ``task_result.result_type``.

        Raises:
            ValueError: the result type has no entry in the transition map.
        """
        transition = self._task_result_transition_map.get(task_result.result_type, None)
        if transition is None:
            raise ValueError(f"unknown TaskResultType: {task_result.result_type}")

        await self.__routing(task_result.task_id, transition, task_result=task_result)

    async def on_worker_connect(self, worker_id: WorkerID):
        """Try to place waiting tasks; ``worker_id`` itself is not used here."""
        await self.__retry_unassignable()

    async def on_worker_disconnect(self, task_id: TaskID, worker_id: WorkerID):
        """Route a ``WorkerDisconnect`` transition for a task held by ``worker_id``."""
        await self.__routing(task_id, TaskTransition.WorkerDisconnect, worker_id=worker_id)

    def get_status(self) -> TaskManagerStatus:
        """Return the task state manager's statistics wrapped in a ``TaskManagerStatus``."""
        return TaskManagerStatus.new_msg(state_to_count=self._task_state_manager.get_statistics())

    async def __state_inactive(self, task_id: TaskID, state_machine: TaskStateMachine, task: Task):
        """Record the task and try to acquire a worker; queue it in ``_unassigned`` otherwise."""
        assert task_id == task.task_id
        assert state_machine.current_state() == TaskState.Inactive

        self._client_controller.on_task_begin(task.source, task.task_id)
        self._task_id_to_task[task.task_id] = task

        worker_id = self._worker_controller.acquire_worker(self._task_id_to_task[task_id])
        if not worker_id.is_valid():
            # put task on hold until a worker is added or a task is finished/canceled (means have capacity)
            self._unassigned.append(task_id)
            return

        await self.__routing(task_id, TaskTransition.HasCapacity, worker_id=worker_id)
        await self.__send_monitor(task.task_id, self._object_controller.get_object_name(task.func_object_id))

    async def __state_running(self, task_id: TaskID, state_machine: TaskStateMachine, worker_id: WorkerID):
        """Send the task to ``worker_id`` unless Running was re-entered after a failed cancel."""
        if state_machine.previous_state() in {TaskState.Canceling, TaskState.BalanceCanceling}:
            # if cancel failed (task is ongoing), we should wait here for the result
            return

        assert state_machine.current_state() == TaskState.Running

        task = self._task_id_to_task[task_id]
        await self._binder.send(worker_id, task)
        await self.__send_monitor(task_id, self._object_controller.get_object_name(task.func_object_id))

    async def __state_canceling(
        self, task_id: TaskID, state_machine: TaskStateMachine, client: ClientID, task_cancel: TaskCancel
    ):
        """Forward the cancel to the worker, or confirm at once when the task is still unassigned."""
        assert task_id == task_cancel.task_id
        assert state_machine.current_state() == TaskState.Canceling

        if state_machine.previous_state() == TaskState.BalanceCanceling:
            # we don't need to send another TaskCancel as it's already sent in previous state
            return

        # in case the task being canceled has no task in scheduler, register the client so we know
        # whom to send the confirm to
        self._client_controller.on_task_begin(client, task_id)

        if task_id not in self._unassigned:
            await self.__send_task_cancel_to_worker(task_cancel)
            return

        # task never reached a worker: confirm the cancel ourselves
        await self.__routing(
            task_id,
            TaskTransition.TaskCancelConfirmCanceled,
            task_cancel_confirm=TaskCancelConfirm.new_msg(task_id, TaskCancelConfirmType.Canceled),
        )

        # __state_canceled normally removes the id already; kept as a safety net
        if task_id in self._unassigned:
            self._unassigned.remove(task_id)

    async def __state_balance_canceling(self, task_id: TaskID, state_machine: TaskStateMachine):
        """Ask the worker to cancel the task with ``force=False`` flags."""
        assert state_machine.current_state() == TaskState.BalanceCanceling
        await self.__send_task_cancel_to_worker(
            TaskCancel.new_msg(task_id=task_id, flags=TaskCancel.TaskCancelFlags(force=False))
        )

    async def __state_worker_disconnecting(self, task_id: TaskID, state_machine: TaskStateMachine, worker_id: WorkerID):
        """Route ``SchedulerHasTask`` or ``SchedulerHasNoTask`` depending on whether the task is stored."""
        assert isinstance(worker_id, WorkerID)
        assert state_machine.current_state() == TaskState.WorkerDisconnecting

        # this is where we decide to reroute or just send fail
        task = self._task_id_to_task.get(task_id)
        if task is None:
            await self.__routing(
                task_id,
                TaskTransition.SchedulerHasNoTask,
                task_result=TaskResult.new_msg(task_id, TaskResultType.FailedWorkerDied),
            )
        else:
            await self.__routing(task_id, TaskTransition.SchedulerHasTask, task=task)

    async def __state_canceled(
        self, task_id: TaskID, state_machine: TaskStateMachine, task_cancel_confirm: TaskCancelConfirm
    ):
        """Release the task (queue entry or worker slot) and send the confirm to the client."""
        assert task_id == task_cancel_confirm.task_id
        assert task_cancel_confirm.cancel_confirm_type == TaskCancelConfirmType.Canceled
        assert state_machine.current_state() == TaskState.Canceled

        if task_cancel_confirm.task_id in self._unassigned:
            # if task is not assigned to any worker, we don't need to deal with worker manager
            self._unassigned.remove(task_cancel_confirm.task_id)
        else:
            await self._worker_controller.on_task_done(task_cancel_confirm.task_id)

        await self.__send_task_cancel_confirm_to_client(task_cancel_confirm)

    async def __state_canceled_not_found(
        self, task_id: TaskID, state_machine: TaskStateMachine, task_cancel_confirm: TaskCancelConfirm
    ):
        """Send the ``CancelNotFound`` confirm to the client."""
        assert task_id == task_cancel_confirm.task_id
        assert task_cancel_confirm.cancel_confirm_type == TaskCancelConfirmType.CancelNotFound
        assert state_machine.current_state() == TaskState.CanceledNotFound

        await self.__send_task_cancel_confirm_to_client(task_cancel_confirm)

    async def __state_success(self, task_id: TaskID, state_machine: TaskStateMachine, task_result: TaskResult):
        """Deliver a successful result to the client."""
        assert task_id == task_result.task_id
        assert state_machine.current_state() == TaskState.Success
        await self.__send_task_result_to_client(task_result)

    async def __state_failed(self, task_id: TaskID, state_machine: TaskStateMachine, task_result: TaskResult):
        """Deliver a failed result to the client."""
        assert task_id == task_result.task_id
        assert state_machine.current_state() == TaskState.Failed
        await self.__send_task_result_to_client(task_result)

    async def __state_failed_worker_died(
        self, task_id: TaskID, state_machine: TaskStateMachine, task_result: TaskResult
    ):
        """Deliver a worker-died failure to the client."""
        assert task_id == task_result.task_id
        assert state_machine.current_state() == TaskState.FailedWorkerDied
        await self.__send_task_result_to_client(task_result)

    async def __send_task_cancel_to_worker(self, task_cancel: TaskCancel):
        """Send a TaskCancel to the worker reported by the worker controller.

        When the worker controller reports no worker (falsy return), the task is routed
        through ``TaskCancelConfirmNotFound`` instead.
        """
        worker = await self._worker_controller.on_task_cancel(task_cancel)
        if not worker:
            logging.error(f"{task_cancel.task_id!r}: cannot find task in worker to cancel")
            await self.__routing(
                task_cancel.task_id,
                TaskTransition.TaskCancelConfirmNotFound,
                task_cancel_confirm=TaskCancelConfirm.new_msg(
                    task_cancel.task_id, TaskCancelConfirmType.CancelNotFound
                ),
            )
            return

        await self._binder.send(worker, TaskCancel.new_msg(task_cancel.task_id))
        await self.__send_monitor(task_cancel.task_id, b"")

    async def __send_task_result_to_client(self, task_result: TaskResult):
        """Finish a task: release the worker slot, reply to the client, drop all bookkeeping."""
        task_id = task_result.task_id

        await self._worker_controller.on_task_done(task_id)
        client = self._client_controller.on_task_finish(task_id)
        await self._binder.send(client, task_result)
        await self.__send_monitor(task_id, b"", task_result.metadata)

        self._task_state_manager.remove_state_machine(task_id)
        # the SchedulerHasNoTask path reaches here without a stored task, so tolerate a missing key
        self._task_id_to_task.pop(task_id, None)

        if self._graph_controller.is_graph_subtask(task_id):
            await self._graph_controller.on_graph_sub_task_result(task_result)

        # a slot may have been freed, give waiting tasks a chance
        await self.__retry_unassignable()

    async def __send_task_cancel_confirm_to_client(self, task_cancel_confirm: TaskCancelConfirm):
        """Finish a canceled task: reply to the client and drop all bookkeeping."""
        task_id = task_cancel_confirm.task_id

        client = self._client_controller.on_task_finish(task_id)
        await self._binder.send(client, task_cancel_confirm)
        await self.__send_monitor(task_id, b"")
        self._task_state_manager.remove_state_machine(task_id)
        # a cancel may concern a task the scheduler never stored, so tolerate a missing key
        self._task_id_to_task.pop(task_id, None)

        if self._graph_controller.is_graph_subtask(task_id):
            await self._graph_controller.on_graph_sub_task_cancel_confirm(task_cancel_confirm)

        # a slot may have been freed, give waiting tasks a chance
        await self.__retry_unassignable()

    async def __send_monitor(self, task_id: TaskID, function_name: bytes, metadata: bytes = b""):
        """Publish the task's current state, worker and capabilities on the monitor connector."""
        state_machine = self._task_state_manager.get_state_machine(task_id)
        if state_machine is None:
            # get_state_machine may return None (see on_task_new); without a state machine
            # there is no state to report
            logging.debug(f"{task_id!r}: no state machine, skip monitor update")
            return

        worker = self._worker_controller.get_worker_by_task_id(task_id)
        task = self._task_id_to_task.get(task_id)
        capabilities = task.capabilities if task is not None else {}
        await self._binder_monitor.send(
            StateTask.new_msg(task_id, function_name, state_machine.current_state(), worker, capabilities, metadata)
        )

    async def __routing(self, task_id: TaskID, transition: TaskTransition, **kwargs):
        """Apply ``transition`` to the task's state machine and await the new state's handler.

        ``kwargs`` are forwarded to the handler unchanged. A transition the state manager
        rejects (``on_transition`` returns ``None``) is logged and ignored. Handler
        exceptions are logged together with the state path and re-raised.
        """
        state_machine = self._task_state_manager.on_transition(task_id, transition)
        if state_machine is None:
            logging.info(f"{task_id!r}: unknown transition: {transition}")
            return

        try:
            await self._state_functions[state_machine.current_state()](task_id, state_machine, **kwargs)  # noqa
        except Exception:
            logging.exception(
                f"{task_id!r}: exception happened, transition: {transition} path: {state_machine.get_path()}"
            )
            raise

    async def __retry_unassignable(self):
        """Route ``HasCapacity`` for every waiting task that can now obtain a worker."""
        futures = [
            self.__routing(task_id, TaskTransition.HasCapacity, worker_id=worker_id)
            for task_id, worker_id in self.__acquire_workers()
        ]

        await asyncio.gather(*futures)

    def __acquire_workers(self) -> List[Tuple[TaskID, WorkerID]]:
        """Pop tasks from the head of ``_unassigned`` for as long as a worker can be acquired.

        This function must stay synchronous (no ``await``) so the unassigned queue cannot be
        touched by another coroutine half-way through; for the same reason it returns a
        fully built list rather than a generator.
        """

        ready_to_assign = list()
        while len(self._unassigned) > 0:
            worker_id = self._worker_controller.acquire_worker(self._task_id_to_task[self._unassigned[0]])
            if not worker_id.is_valid():
                # head of the queue cannot be placed, stop to preserve FIFO order
                break

            task_id = self._unassigned.popleft()
            ready_to_assign.append((task_id, worker_id))

        return ready_to_assign
@@ -0,0 +1,169 @@
1
+ import logging
2
+ import time
3
+ from typing import Dict, Optional, Set, Tuple
4
+
5
+ from scaler.io.mixins import AsyncBinder, AsyncConnector
6
+ from scaler.protocol.python.common import WorkerState
7
+ from scaler.protocol.python.message import (
8
+ ClientDisconnect,
9
+ DisconnectRequest,
10
+ DisconnectResponse,
11
+ StateWorker,
12
+ Task,
13
+ TaskCancel,
14
+ WorkerHeartbeat,
15
+ WorkerHeartbeatEcho,
16
+ )
17
+ from scaler.protocol.python.status import ProcessorStatus, Resource, WorkerManagerStatus, WorkerStatus
18
+ from scaler.scheduler.allocate_policy.mixins import TaskAllocatePolicy
19
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
20
+ from scaler.scheduler.controllers.mixins import TaskController, WorkerController
21
+ from scaler.utility.identifiers import ClientID, TaskID, WorkerID
22
+ from scaler.utility.mixins import Looper, Reporter
23
+
24
+ UINT8_MAX = 2**8 - 1
25
+
26
+
27
class VanillaWorkerController(WorkerController, Looper, Reporter):
    """Track connected workers through their heartbeats and delegate task placement to the allocate policy."""

    def __init__(self, config_controller: VanillaConfigController, task_allocate_policy: TaskAllocatePolicy):
        """Create the controller; binder, monitor and task controller are injected via ``register``."""
        self._config_controller = config_controller

        self._binder: Optional[AsyncBinder] = None
        self._binder_monitor: Optional[AsyncConnector] = None
        self._task_controller: Optional[TaskController] = None

        # worker -> (time.time() of the last heartbeat, that heartbeat)
        self._worker_alive_since: Dict[WorkerID, Tuple[float, WorkerHeartbeat]] = dict()
        self._allocator_policy = task_allocate_policy

    def register(self, binder: AsyncBinder, binder_monitor: AsyncConnector, task_controller: TaskController):
        """Store the binder, the monitor connector and the task controller."""
        self._binder = binder
        self._binder_monitor = binder_monitor
        self._task_controller = task_controller

    def acquire_worker(self, task: Task) -> WorkerID:
        """Return the worker the allocate policy assigns to ``task``."""
        return self._allocator_policy.assign_task(task)

    async def on_task_cancel(self, task_cancel: TaskCancel):
        """Remove the task from the allocate policy and forward the cancel to its worker.

        NOTE(review): every path returns ``None``, while the task controller in this package
        tests the awaited result for truthiness to detect "worker found" -- confirm the
        intended return contract against the ``WorkerController`` mixin before changing it.
        """
        worker = self._allocator_policy.remove_task(task_cancel.task_id)
        if not worker.is_valid():
            logging.error(f"cannot find task_id={task_cancel.task_id.hex()} in task workers")
            return

        await self._binder.send(worker, task_cancel)

    async def on_task_done(self, task_id: TaskID) -> WorkerID:
        """Remove the task from the allocate policy and return the worker that held it.

        An invalid worker id is logged and returned as-is.
        """
        worker = self._allocator_policy.remove_task(task_id)
        if not worker.is_valid():
            logging.error(f"Cannot find task in worker queue: task_id={task_id.hex()}")

        return worker

    async def on_heartbeat(self, worker_id: WorkerID, info: WorkerHeartbeat):
        """Record a heartbeat, announce a newly added worker, and echo back to the worker."""
        # a truthy result from add_worker is treated as "this worker just connected"
        if self._allocator_policy.add_worker(worker_id, info.capabilities, info.queue_size):
            logging.info(f"worker {worker_id!r} connected")
            await self._binder_monitor.send(StateWorker.new_msg(worker_id, WorkerState.Connected, info.capabilities))
            await self._task_controller.on_worker_connect(worker_id)

        self._worker_alive_since[worker_id] = (time.time(), info)
        await self._binder.send(
            worker_id,
            WorkerHeartbeatEcho.new_msg(
                object_storage_address=self._config_controller.get_config("object_storage_address")
            ),
        )

    async def on_client_shutdown(self, client_id: ClientID):
        """Send a shutdown to every known worker and disconnect it; ``client_id`` is not used."""
        # iterate a snapshot: __shutdown_worker removes workers from the policy while we loop
        for worker in list(self._allocator_policy.get_worker_ids()):
            await self.__shutdown_worker(worker)

    async def on_disconnect(self, worker_id: WorkerID, request: DisconnectRequest):
        """Disconnect ``request.worker`` and acknowledge to the requester ``worker_id``."""
        await self.__disconnect_worker(request.worker)
        await self._binder.send(worker_id, DisconnectResponse.new_msg(request.worker))

    async def routine(self):
        """Periodic hook: disconnect workers whose heartbeat timed out."""
        await self.__clean_workers()

    def get_status(self) -> WorkerManagerStatus:
        """Build a ``WorkerManagerStatus`` with one entry per worker that has sent a heartbeat."""
        worker_to_task_numbers = self._allocator_policy.statistics()
        return WorkerManagerStatus.new_msg(
            [
                self.__worker_status_from_heartbeat(worker, worker_to_task_numbers[worker], last, info)
                for worker, (last, info) in self._worker_alive_since.items()
            ]
        )

    @staticmethod
    def __worker_status_from_heartbeat(
        worker_id: WorkerID, worker_task_numbers: Dict, last: float, info: WorkerHeartbeat
    ) -> WorkerStatus:
        """Convert a worker's last heartbeat plus its policy statistics into a ``WorkerStatus``.

        ``suspended`` and ``last_s`` are capped at ``UINT8_MAX``. ``itl`` is a three digit
        string built from the first non-suspended processor's ``initialized`` and
        ``has_task`` flags followed by the heartbeat's ``task_lock`` flag.
        """
        current_processor = next((p for p in info.processors if not p.suspended), None)
        suspended = min(sum(1 for p in info.processors if p.suspended), UINT8_MAX)
        last_s = min(int(time.time() - last), UINT8_MAX)

        if current_processor:
            debug_info = f"{int(current_processor.initialized)}{int(current_processor.has_task)}{int(info.task_lock)}"
        else:
            debug_info = f"00{int(info.task_lock)}"

        return WorkerStatus.new_msg(
            worker_id=worker_id,
            agent=info.agent,
            rss_free=info.rss_free,
            free=worker_task_numbers["free"],
            sent=worker_task_numbers["sent"],
            queued=info.queued_tasks,
            suspended=suspended,
            lag_us=info.latency_us,
            last_s=last_s,
            itl=debug_info,
            processor_statuses=[
                ProcessorStatus.new_msg(
                    pid=p.pid,
                    initialized=p.initialized,
                    has_task=p.has_task,
                    suspended=p.suspended,
                    resource=Resource.new_msg(p.resource.cpu, p.resource.rss),
                )
                for p in info.processors
            ],
        )

    def has_available_worker(self) -> bool:
        """Delegate to the allocate policy's ``has_available_worker``."""
        return self._allocator_policy.has_available_worker()

    def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
        """Delegate to the allocate policy's ``get_worker_by_task_id``."""
        return self._allocator_policy.get_worker_by_task_id(task_id)

    def get_worker_ids(self) -> Set[WorkerID]:
        """Delegate to the allocate policy's ``get_worker_ids``."""
        return self._allocator_policy.get_worker_ids()

    async def __clean_workers(self):
        """Disconnect every worker whose last heartbeat is older than ``worker_timeout_seconds``."""
        now = time.time()
        timeout_seconds = self._config_controller.get_config("worker_timeout_seconds")
        # collect first: __disconnect_worker mutates _worker_alive_since
        dead_workers = [
            worker_id
            for worker_id, (alive_since, _) in self._worker_alive_since.items()
            if now - alive_since > timeout_seconds
        ]
        for dead_worker in dead_workers:
            await self.__disconnect_worker(dead_worker)

    async def __disconnect_worker(self, worker_id: WorkerID):
        """Forget ``worker_id`` and report every task it still held to the task controller.

        Does nothing when the worker is not tracked. Returns ``None`` in all cases.
        """
        if worker_id not in self._worker_alive_since:
            return

        logging.info(f"{worker_id!r} disconnected")
        await self._binder_monitor.send(StateWorker.new_msg(worker_id, WorkerState.Disconnected, {}))
        self._worker_alive_since.pop(worker_id)

        task_ids = self._allocator_policy.remove_worker(worker_id)
        if not task_ids:
            return

        logging.info(f"{len(task_ids)} task(s) failed due to worker {worker_id!r} disconnected")
        for task_id in task_ids:
            await self._task_controller.on_worker_disconnect(task_id, worker_id)

    async def __shutdown_worker(self, worker_id: WorkerID):
        """Send a ``Shutdown`` ClientDisconnect to the worker, then disconnect it locally."""
        await self._binder.send(worker_id, ClientDisconnect.new_msg(ClientDisconnect.DisconnectType.Shutdown))
        await self.__disconnect_worker(worker_id)
File without changes
@@ -0,0 +1,131 @@
1
+ import abc
2
+ from typing import Callable, Dict, Generator, Generic, Optional, Set, Tuple, TypeVar
3
+
4
+ from scaler.utility.many_to_many_dict import ManyToManyDict
5
+
6
# BlockType: the "block" side of ObjectTracker's relation. Blocks are kept in a
# set and used as the right-hand key of the object-key/block many-to-many mapping.
+ BlockType = TypeVar("BlockType")
7
# ObjectKeyType: the key type returned by ObjectUsage.get_object_key() and used to
# index tracked objects inside ObjectTracker.
+ ObjectKeyType = TypeVar("ObjectKeyType")
8
+
9
+
10
class ObjectUsage(Generic[ObjectKeyType], metaclass=abc.ABCMeta):
    """Abstract interface for anything that can report the key it is stored under."""

    @abc.abstractmethod
    def get_object_key(self) -> ObjectKeyType:
        """Return the key that identifies this object; concrete subclasses must override."""
        raise NotImplementedError
14
+
15
+
16
# ObjectType: the tracked objects themselves; bound to ObjectUsage so that every
# tracked object provides get_object_key().
+ ObjectType = TypeVar("ObjectType", bound=ObjectUsage)
17
+
18
+
19
class ObjectTracker(Generic[BlockType, ObjectKeyType, ObjectType]):
    """Hold objects for as long as at least one "block" is attached to them.

    Objects are registered with ``add_object`` under the key returned by their
    ``get_object_key()``. Blocks are then attached to object keys with the
    ``add_*`` methods and detached with the ``remove_*`` methods. Whenever a
    detach leaves an object with no block at all, the object is dropped from the
    tracker and passed to ``callback``.

    An object that never had a block attached is never released by this class:
    release only happens as the result of detaching a block.
    """

    def __init__(self, prefix: str, callback: Callable[[ObjectType], None]):
        """Create an empty tracker.

        ``prefix`` is stored as ``self._prefix`` and is not read by any method of
        this class. ``callback`` is called with each object once its last block
        has been detached.
        """
        self._prefix = prefix
        self._callback = callback

        # Blocks that currently have at least one object attached.
        self._current_blocks: Set[BlockType] = set()
        # Relation between object keys (left side) and blocks (right side).
        self._object_key_to_block: ManyToManyDict[ObjectKeyType, BlockType] = ManyToManyDict()
        # The tracked objects themselves, indexed by their key.
        self._object_key_to_object: Dict[ObjectKeyType, ObjectType] = dict()

    def object_count(self) -> int:
        """Return the number of objects currently tracked."""
        return len(self._object_key_to_object)

    def items(self):
        """Return the ``(object_key, object)`` items view of the tracked objects."""
        return self._object_key_to_object.items()

    def get_all_object_keys(self) -> Set[ObjectKeyType]:
        """Return a new set containing the keys of all tracked objects."""
        return set(self._object_key_to_object.keys())

    def has_object(self, key: ObjectKeyType) -> bool:
        """Return True if an object is tracked under ``key``."""
        return key in self._object_key_to_object

    def get_object(self, key: ObjectKeyType) -> ObjectType:
        """Return the object tracked under ``key``; raises ``KeyError`` if there is none."""
        return self._object_key_to_object[key]

    def add_object(self, obj: ObjectType) -> None:
        """Track ``obj`` under ``obj.get_object_key()``.

        An object already stored under the same key is replaced. No block is
        attached by this call.
        """
        self._object_key_to_object[obj.get_object_key()] = obj

    def get_object_block_pairs(self, blocks: Set[BlockType]) -> Generator[Tuple[ObjectKeyType, BlockType], None, None]:
        """Yield ``(object_key, block)`` for every object attached to each block in ``blocks``.

        Blocks that have no attached object are skipped.
        """
        for block in blocks:
            if not self._object_key_to_block.has_right_key(block):
                continue

            for object_key in self._object_key_to_block.get_left_items(block):
                yield object_key, block

    def add_blocks_for_one_object(self, object_key: ObjectKeyType, blocks: Set[BlockType]) -> None:
        """Attach every block in ``blocks`` to the object tracked under ``object_key``.

        Raises ``KeyError`` (before changing anything) if no object is tracked
        under ``object_key``.
        """
        if object_key not in self._object_key_to_object:
            raise KeyError(f"cannot find key={object_key} in ObjectTracker")

        for block in blocks:
            self._object_key_to_block.add(object_key, block)

        self._current_blocks.update(blocks)

    def remove_blocks_for_one_object(self, object_key: ObjectKeyType, blocks: Set[BlockType]) -> None:
        """Detach each block in ``blocks`` from ``object_key``.

        Pairs that are not currently attached are ignored. If the object ends up
        with no block, it is dropped from the tracker and passed to the callback
        after all blocks have been processed.
        """
        ready_objects = []
        for block in blocks:
            obj = self.__remove_block_for_object(object_key, block)
            if obj is None:
                continue

            ready_objects.append(obj)

        for obj in ready_objects:
            self._callback(obj)

    def add_one_block_for_objects(self, object_keys: Set[ObjectKeyType], block: BlockType) -> None:
        """Attach ``block`` to every object whose key is in ``object_keys``.

        Raises ``KeyError`` if any key is not tracked. All keys are validated
        before any state is changed, so a failing call leaves the tracker exactly
        as it was. With an empty ``object_keys`` the call does nothing.
        """
        # Validate first: attaching some keys and then raising would leave pairs
        # in the relation for a block that is not recorded in _current_blocks,
        # and __remove_block_for_object refuses to detach such pairs.
        for object_key in object_keys:
            if object_key not in self._object_key_to_object:
                raise KeyError(f"cannot find key={object_key} in ObjectTracker")

        # Nothing to attach: do not record a block that has no object, since
        # nothing would ever remove it from _current_blocks again.
        if not object_keys:
            return

        for object_key in object_keys:
            self._object_key_to_block.add(object_key, block)

        self._current_blocks.add(block)

    def remove_one_block_for_objects(self, object_keys: Set[ObjectKeyType], block: BlockType) -> None:
        """Detach ``block`` from every object whose key is in ``object_keys``.

        Pairs that are not currently attached are ignored. Objects left without
        any block are dropped from the tracker and passed to the callback after
        all keys have been processed.
        """
        ready_objects = []
        for object_key in object_keys:
            obj = self.__remove_block_for_object(object_key, block)
            if obj is None:
                continue

            ready_objects.append(obj)

        for obj in ready_objects:
            self._callback(obj)

    def remove_blocks(self, blocks: Set[BlockType]) -> None:
        """Detach each block in ``blocks`` from every object it is attached to.

        Blocks with no attached object are skipped. Objects left without any
        block are dropped from the tracker and passed to the callback after all
        blocks have been processed.
        """
        ready_objects = []
        for block in blocks:
            if not self._object_key_to_block.has_right_key(block):
                continue

            # Copy: detaching below mutates the relation being iterated.
            object_keys = self._object_key_to_block.get_left_items(block).copy()
            for object_key in object_keys:
                obj = self.__remove_block_for_object(object_key, block)
                if obj is None:
                    continue

                ready_objects.append(obj)

        for obj in ready_objects:
            self._callback(obj)

    def __remove_block_for_object(self, object_key: ObjectKeyType, block: BlockType) -> Optional[ObjectType]:
        """Detach ``block`` from ``object_key`` and return the object if it became free.

        Returns ``None`` when the block is not recorded, the object is not
        tracked, the pair is not attached, or the object still has other blocks.
        Otherwise the object is removed from the tracker and returned; invoking
        the callback is left to the caller.
        """
        if block not in self._current_blocks:
            return None

        if object_key not in self._object_key_to_object:
            return None

        if not self._object_key_to_block.has_key_pair(object_key, block):
            return None

        self._object_key_to_block.remove(object_key, block)

        # The block no longer has any object attached: stop recording it.
        if not self._object_key_to_block.has_right_key(block):
            self._current_blocks.remove(block)

        # Other blocks are still attached to this object: keep it.
        if self._object_key_to_block.has_left_key(object_key):
            return None

        return self._object_key_to_object.pop(object_key)