opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,31 @@
1
+ import logging
2
+ from typing import Any, Dict
3
+
4
+ from scaler.config.section.scheduler import SchedulerConfig
5
+ from scaler.scheduler.controllers.mixins import ConfigController
6
+
7
+
8
class VanillaConfigController(ConfigController):
    """
    Flat, in-memory key/value store seeded from a ``SchedulerConfig`` instance.

    Every entry of the config object's ``__dict__`` is copied into an internal dictionary at construction time;
    each stored or replaced value is reported through ``logging.info``.
    """

    def __init__(self, config: SchedulerConfig):
        self._config: Dict[str, Any] = {}

        # go through update_config() so that every initial value gets logged as well
        for name, initial_value in vars(config).items():
            self.update_config(name, initial_value)

    def get_config(self, path: str) -> Any:
        """
        Return the value stored under ``path``.

        :raises KeyError: when ``path`` has never been stored
        """
        if path in self._config:
            return self._config[path]

        raise KeyError(f"No such config: `{path}`")

    def update_config(self, path: str, value: Any):
        """Store ``value`` under ``path`` and log either the new entry or the old/new pair when replacing one."""
        # TODO: please add update config message and let config able to handle update config on the fly

        if path in self._config:
            previous_value = self._config[path]
            self._config[path] = value
            logging.info(f"ConfigController: updated `{path}` from `{previous_value}` to `{value}`")
        else:
            self._config[path] = value
            logging.info(f"ConfigController: {path} = {value}")
@@ -0,0 +1,424 @@
1
+ import asyncio
2
+ import dataclasses
3
+ import enum
4
+ from asyncio import Queue
5
+ from typing import Dict, List, Optional, Set, Tuple, Union
6
+
7
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
8
+ from scaler.protocol.python.common import ObjectMetadata, TaskCancelConfirmType, TaskResultType
9
+ from scaler.protocol.python.message import GraphTask, StateGraphTask, Task, TaskCancel, TaskCancelConfirm, TaskResult
10
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
11
+ from scaler.scheduler.controllers.mixins import ClientController, GraphTaskController, ObjectController, TaskController
12
+ from scaler.utility.graph.topological_sorter import TopologicalSorter
13
+ from scaler.utility.identifiers import ClientID, ObjectID, TaskID
14
+ from scaler.utility.many_to_many_dict import ManyToManyDict
15
+ from scaler.utility.mixins import Looper, Reporter
16
+
17
+
18
class _NodeTaskState(enum.Enum):
    """State of one node (sub-task) of a graph task, as tracked by the graph task controller."""

    Inactive = 1  # initial state of every node when the graph is registered
    Running = 2  # node has been handed over to the task controller
    Canceled = 3  # node was canceled (or the cancel target was not found)
    Failed = 4  # node finished with a failure result
    Success = 5  # node finished successfully
24
+
25
+
26
class _GraphState(enum.Enum):
    """Overall state of a graph task."""

    Running = 1  # default state of a newly registered graph
    Canceling = 2  # set once the whole graph is being canceled
    Aborting = 3  # set once a failed node makes the whole graph abort
30
+
31
+
32
@dataclasses.dataclass
class _TaskInfo:
    """Bookkeeping record for a single node of a graph task."""

    # current lifecycle state of the node
    state: _NodeTaskState

    # the task message as it was submitted inside the graph; its arguments may still reference other task ids
    task: Task

    # object ids produced by the node, filled in when its result is recorded
    result_object_ids: List[ObjectID] = dataclasses.field(default_factory=list)
37
+
38
+
39
@dataclasses.dataclass
class _Graph:
    """Everything the graph task controller tracks for one submitted graph task."""

    # task ids the client asked for; their results are never cleaned up as intermediate results
    target_task_ids: List[TaskID]

    # topological sorter driving which nodes become ready
    sorter: TopologicalSorter

    # per node bookkeeping, keyed by the node's task id
    tasks: Dict[TaskID, _TaskInfo]

    # maps a depended-on task id to the task ids that consume its result
    depended_task_id_to_task_id: ManyToManyDict[TaskID, TaskID]

    # client that submitted the graph
    client: ClientID

    status: _GraphState = _GraphState.Running

    # nodes that were dispatched to the task controller and have not finished yet
    running_task_ids: Set[TaskID] = dataclasses.field(default_factory=set)
48
+
49
+
50
class VanillaGraphTaskController(GraphTaskController, Looper, Reporter):
    """
    Graph task controller, sitting on top of the normal task controller.

    When a graph task is received, the controller keeps the graph under an "umbrella" task id, dispatches each node
    to the task controller once the nodes it depends on are done, and finally sends one ``TaskResult`` or
    ``TaskCancelConfirm`` for the umbrella task id back to the client.

    Example, for the client code

        A = func()
        B = func2(A)
        C = func3(A)
        D = func4(B, C)

    the graph is

        A = Task(func)
        B = Task(func2, A)
        C = Task(func3, A)
        D = Task(func4, B, C)

    and the tracked dependencies (depended-on task -> tasks consuming it) are

        {
            "A": {B, C},
            "B": {D},
            "C": {D},
            "D": {},
        }
    """

    def __init__(self, config_controller: VanillaConfigController):
        self._config_controller = config_controller

        # collaborators, injected later through register()
        self._binder: Optional[AsyncBinder] = None
        self._binder_monitor: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None

        self._client_controller: Optional[ClientController] = None
        self._task_controller: Optional[TaskController] = None
        self._object_controller: Optional[ObjectController] = None

        # (client id, graph task) pairs waiting to be turned into a tracked graph by routine()
        self._unassigned: Queue = Queue()

        self._graph_task_id_to_graph: Dict[TaskID, _Graph] = dict()

        # maps every sub-task id AND the umbrella task id itself to the umbrella task id
        self._task_id_to_graph_task_id: Dict[TaskID, TaskID] = dict()

    def register(
        self,
        binder: AsyncBinder,
        binder_monitor: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        client_controller: ClientController,
        task_controller: TaskController,
        object_controller: ObjectController,
    ):
        """Inject the connectors and sibling controllers this controller talks to."""
        self._binder = binder
        self._binder_monitor = binder_monitor
        self._connector_storage = connector_storage
        self._client_controller = client_controller
        self._task_controller = task_controller
        self._object_controller = object_controller

    async def on_graph_task(self, client_id: ClientID, graph_task: GraphTask):
        """Queue a newly received graph task; it gets registered on the next ``routine()`` run."""
        await self._unassigned.put((client_id, graph_task))

    async def on_graph_task_cancel(self, task_cancel: TaskCancel):
        """
        Cancel the whole graph that ``task_cancel.task_id`` belongs to.

        :raises KeyError: when the task id is not tracked as part of a graph (see ``is_graph_subtask``)
        """
        graph_task_id = self._task_id_to_graph_task_id[task_cancel.task_id]
        graph_info = self._graph_task_id_to_graph[graph_task_id]

        if graph_info.status in {_GraphState.Canceling, _GraphState.Aborting}:
            # if graph is already in canceling or aborting, we don't need to proceed whole graph canceling again
            return

        # received any subtask canceling will lead the whole graph canceling
        await self.__cancel_whole_graph(graph_task_id)

    async def on_graph_sub_task_result(self, result: TaskResult):
        """
        Record the result of one node: a success may release further nodes, any other result aborts the graph.

        :raises KeyError: when the task id is not tracked as part of a graph (see ``is_graph_subtask``)
        """
        graph_task_id = self._task_id_to_graph_task_id[result.task_id]
        graph_info = self._graph_task_id_to_graph[graph_task_id]

        if graph_info.status == _GraphState.Canceling:
            # there will be case when we are canceling the whole graph, and at the moment, result is returning
            # before we see cancel confirm, we treat them as cancel confirm
            await self.on_graph_sub_task_cancel_confirm(
                TaskCancelConfirm.new_msg(result.task_id, TaskCancelConfirmType.Canceled)
            )
            return

        self.__mark_node_done(result)

        if result.result_type == TaskResultType.Success:
            await self.__check_one_graph(graph_task_id)
            return

        # any non-success result fails the whole graph
        await self.__abort_whole_graph(graph_task_id, result)

    async def on_graph_sub_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
        """Record the cancel confirmation of one node, then keep driving the cancellation of the whole graph."""
        graph_task_id = self._task_id_to_graph_task_id[task_cancel_confirm.task_id]
        graph_info = self._graph_task_id_to_graph[graph_task_id]
        self.__mark_node_canceled(graph_info, task_cancel_confirm)
        await self.__cancel_whole_graph(graph_task_id)

    def is_graph_subtask(self, task_id: TaskID):
        """Return whether ``task_id`` is tracked as part of a graph (the umbrella task id counts as well)."""
        return task_id in self._task_id_to_graph_task_id

    async def routine(self):
        """Wait for the next queued graph task and register it."""
        client, graph_task = await self._unassigned.get()
        await self.__add_new_graph(client, graph_task)

    def get_status(self) -> Dict:
        """Report the number of graph tasks still waiting to be registered."""
        return {"graph_manager": {"unassigned": self._unassigned.qsize()}}

    async def __add_new_graph(self, client_id: ClientID, graph_task: GraphTask):
        """Build the bookkeeping for a new graph, publish every node to the monitor and dispatch the ready nodes."""
        graph_task_id = graph_task.task_id

        # read the targets once: the list is stored on the graph, the set gives a cheap per node membership test
        target_task_ids = graph_task.targets
        target_lookup = set(target_task_ids)

        self._client_controller.on_task_begin(client_id, graph_task_id)

        # add graph umbrella task, note that umbrella is also a graph subtask
        self._task_id_to_graph_task_id[graph_task_id] = graph_task_id

        graph = {}
        tasks = dict()
        depended_task_id_to_task_id: ManyToManyDict[TaskID, TaskID] = ManyToManyDict()
        for task in graph_task.graph:
            self._task_id_to_graph_task_id[task.task_id] = graph_task_id
            tasks[task.task_id] = _TaskInfo(_NodeTaskState.Inactive, task)

            # arguments that are task ids (rather than object ids) are dependencies on other nodes
            required_task_ids = {arg for arg in task.function_args if isinstance(arg, TaskID)}
            for required_task_id in required_task_ids:
                depended_task_id_to_task_id.add(required_task_id, task.task_id)

            graph[task.task_id] = required_task_ids

            if task.task_id in target_lookup:
                node_type = StateGraphTask.NodeTaskType.Target
            else:
                node_type = StateGraphTask.NodeTaskType.Normal

            await self._binder_monitor.send(
                StateGraphTask.new_msg(graph_task_id, task.task_id, node_type, required_task_ids)
            )

        sorter = TopologicalSorter(graph)
        sorter.prepare()

        self._graph_task_id_to_graph[graph_task_id] = _Graph(
            target_task_ids, sorter, tasks, depended_task_id_to_task_id, client_id
        )
        await self.__check_one_graph(graph_task_id)

    async def __check_one_graph(self, graph_task_id: TaskID):
        """Finish the umbrella task when the sorter is exhausted, otherwise dispatch every node that became ready."""
        graph_info = self._graph_task_id_to_graph[graph_task_id]
        if not graph_info.sorter.is_active():
            await self.__done_graph_umbrella_task(graph_task_id, TaskResultType.Success)
            return

        ready_task_ids = graph_info.sorter.get_ready()
        if not ready_task_ids:
            return

        for task_id in ready_task_ids:
            task_info = graph_info.tasks[task_id]
            task_info.state = _NodeTaskState.Running
            graph_info.running_task_ids.add(task_id)

            # rebuild the task with every task id argument replaced by the object id that task produced
            submitted = task_info.task
            task = Task.new_msg(
                task_id=submitted.task_id,
                source=submitted.source,
                metadata=submitted.metadata,
                func_object_id=submitted.func_object_id,
                function_args=[self.__get_argument_object(graph_task_id, arg) for arg in submitted.function_args],
                capabilities=submitted.capabilities,
            )

            await self._task_controller.on_task_new(task)

    async def __cancel_whole_graph(self, graph_task_id: TaskID):
        """
        Cancel every running node through the task controller and confirm the cancellation of every node that never
        started; once nothing is left, confirm the cancellation of the umbrella task.
        """
        if self.__is_graph_finished(graph_task_id):
            return

        graph_info = self._graph_task_id_to_graph[graph_task_id]
        graph_info.status = _GraphState.Canceling

        # the list is fully built before anything is awaited, so later changes to running_task_ids are harmless
        await asyncio.gather(
            *[
                self._task_controller.on_task_cancel(
                    graph_info.client, TaskCancel.new_msg(task_id, flags=TaskCancel.TaskCancelFlags(force=True))
                )
                for task_id in graph_info.running_task_ids
            ]
        )

        # cancel all inactive tasks
        task_cancel_confirms: List[TaskCancelConfirm] = list()
        while graph_info.sorter.is_active():
            ready_task_ids = graph_info.sorter.get_ready()
            if not ready_task_ids:
                # the remaining nodes wait on nodes whose cancel confirm has not arrived yet
                break

            for task_id in ready_task_ids:
                task_cancel_confirm = TaskCancelConfirm.new_msg(task_id, TaskCancelConfirmType.Canceled)
                self.__mark_node_canceled(graph_info, task_cancel_confirm)
                task_cancel_confirms.append(task_cancel_confirm)

        await self.__send_task_cancel_confirms(graph_info.client, task_cancel_confirms)

        if self.__is_graph_finished(graph_task_id):
            await self.__cancel_graph_umbrella_task(graph_task_id)

    @staticmethod
    def __mark_node_canceled(graph_info: _Graph, task_cancel_confirm: TaskCancelConfirm):
        """
        Apply a cancel confirmation to its node and mark the node done in the sorter.

        Confirmations for ids that are not nodes of the graph (e.g. the umbrella task id) are ignored.

        :raises ValueError: on an unknown cancel confirm type
        """
        task_id = task_cancel_confirm.task_id
        if task_id not in graph_info.tasks:
            return

        task_info = graph_info.tasks[task_id]
        confirm_type = task_cancel_confirm.cancel_confirm_type
        if confirm_type == TaskCancelConfirmType.Canceled:
            task_info.state = _NodeTaskState.Canceled
        elif confirm_type == TaskCancelConfirmType.CancelFailed:
            # node state is left untouched, the node is still marked done in the sorter below
            pass
        elif confirm_type == TaskCancelConfirmType.CancelNotFound:
            task_info.state = _NodeTaskState.Canceled
        else:
            raise ValueError(f"received unexpected task cancel confirm {task_cancel_confirm}")

        graph_info.sorter.done(task_id)

        if task_id in graph_info.running_task_ids:
            graph_info.running_task_ids.remove(task_id)

    async def __abort_whole_graph(self, graph_task_id: TaskID, result: TaskResult):
        """
        Propagate a failed node result to every node that is still running or has not started, send those results
        to the client, then finish the umbrella task with the failure's result type.

        Each propagated result gets its own duplicates of the failure's result objects.
        """
        # captured once, so that the messages built below never have to shadow the incoming failure
        failure_type = result.result_type
        failure_metadata = result.metadata

        if self.__is_graph_finished(graph_task_id):
            await self.__done_graph_umbrella_task(graph_task_id, failure_type)
            return

        graph_info = self._graph_task_id_to_graph[graph_task_id]
        graph_info.status = _GraphState.Aborting

        result_object_ids = [ObjectID(object_id_bytes) for object_id_bytes in result.results]
        result_objects = [
            (object_id, self._object_controller.get_object_name(object_id)) for object_id in result_object_ids
        ]

        results: List[TaskResult] = list()

        # mark all running tasks done, iterating over a copy as marking a node done shrinks running_task_ids
        for task_id in graph_info.running_task_ids.copy():
            results.append(
                await self.__fail_node(graph_info.client, task_id, failure_type, failure_metadata, result_objects)
            )

        # mark all inactive tasks done
        while graph_info.sorter.is_active():
            ready_task_ids = graph_info.sorter.get_ready()
            if not ready_task_ids:
                # same guard as in __cancel_whole_graph: without it this loop would spin without ever awaiting
                break

            for task_id in ready_task_ids:
                results.append(
                    await self.__fail_node(graph_info.client, task_id, failure_type, failure_metadata, result_objects)
                )

        await self.__send_results(graph_info.client, results)

        if self.__is_graph_finished(graph_task_id):
            await self.__done_graph_umbrella_task(graph_task_id, failure_type)

    async def __fail_node(
        self,
        owner: ClientID,
        task_id: TaskID,
        failure_type: TaskResultType,
        failure_metadata: bytes,
        result_objects: List[Tuple[ObjectID, bytes]],
    ) -> TaskResult:
        """Build a failure result for ``task_id`` out of duplicated result objects and record it on the node."""
        new_result_object_ids = await self.__duplicate_objects(owner, result_objects)
        node_result = TaskResult.new_msg(
            task_id, failure_type, failure_metadata, [bytes(object_id) for object_id in new_result_object_ids]
        )
        self.__mark_node_done(node_result)
        return node_result

    def __mark_node_done(self, result: TaskResult):
        """
        Record a node result: stop tracking the task id, store the produced object ids, update the node state,
        release the intermediate results the node consumed and mark the node done in the sorter.

        :raises KeyError: when the task id is not tracked anymore
        :raises ValueError: on an unknown result type
        """
        graph_task_id = self._task_id_to_graph_task_id.pop(result.task_id)
        graph_info = self._graph_task_id_to_graph[graph_task_id]
        task_info = graph_info.tasks[result.task_id]

        task_info.result_object_ids = [ObjectID(object_id_bytes) for object_id_bytes in result.results]

        if result.result_type == TaskResultType.Success:
            task_info.state = _NodeTaskState.Success
        elif result.result_type == TaskResultType.Failed:
            task_info.state = _NodeTaskState.Failed
        elif result.result_type == TaskResultType.FailedWorkerDied:
            task_info.state = _NodeTaskState.Failed
        else:
            raise ValueError(f"received unexpected task result {result}")

        self.__clean_intermediate_result(graph_task_id, result.task_id)
        graph_info.sorter.done(result.task_id)

        if result.task_id in graph_info.running_task_ids:
            graph_info.running_task_ids.remove(result.task_id)

    async def __cancel_graph_umbrella_task(self, graph_task_id: TaskID):
        """Once the graph is finished, drop its bookkeeping and confirm the umbrella task cancellation to the client."""
        if not self.__is_graph_finished(graph_task_id):
            return

        self._client_controller.on_task_finish(graph_task_id)

        # NOTE(review): only the umbrella id is removed here, and __mark_node_canceled never removes sub-task ids,
        # so ids of canceled sub-tasks appear to stay in _task_id_to_graph_task_id - confirm whether that is intended
        self._task_id_to_graph_task_id.pop(graph_task_id)
        info = self._graph_task_id_to_graph.pop(graph_task_id)
        await self._binder.send(
            info.client, TaskCancelConfirm.new_msg(graph_task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled)
        )

    async def __done_graph_umbrella_task(self, graph_task_id: TaskID, result_type: TaskResultType):
        """Drop the graph bookkeeping and send the umbrella task result to the client."""
        self._client_controller.on_task_finish(graph_task_id)
        self._task_id_to_graph_task_id.pop(graph_task_id)
        info = self._graph_task_id_to_graph.pop(graph_task_id)
        await self._binder.send(info.client, TaskResult.new_msg(graph_task_id, result_type))

    def __is_graph_finished(self, graph_task_id: TaskID):
        """A graph is finished when it is not tracked anymore, or when nothing is left to run and nothing runs."""
        if graph_task_id not in self._graph_task_id_to_graph:
            return True

        graph_info = self._graph_task_id_to_graph[graph_task_id]
        return not graph_info.sorter.is_active() and not graph_info.running_task_ids

    def __get_argument_object(self, graph_task_id: TaskID, argument: Union[TaskID, ObjectID]) -> ObjectID:
        """Resolve a task argument: object ids pass through, task ids become that task's single result object id."""
        if isinstance(argument, ObjectID):
            return argument

        assert isinstance(argument, TaskID)

        graph_info = self._graph_task_id_to_graph[graph_task_id]
        task_info = graph_info.tasks[argument]

        # only tasks producing exactly one result object can be used as an argument
        assert len(task_info.result_object_ids) == 1

        return task_info.result_object_ids[0]

    def __clean_intermediate_result(self, graph_task_id: TaskID, task_id: TaskID):
        """
        Release the dependencies of a node that just finished, deleting the result objects of every depended-on
        task that has no consumer left and is not a target of the graph.
        """
        graph_info = self._graph_task_id_to_graph[graph_task_id]
        task_info = graph_info.tasks[task_id]

        # __add_new_graph registers each dependency once, even when the same task id is passed several times as
        # argument, so it has to be released once as well; dict.fromkeys() de-duplicates and keeps argument order
        depended_task_ids = dict.fromkeys(
            argument for argument in task_info.task.function_args if isinstance(argument, TaskID)
        )

        for depended_task_id in depended_task_ids:
            graph_info.depended_task_id_to_task_id.remove(depended_task_id, task_id)
            if graph_info.depended_task_id_to_task_id.has_left_key(depended_task_id):
                # other nodes still need this result
                continue

            if depended_task_id in graph_info.target_task_ids:
                continue

            # delete intermediate results as they are not needed anymore
            self._object_controller.on_del_objects(
                graph_info.client, set(graph_info.tasks[depended_task_id].result_object_ids)
            )

    async def __duplicate_objects(
        self, owner: ClientID, result_objects: List[Tuple[ObjectID, bytes]]
    ) -> List[ObjectID]:
        """Duplicate every ``(object id, object name)`` pair under a freshly generated object id owned by ``owner``."""
        new_result_object_ids = [ObjectID.generate_object_id(owner) for _ in result_objects]

        futures = [
            self.__duplicate_object(owner, result_object_id, result_object_name, new_object_id)
            for (result_object_id, result_object_name), new_object_id in zip(result_objects, new_result_object_ids)
        ]

        await asyncio.gather(*futures)

        return new_result_object_ids

    async def __duplicate_object(
        self, owner: ClientID, object_id: ObjectID, object_name: bytes, new_object_id: ObjectID
    ):
        """Duplicate one object in the object storage and register the copy with the object controller."""
        await self._connector_storage.duplicate_object_id(object_id, new_object_id)

        self._object_controller.on_add_object(
            owner, new_object_id, ObjectMetadata.ObjectContentType.Object, object_name
        )

    async def __send_results(self, client_id: ClientID, results: List[TaskResult]):
        """Send all results to the client concurrently."""
        await asyncio.gather(*[self._binder.send(client_id, result) for result in results])

    async def __send_task_cancel_confirms(self, client_id: ClientID, task_cancel_confirms: List[TaskCancelConfirm]):
        """Send all cancel confirmations to the client concurrently."""
        await asyncio.gather(
            *[self._binder.send(client_id, task_cancel_confirm) for task_cancel_confirm in task_cancel_confirms]
        )
@@ -0,0 +1,81 @@
1
+ from typing import Optional
2
+
3
+ import psutil
4
+
5
+ from scaler.io.mixins import AsyncBinder, AsyncConnector
6
+ from scaler.protocol.python.message import InformationRequest, InformationSnapshot, StateScheduler
7
+ from scaler.protocol.python.status import Resource
8
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
9
+ from scaler.scheduler.controllers.mixins import (
10
+ ClientController,
11
+ InformationController,
12
+ ObjectController,
13
+ TaskController,
14
+ WorkerController,
15
+ )
16
+ from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
17
+ from scaler.utility.mixins import Looper
18
+
19
+
20
class VanillaInformationController(InformationController, Looper):
    """
    Collects the status of the scheduler and of its controllers, publishes it on the monitor connector and feeds
    the scaling controller with a snapshot of the current tasks and workers.
    """

    def __init__(self, config_controller: VanillaConfigController):
        self._config_controller = config_controller

        # handle on the current process, used to report its CPU and memory usage
        self._process = psutil.Process()

        # collaborators, injected later through register_managers()
        self._monitor_binder: Optional[AsyncConnector] = None
        self._binder: Optional[AsyncBinder] = None
        self._client_controller: Optional[ClientController] = None
        self._object_controller: Optional[ObjectController] = None
        self._task_controller: Optional[TaskController] = None
        self._worker_controller: Optional[WorkerController] = None
        self._scaling_controller: Optional[ScalingController] = None

    def register_managers(
        self,
        monitor_binder: AsyncConnector,
        binder: AsyncBinder,
        client_controller: ClientController,
        object_controller: ObjectController,
        task_controller: TaskController,
        worker_controller: WorkerController,
        scaling_controller: ScalingController,
    ):
        """Inject the connectors and sibling controllers whose status gets reported."""
        self._monitor_binder = monitor_binder
        self._binder = binder
        self._client_controller = client_controller
        self._object_controller = object_controller
        self._task_controller = task_controller
        self._worker_controller = worker_controller
        self._scaling_controller = scaling_controller

    async def on_request(self, request: InformationRequest):
        """Handle an information request; currently a no-op."""
        # TODO: implement commands
        return None

    async def routine(self):
        """Publish one ``StateScheduler`` message, then hand one ``InformationSnapshot`` to the scaling controller."""
        publish = self._monitor_binder.send

        binder_status = self._binder.get_status()

        # the process CPU percentage is multiplied by 10 and truncated to an integer before being reported
        cpu_usage = int(self._process.cpu_percent() * 10)
        scheduler_resource = Resource.new_msg(cpu_usage, self._process.memory_info().rss)
        available_memory = psutil.virtual_memory().available

        client_status = self._client_controller.get_status()
        object_status = self._object_controller.get_status()
        task_status = self._task_controller.get_status()
        worker_status = self._worker_controller.get_status()
        scaling_status = self._scaling_controller.get_status()

        await publish(
            StateScheduler.new_msg(
                binder=binder_status,
                scheduler=scheduler_resource,
                rss_free=available_memory,
                client_manager=client_status,
                object_manager=object_status,
                task_manager=task_status,
                worker_manager=worker_status,
                scaling_manager=scaling_status,
            )
        )

        notify_scaling = self._scaling_controller.on_snapshot

        tasks = self._task_controller._task_id_to_task  # type: ignore # noqa: Expose this later
        alive_workers = self._worker_controller._worker_alive_since  # type: ignore # noqa: Expose this later

        # every entry is a pair of which only the second element, the heartbeat, is kept
        heartbeats = {worker_id: entry[1] for worker_id, entry in alive_workers.items()}

        await notify_scaling(InformationSnapshot(tasks=tasks, workers=heartbeats))
+ )