opengris-scaler 1.12.37 (cp38-cp38-musllinux_1_2_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/scheduler/controllers/config_controller.py
@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Dict
+
+from scaler.config.section.scheduler import SchedulerConfig
+from scaler.scheduler.controllers.mixins import ConfigController
+
+
+class VanillaConfigController(ConfigController):
+    def __init__(self, config: SchedulerConfig):
+        self._config: Dict[str, Any] = {}
+
+        for key, value in config.__dict__.items():
+            self.update_config(key, value)
+
+    def get_config(self, path: str) -> Any:
+        if path not in self._config:
+            raise KeyError(f"No such config: `{path}`")
+
+        return self._config[path]
+
+    def update_config(self, path: str, value: Any):
+        # TODO: add an update-config message so the config can be updated on the fly
+
+        if path not in self._config:
+            self._config[path] = value
+            logging.info(f"ConfigController: {path} = {value}")
+            return
+
+        old_value = self._config[path]
+        self._config[path] = value
+        logging.info(f"ConfigController: updated `{path}` from `{old_value}` to `{value}`")
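The controller above flattens the scheduler config into a plain key/value map at construction time, so any object whose `__dict__` holds settings can drive it. A minimal usage sketch, where the stand-in config class and its fields are hypothetical (the real SchedulerConfig lives in scaler/config/section/scheduler.py):

import dataclasses

from scaler.scheduler.controllers.config_controller import VanillaConfigController


@dataclasses.dataclass
class _FakeSchedulerConfig:
    # hypothetical fields for illustration only
    event_loop: str = "builtin"
    load_balance_seconds: int = 1


# only config.__dict__ is read, so a stand-in works despite the SchedulerConfig annotation
controller = VanillaConfigController(_FakeSchedulerConfig())  # type: ignore[arg-type]
print(controller.get_config("load_balance_seconds"))  # -> 1
controller.update_config("load_balance_seconds", 5)   # logs: updated `load_balance_seconds` from `1` to `5`
controller.get_config("missing")                      # raises KeyError: No such config: `missing`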
scaler/scheduler/controllers/graph_controller.py
@@ -0,0 +1,424 @@
+import asyncio
+import dataclasses
+import enum
+from asyncio import Queue
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
+from scaler.protocol.python.common import ObjectMetadata, TaskCancelConfirmType, TaskResultType
+from scaler.protocol.python.message import GraphTask, StateGraphTask, Task, TaskCancel, TaskCancelConfirm, TaskResult
+from scaler.scheduler.controllers.config_controller import VanillaConfigController
+from scaler.scheduler.controllers.mixins import ClientController, GraphTaskController, ObjectController, TaskController
+from scaler.utility.graph.topological_sorter import TopologicalSorter
+from scaler.utility.identifiers import ClientID, ObjectID, TaskID
+from scaler.utility.many_to_many_dict import ManyToManyDict
+from scaler.utility.mixins import Looper, Reporter
+
+
+class _NodeTaskState(enum.Enum):
+    Inactive = enum.auto()
+    Running = enum.auto()
+    Canceled = enum.auto()
+    Failed = enum.auto()
+    Success = enum.auto()
+
+
+class _GraphState(enum.Enum):
+    Running = enum.auto()
+    Canceling = enum.auto()
+    Aborting = enum.auto()
+
+
+@dataclasses.dataclass
+class _TaskInfo:
+    state: _NodeTaskState
+    task: Task
+    result_object_ids: List[ObjectID] = dataclasses.field(default_factory=list)
+
+
+@dataclasses.dataclass
+class _Graph:
+    target_task_ids: List[TaskID]
+    sorter: TopologicalSorter
+    tasks: Dict[TaskID, _TaskInfo]
+    depended_task_id_to_task_id: ManyToManyDict[TaskID, TaskID]
+    client: ClientID
+    status: _GraphState = dataclasses.field(default=_GraphState.Running)
+    running_task_ids: Set[TaskID] = dataclasses.field(default_factory=set)
+
+
+class VanillaGraphTaskController(GraphTaskController, Looper, Reporter):
+    """
+    The graph task manager sits on top of the normal task manager; for each received graph task it maintains a
+    synthetic umbrella task, and once the graph finishes it echoes the umbrella task's result back to the client.
+
+    A = func()
+    B = func2(A)
+    C = func3(A)
+    D = func4(B, C)
+
+    graph
+        A = Task(func)
+        B = Task(func2, A)
+        C = Task(func3, A)
+        D = Task(func4, B, C)
+
+    dependencies
+        {"A": {B, C},
+         "B": {D},
+         "C": {D},
+         "D": {}}
+    """
+
+    def __init__(self, config_controller: VanillaConfigController):
+        self._config_controller = config_controller
+
+        self._binder: Optional[AsyncBinder] = None
+        self._binder_monitor: Optional[AsyncConnector] = None
+        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
+
+        self._client_controller: Optional[ClientController] = None
+        self._task_controller: Optional[TaskController] = None
+        self._object_controller: Optional[ObjectController] = None
+
+        self._unassigned: Queue = Queue()
+
+        self._graph_task_id_to_graph: Dict[TaskID, _Graph] = dict()
+        self._task_id_to_graph_task_id: Dict[TaskID, TaskID] = dict()
+
+    def register(
+        self,
+        binder: AsyncBinder,
+        binder_monitor: AsyncConnector,
+        connector_storage: AsyncObjectStorageConnector,
+        client_controller: ClientController,
+        task_controller: TaskController,
+        object_controller: ObjectController,
+    ):
+        self._binder = binder
+        self._binder_monitor = binder_monitor
+        self._connector_storage = connector_storage
+        self._client_controller = client_controller
+        self._task_controller = task_controller
+        self._object_controller = object_controller
+
+    async def on_graph_task(self, client_id: ClientID, graph_task: GraphTask):
+        await self._unassigned.put((client_id, graph_task))
+
+    async def on_graph_task_cancel(self, task_cancel: TaskCancel):
+        graph_task_id = self._task_id_to_graph_task_id[task_cancel.task_id]
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+
+        if graph_info.status in {_GraphState.Canceling, _GraphState.Aborting}:
+            # the graph is already canceling or aborting; no need to start whole-graph canceling again
+            return
+
+        # canceling any subtask cancels the whole graph
+        await self.__cancel_whole_graph(graph_task_id)
+
+    async def on_graph_sub_task_result(self, result: TaskResult):
+        graph_task_id = self._task_id_to_graph_task_id[result.task_id]
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+
+        if graph_info.status == _GraphState.Canceling:
+            # while the whole graph is canceling, a result can arrive before its cancel confirm;
+            # treat such results as cancel confirms
+            await self.on_graph_sub_task_cancel_confirm(
+                TaskCancelConfirm.new_msg(result.task_id, TaskCancelConfirmType.Canceled)
+            )
+            return
+
+        self.__mark_node_done(result)
+
+        if result.result_type == TaskResultType.Success:
+            await self.__check_one_graph(graph_task_id)
+            return
+
+        assert result.result_type != TaskResultType.Success
+        await self.__abort_whole_graph(graph_task_id, result)
+
+    async def on_graph_sub_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
+        graph_task_id = self._task_id_to_graph_task_id[task_cancel_confirm.task_id]
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        self.__mark_node_canceled(graph_info, task_cancel_confirm)
+        await self.__cancel_whole_graph(graph_task_id)
+
+    def is_graph_subtask(self, task_id: TaskID):
+        return task_id in self._task_id_to_graph_task_id
+
+    async def routine(self):
+        client, graph_task = await self._unassigned.get()
+        await self.__add_new_graph(client, graph_task)
+
+    def get_status(self) -> Dict:
+        return {"graph_manager": {"unassigned": self._unassigned.qsize()}}
+
+    async def __add_new_graph(self, client_id: ClientID, graph_task: GraphTask):
+        graph = {}
+
+        self._client_controller.on_task_begin(client_id, graph_task.task_id)
+
+        # register the graph umbrella task; note that the umbrella task is itself a graph subtask
+        self._task_id_to_graph_task_id[graph_task.task_id] = graph_task.task_id
+
+        tasks = dict()
+        depended_task_id_to_task_id: ManyToManyDict[TaskID, TaskID] = ManyToManyDict()
+        for task in graph_task.graph:
+            self._task_id_to_graph_task_id[task.task_id] = graph_task.task_id
+            tasks[task.task_id] = _TaskInfo(_NodeTaskState.Inactive, task)
+
+            required_task_ids = {arg for arg in task.function_args if isinstance(arg, TaskID)}
+            for required_task_id in required_task_ids:
+                depended_task_id_to_task_id.add(required_task_id, task.task_id)
+
+            graph[task.task_id] = required_task_ids
+
+            await self._binder_monitor.send(
+                StateGraphTask.new_msg(
+                    graph_task.task_id,
+                    task.task_id,
+                    (
+                        StateGraphTask.NodeTaskType.Target
+                        if task.task_id in graph_task.targets
+                        else StateGraphTask.NodeTaskType.Normal
+                    ),
+                    required_task_ids,
+                )
+            )
+
+        sorter = TopologicalSorter(graph)
+        sorter.prepare()
+
+        self._graph_task_id_to_graph[graph_task.task_id] = _Graph(
+            graph_task.targets, sorter, tasks, depended_task_id_to_task_id, client_id
+        )
+        await self.__check_one_graph(graph_task.task_id)
+
+    async def __check_one_graph(self, graph_task_id: TaskID):
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        if not graph_info.sorter.is_active():
+            await self.__done_graph_umbrella_task(graph_task_id, TaskResultType.Success)
+            return
+
+        ready_task_ids = graph_info.sorter.get_ready()
+        if not ready_task_ids:
+            return
+
+        for task_id in ready_task_ids:
+            task_info = graph_info.tasks[task_id]
+            task_info.state = _NodeTaskState.Running
+            graph_info.running_task_ids.add(task_id)
+
+            task = Task.new_msg(
+                task_id=task_info.task.task_id,
+                source=task_info.task.source,
+                metadata=task_info.task.metadata,
+                func_object_id=task_info.task.func_object_id,
+                function_args=[self.__get_argument_object(graph_task_id, arg) for arg in task_info.task.function_args],
+                capabilities=task_info.task.capabilities,
+            )
+
+            await self._task_controller.on_task_new(task)
+
+    async def __cancel_whole_graph(self, graph_task_id: TaskID):
+        if self.__is_graph_finished(graph_task_id):
+            return
+
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        graph_info.status = _GraphState.Canceling
+
+        await asyncio.gather(
+            *[
+                self._task_controller.on_task_cancel(
+                    graph_info.client, TaskCancel.new_msg(task_id, flags=TaskCancel.TaskCancelFlags(force=True))
+                )
+                for task_id in graph_info.running_task_ids
+            ]
+        )
+
+        # cancel all inactive tasks
+        task_cancel_confirms: List[TaskCancelConfirm] = list()
+        while graph_info.sorter.is_active():
+            ready_task_ids = graph_info.sorter.get_ready()
+            if not ready_task_ids:
+                break
+
+            for task_id in ready_task_ids:
+                task_cancel_confirm = TaskCancelConfirm.new_msg(task_id, TaskCancelConfirmType.Canceled)
+                self.__mark_node_canceled(graph_info, task_cancel_confirm)
+                task_cancel_confirms.append(task_cancel_confirm)
+
+        await self.__send_task_cancel_confirms(graph_info.client, task_cancel_confirms)
+
+        if self.__is_graph_finished(graph_task_id):
+            await self.__cancel_graph_umbrella_task(graph_task_id)
+
+    @staticmethod
+    def __mark_node_canceled(graph_info: _Graph, task_cancel_confirm: TaskCancelConfirm):
+        if task_cancel_confirm.task_id not in graph_info.tasks:
+            return
+
+        task_info = graph_info.tasks[task_cancel_confirm.task_id]
+        if task_cancel_confirm.cancel_confirm_type == TaskCancelConfirmType.Canceled:
+            task_info.state = _NodeTaskState.Canceled
+        elif task_cancel_confirm.cancel_confirm_type == TaskCancelConfirmType.CancelFailed:
+            pass
+        elif task_cancel_confirm.cancel_confirm_type == TaskCancelConfirmType.CancelNotFound:
+            task_info.state = _NodeTaskState.Canceled
+        else:
+            raise ValueError(f"received unexpected task cancel confirm {task_cancel_confirm}")
+
+        graph_info.sorter.done(task_cancel_confirm.task_id)
+
+        if task_cancel_confirm.task_id in graph_info.running_task_ids:
+            graph_info.running_task_ids.remove(task_cancel_confirm.task_id)
+
+    async def __abort_whole_graph(self, graph_task_id: TaskID, result: TaskResult):
+        if self.__is_graph_finished(graph_task_id):
+            await self.__done_graph_umbrella_task(graph_task_id, result.result_type)
+            return
+
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        graph_info.status = _GraphState.Aborting
+
+        result_object_ids = [ObjectID(object_id_bytes) for object_id_bytes in result.results]
+        result_objects = [
+            (object_id, self._object_controller.get_object_name(object_id)) for object_id in result_object_ids
+        ]
+
+        # mark all running tasks done
+        results: List[TaskResult] = list()
+        for task_id in graph_info.running_task_ids.copy():
+            new_result_object_ids = await self.__duplicate_objects(graph_info.client, result_objects)
+            result = TaskResult.new_msg(
+                task_id, result.result_type, result.metadata, [bytes(object_id) for object_id in new_result_object_ids]
+            )
+            self.__mark_node_done(result)
+            results.append(result)
+
+        # mark all inactive tasks done
+        while graph_info.sorter.is_active():
+            for task_id in graph_info.sorter.get_ready():
+                new_result_object_ids = await self.__duplicate_objects(graph_info.client, result_objects)
+                result = TaskResult.new_msg(
+                    task_id,
+                    result.result_type,
+                    result.metadata,
+                    [bytes(object_id) for object_id in new_result_object_ids],
+                )
+                self.__mark_node_done(result)
+                results.append(result)
+
+        await self.__send_results(graph_info.client, results)
+
+        if self.__is_graph_finished(graph_task_id):
+            await self.__done_graph_umbrella_task(graph_task_id, result.result_type)
+            return
+
+    def __mark_node_done(self, result: TaskResult):
+        graph_task_id = self._task_id_to_graph_task_id.pop(result.task_id)
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        task_info = graph_info.tasks[result.task_id]
+
+        task_info.result_object_ids = [ObjectID(object_id_bytes) for object_id_bytes in result.results]
+
+        if result.result_type == TaskResultType.Success:
+            task_info.state = _NodeTaskState.Success
+        elif result.result_type == TaskResultType.Failed:
+            task_info.state = _NodeTaskState.Failed
+        elif result.result_type == TaskResultType.FailedWorkerDied:
+            task_info.state = _NodeTaskState.Failed
+        else:
+            raise ValueError(f"received unexpected task result {result}")
+
+        self.__clean_intermediate_result(graph_task_id, result.task_id)
+        graph_info.sorter.done(result.task_id)
+
+        if result.task_id in graph_info.running_task_ids:
+            graph_info.running_task_ids.remove(result.task_id)
+
+    async def __cancel_graph_umbrella_task(self, graph_task_id: TaskID):
+        if not self.__is_graph_finished(graph_task_id):
+            return
+
+        self._client_controller.on_task_finish(graph_task_id)
+        self._task_id_to_graph_task_id.pop(graph_task_id)
+        info = self._graph_task_id_to_graph.pop(graph_task_id)
+        await self._binder.send(
+            info.client, TaskCancelConfirm.new_msg(graph_task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled)
+        )
+
+    async def __done_graph_umbrella_task(self, graph_task_id: TaskID, result_type: TaskResultType):
+        self._client_controller.on_task_finish(graph_task_id)
+        self._task_id_to_graph_task_id.pop(graph_task_id)
+        info = self._graph_task_id_to_graph.pop(graph_task_id)
+        await self._binder.send(info.client, TaskResult.new_msg(graph_task_id, result_type))
+
+    def __is_graph_finished(self, graph_task_id: TaskID):
+        if graph_task_id not in self._graph_task_id_to_graph:
+            return True
+
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        return not graph_info.sorter.is_active() and not graph_info.running_task_ids
+
+    def __get_argument_object(self, graph_task_id: TaskID, argument: Union[TaskID, ObjectID]) -> ObjectID:
+        if isinstance(argument, ObjectID):
+            return argument
+
+        assert isinstance(argument, TaskID)
+
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        task_info = graph_info.tasks[argument]
+
+        assert len(task_info.result_object_ids) == 1
+
+        return task_info.result_object_ids[0]
+
+    def __clean_intermediate_result(self, graph_task_id: TaskID, task_id: TaskID):
+        graph_info = self._graph_task_id_to_graph[graph_task_id]
+        task_info = graph_info.tasks[task_id]
+
+        for argument in task_info.task.function_args:
+            if not isinstance(argument, TaskID):
+                continue
+
+            graph_info.depended_task_id_to_task_id.remove(argument, task_id)
+            if graph_info.depended_task_id_to_task_id.has_left_key(argument):
+                continue
+
+            if argument in graph_info.target_task_ids:
+                continue
+
+            # delete intermediate results as they are not needed anymore
+            self._object_controller.on_del_objects(graph_info.client, set(graph_info.tasks[argument].result_object_ids))
+
+    async def __duplicate_objects(
+        self, owner: ClientID, result_objects: List[Tuple[ObjectID, bytes]]
+    ) -> List[ObjectID]:
+        new_result_object_ids = [ObjectID.generate_object_id(owner) for _ in result_objects]
+
+        futures = [
+            self.__duplicate_object(owner, result_object_id, result_object_name, new_object_id)
+            for (result_object_id, result_object_name), new_object_id in zip(result_objects, new_result_object_ids)
+        ]
+
+        await asyncio.gather(*futures)
+
+        return new_result_object_ids
+
+    async def __duplicate_object(
+        self, owner: ClientID, object_id: ObjectID, object_name: bytes, new_object_id: ObjectID
+    ):
+        await self._connector_storage.duplicate_object_id(object_id, new_object_id)
+
+        self._object_controller.on_add_object(
+            owner, new_object_id, ObjectMetadata.ObjectContentType.Object, object_name
+        )
+
+    async def __send_results(self, client_id: ClientID, results: List[TaskResult]):
+        await asyncio.gather(*[self._binder.send(client_id, result) for result in results])
+
+    async def __send_task_cancel_confirms(self, client_id: ClientID, task_cancel_confirms: List[TaskCancelConfirm]):
+        await asyncio.gather(
+            *[self._binder.send(client_id, task_cancel_confirm) for task_cancel_confirm in task_cancel_confirms]
+        )
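`__add_new_graph` feeds the sorter a mapping of each task to its prerequisite tasks, and `__check_one_graph` / `__mark_node_done` then drive dispatch through `prepare()` / `get_ready()` / `done()`. Below is a standalone sketch of that loop on the docstring's A/B/C/D example, assuming `scaler.utility.graph.topological_sorter` exposes the same interface as the stdlib `graphlib` sorter used here:

from graphlib import TopologicalSorter

# each node maps to the tasks it requires; note this is the inverse of the
# docstring's "dependencies" view, which shows dependents as kept in the ManyToManyDict
graph = {"A": set(), "B": {"A"}, "C": {"A"}, "D": {"B", "C"}}

sorter = TopologicalSorter(graph)
sorter.prepare()

while sorter.is_active():
    for task_id in sorter.get_ready():   # ("A",) first, then ("B", "C"), then ("D",)
        print(f"dispatch {task_id}")     # the controller calls self._task_controller.on_task_new(task)
        sorter.done(task_id)             # the controller defers this until the TaskResult arrives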
scaler/scheduler/controllers/information_controller.py
@@ -0,0 +1,81 @@
+from typing import Optional
+
+import psutil
+
+from scaler.io.mixins import AsyncBinder, AsyncConnector
+from scaler.protocol.python.message import InformationRequest, InformationSnapshot, StateScheduler
+from scaler.protocol.python.status import Resource
+from scaler.scheduler.controllers.config_controller import VanillaConfigController
+from scaler.scheduler.controllers.mixins import (
+    ClientController,
+    InformationController,
+    ObjectController,
+    TaskController,
+    WorkerController,
+)
+from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
+from scaler.utility.mixins import Looper
+
+
+class VanillaInformationController(InformationController, Looper):
+    def __init__(self, config_controller: VanillaConfigController):
+        self._config_controller = config_controller
+
+        self._process = psutil.Process()
+
+        self._monitor_binder: Optional[AsyncConnector] = None
+        self._binder: Optional[AsyncBinder] = None
+        self._client_controller: Optional[ClientController] = None
+        self._object_controller: Optional[ObjectController] = None
+        self._task_controller: Optional[TaskController] = None
+        self._worker_controller: Optional[WorkerController] = None
+        self._scaling_controller: Optional[ScalingController] = None
+
+    def register_managers(
+        self,
+        monitor_binder: AsyncConnector,
+        binder: AsyncBinder,
+        client_controller: ClientController,
+        object_controller: ObjectController,
+        task_controller: TaskController,
+        worker_controller: WorkerController,
+        scaling_controller: ScalingController,
+    ):
+        self._monitor_binder = monitor_binder
+        self._binder = binder
+        self._client_controller = client_controller
+        self._object_controller = object_controller
+        self._task_controller = task_controller
+        self._worker_controller = worker_controller
+        self._scaling_controller = scaling_controller
+
+    async def on_request(self, request: InformationRequest):
+        # TODO: implement commands
+        pass
+
+    async def routine(self):
+        await self._monitor_binder.send(
+            StateScheduler.new_msg(
+                binder=self._binder.get_status(),
+                scheduler=Resource.new_msg(int(self._process.cpu_percent() * 10), self._process.memory_info().rss),
+                rss_free=psutil.virtual_memory().available,
+                client_manager=self._client_controller.get_status(),
+                object_manager=self._object_controller.get_status(),
+                task_manager=self._task_controller.get_status(),
+                worker_manager=self._worker_controller.get_status(),
+                scaling_manager=self._scaling_controller.get_status(),
+            )
+        )
+
+        await self._scaling_controller.on_snapshot(
+            InformationSnapshot(
+                tasks=self._task_controller._task_id_to_task,  # type: ignore # noqa: Expose this later
+                workers={
+                    worker_id: worker_heartbeat
+                    for worker_id, (
+                        _,
+                        worker_heartbeat,
+                    ) in self._worker_controller._worker_alive_since.items()  # type: ignore # noqa: Expose this later
+                },
+            )
+        )
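`routine()` publishes one `StateScheduler` snapshot per invocation; as a `Looper`, the controller relies on the scheduler's event loop to call it periodically. A minimal sketch of that driving pattern, where the `run_looper` helper and the 1-second interval are assumptions for illustration (the real wiring presumably lives in scaler/scheduler/scheduler.py and scaler/utility/event_loop.py from the file list above):

import asyncio

from scaler.utility.mixins import Looper


async def run_looper(looper: Looper, interval_seconds: float = 1.0) -> None:
    # drive a Looper's routine() on a fixed cadence; task cancellation stops the loop
    while True:
        await looper.routine()
        await asyncio.sleep(interval_seconds)

# e.g. asyncio.create_task(run_looper(information_controller)) once register_managers() has run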