opengris-scaler 1.12.37 (cp38-cp38-musllinux_1_2_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0

scaler/scheduler/controllers/mixins.py
@@ -0,0 +1,194 @@
+import abc
+from typing import Any, Optional, Set
+
+from scaler.protocol.python.common import ObjectMetadata
+from scaler.protocol.python.message import (
+    ClientDisconnect,
+    ClientHeartbeat,
+    DisconnectRequest,
+    GraphTask,
+    InformationRequest,
+    ObjectInstruction,
+    Task,
+    TaskCancel,
+    TaskCancelConfirm,
+    TaskResult,
+    WorkerHeartbeat,
+)
+from scaler.utility.identifiers import ClientID, ObjectID, TaskID, WorkerID
+from scaler.utility.mixins import Reporter
+
+
+class ConfigController(metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    def get_config(self, path: str) -> Any:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def update_config(self, path: str, value: Any):
+        raise NotImplementedError()
+
+
+class ObjectController(Reporter):
+    @abc.abstractmethod
+    async def on_object_instruction(self, source: bytes, request: ObjectInstruction):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def on_add_object(
+        self,
+        client_id: ClientID,
+        object_id: ObjectID,
+        object_type: ObjectMetadata.ObjectContentType,
+        object_name: bytes,
+    ):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def on_del_objects(self, client_id: ClientID, object_ids: Set[ObjectID]):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def clean_client(self, client_id: ClientID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def has_object(self, object_id: ObjectID) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_object_name(self, object_id: ObjectID) -> bytes:
+        raise NotImplementedError()
+
+
+class ClientController(Reporter):
+    @abc.abstractmethod
+    def get_client_task_ids(self, client_id: ClientID) -> Set[TaskID]:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def has_client_id(self, client_id: ClientID) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_client_id(self, task_id: TaskID) -> Optional[ClientID]:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def on_task_begin(self, client_id: ClientID, task_id: TaskID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def on_task_finish(self, task_id: TaskID) -> bytes:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_heartbeat(self, client_id: ClientID, info: ClientHeartbeat):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_client_disconnect(self, client_id: ClientID, request: ClientDisconnect):
+        raise NotImplementedError()
+
+
+class GraphTaskController(Reporter):
+    @abc.abstractmethod
+    async def on_graph_task(self, client_id: ClientID, graph_task: GraphTask):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_graph_task_cancel(self, graph_task_cancel: TaskCancel):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_graph_sub_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_graph_sub_task_result(self, result: TaskResult) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def is_graph_subtask(self, task_id: TaskID) -> bool:
+        raise NotImplementedError()
+
+
+class TaskController(Reporter):
+    @abc.abstractmethod
+    async def on_task_new(self, task: Task):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_cancel(self, client_id: ClientID, task_cancel: TaskCancel):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_balance_cancel(self, task_id: TaskID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_result(self, result: TaskResult):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_worker_connect(self, worker_id: WorkerID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_worker_disconnect(self, task_id: TaskID, worker_id: WorkerID):
+        raise NotImplementedError()
+
+
+class WorkerController(Reporter):
+    @abc.abstractmethod
+    def acquire_worker(self, task: Task) -> Optional[WorkerID]:
+        """this acquires worker should be atomic, means it cannot be async decorated, otherwise it will create gap that
+        get worker but task is not send to worker, and cannot find task in the worker state"""
+
+        # TODO: this function should return things that expose 3 kinds of information:
+        # TODO: 1. worker id as bytes if have capacity and able to assign to worker id
+        # TODO: 2. capacity is full, and unable to add new task
+        # TODO: 3. capacity is not full, but all the workers are busy right now, so tasks will be queued
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_cancel(self, task_cancel: TaskCancel) -> bytes:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_task_done(self, task_id: TaskID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_heartbeat(self, worker_id: WorkerID, info: WorkerHeartbeat):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_client_shutdown(self, client_id: ClientID):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    async def on_disconnect(self, worker_id: WorkerID, request: DisconnectRequest):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def has_available_worker(self) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_worker_ids(self) -> Set[WorkerID]:
+        raise NotImplementedError()
+
+
+class InformationController(metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    async def on_request(self, request: InformationRequest):
+        raise NotImplementedError()
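
The controllers above are pure interfaces: each scheduler component is declared as an abstract base class (most of them mixing in Reporter), every method raises NotImplementedError, and the concrete behavior lives in the Vanilla* implementations elsewhere in the package. As an illustration only (not part of the wheel), a minimal dict-backed implementation of the ConfigController interface could look like the sketch below; the class name InMemoryConfigController and the idea of dotted-path keys are assumptions made for the example.

from typing import Any, Dict

from scaler.scheduler.controllers.mixins import ConfigController


class InMemoryConfigController(ConfigController):
    """Hypothetical example: keeps configuration values in a plain dict keyed by path."""

    def __init__(self, initial: Dict[str, Any]):
        self._values: Dict[str, Any] = dict(initial)

    def get_config(self, path: str) -> Any:
        # Look up a value by its configuration path.
        return self._values[path]

    def update_config(self, path: str, value: Any):
        # Overwrite (or create) the value stored under the given path.
        self._values[path] = value
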
scaler/scheduler/controllers/object_controller.py
@@ -0,0 +1,147 @@
+import dataclasses
+import logging
+from asyncio import Queue
+from typing import Optional, Set
+
+from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
+from scaler.protocol.python.common import ObjectMetadata
+from scaler.protocol.python.message import ObjectInstruction
+from scaler.protocol.python.status import ObjectManagerStatus
+from scaler.scheduler.controllers.config_controller import VanillaConfigController
+from scaler.scheduler.controllers.mixins import ClientController, ObjectController, WorkerController
+from scaler.scheduler.object_usage.object_tracker import ObjectTracker, ObjectUsage
+from scaler.utility.identifiers import ClientID, ObjectID
+from scaler.utility.mixins import Looper, Reporter
+
+
+@dataclasses.dataclass
+class _ObjectCreation(ObjectUsage):
+    object_id: ObjectID
+    object_creator: ClientID
+    object_type: ObjectMetadata.ObjectContentType
+    object_name: bytes
+
+    def get_object_key(self) -> ObjectID:
+        return self.object_id
+
+
+class VanillaObjectController(ObjectController, Looper, Reporter):
+    def __init__(self, config_controller: VanillaConfigController):
+        self._config_controller = config_controller
+
+        self._object_tracker: ObjectTracker[ClientID, ObjectID, _ObjectCreation] = ObjectTracker(
+            "object_usage", self.__finished_object_storage
+        )
+
+        self._queue_deleted_object_ids: Queue[ObjectID] = Queue()
+
+        self._binder: Optional[AsyncBinder] = None
+        self._binder_monitor: Optional[AsyncConnector] = None
+        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
+
+        self._client_manager: Optional[ClientController] = None
+        self._worker_manager: Optional[WorkerController] = None
+
+    def register(
+        self,
+        binder: AsyncBinder,
+        binder_monitor: AsyncConnector,
+        connector_storage: AsyncObjectStorageConnector,
+        client_manager: ClientController,
+        worker_manager: WorkerController,
+    ):
+        self._binder = binder
+        self._binder_monitor = binder_monitor
+        self._connector_storage = connector_storage
+        self._client_manager = client_manager
+        self._worker_manager = worker_manager
+
+    async def on_object_instruction(self, source: bytes, instruction: ObjectInstruction):
+        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Create:
+            self.__on_object_create(source, instruction)
+            return
+
+        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
+            self.on_del_objects(instruction.object_user, set(instruction.object_metadata.object_ids))
+            return
+
+        logging.error(f"received unknown object instruction_type={instruction.instruction_type} from {source=}")
+
+    def on_add_object(
+        self,
+        client_id: ClientID,
+        object_id: ObjectID,
+        object_type: ObjectMetadata.ObjectContentType,
+        object_name: bytes,
+    ):
+        creation = _ObjectCreation(object_id, client_id, object_type, object_name)
+        logging.debug(
+            f"add object cache "
+            f"object_name={creation.object_name!r}, "
+            f"object_type={creation.object_type}, "
+            f"object_id={creation.object_id!r}"
+        )
+
+        self._object_tracker.add_object(creation)
+        self._object_tracker.add_blocks_for_one_object(creation.get_object_key(), {creation.object_creator})
+
+    def on_del_objects(self, client_id: ClientID, object_ids: Set[ObjectID]):
+        for object_id in object_ids:
+            self._object_tracker.remove_one_block_for_objects({object_id}, client_id)
+
+    def clean_client(self, client_id: ClientID):
+        self._object_tracker.remove_blocks({client_id})
+
+    async def routine(self):
+        await self.__routine_send_objects_deletions()
+
+    def has_object(self, object_id: ObjectID) -> bool:
+        return self._object_tracker.has_object(object_id)
+
+    def get_object_name(self, object_id: ObjectID) -> bytes:
+        if not self.has_object(object_id):
+            return b"<Unknown>"
+
+        return self._object_tracker.get_object(object_id).object_name
+
+    def get_status(self) -> ObjectManagerStatus:
+        return ObjectManagerStatus.new_msg(self._object_tracker.object_count())
+
+    async def __routine_send_objects_deletions(self):
+        deleted_object_ids = [await self._queue_deleted_object_ids.get()]
+        self._queue_deleted_object_ids.task_done()
+
+        while not self._queue_deleted_object_ids.empty():
+            deleted_object_ids.append(self._queue_deleted_object_ids.get_nowait())
+            self._queue_deleted_object_ids.task_done()
+
+        for worker in self._worker_manager.get_worker_ids():
+            await self._binder.send(
+                worker,
+                ObjectInstruction.new_msg(
+                    ObjectInstruction.ObjectInstructionType.Delete,
+                    # TODO: ideally object_user should be set to the owning client ID, but then we cannot batch these
+                    # Delete instructions.
+                    None,
+                    ObjectMetadata.new_msg(tuple(deleted_object_ids)),
+                ),
+            )
+
+        for object_id in deleted_object_ids:
+            await self._connector_storage.delete_object(object_id)
+
+    def __on_object_create(self, source: bytes, instruction: ObjectInstruction):
+        if not self._client_manager.has_client_id(instruction.object_user):
+            logging.error(f"received object creation from {source!r} for unknown client {instruction.object_user!r}")
+            return
+
+        for object_id, object_type, object_name in zip(
+            instruction.object_metadata.object_ids,
+            instruction.object_metadata.object_types,
+            instruction.object_metadata.object_names,
+        ):
+            self.on_add_object(instruction.object_user, object_id, object_type, object_name)
+
+    def __finished_object_storage(self, creation: _ObjectCreation):
+        logging.debug(f"del object cache object_name={creation.object_name!r}, object_id={creation.object_id!r}")
+        self._queue_deleted_object_ids.put_nowait(creation.object_id)

scaler/scheduler/controllers/scaling_policies/__init__.py: File without changes

scaler/scheduler/controllers/scaling_policies/fixed_elastic.py
@@ -0,0 +1,145 @@
+import logging
+import math
+from typing import Dict, List, Literal, Optional
+
+import aiohttp
+from aiohttp import web
+
+from scaler.protocol.python.message import InformationSnapshot
+from scaler.protocol.python.status import ScalingManagerStatus
+from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
+from scaler.scheduler.controllers.scaling_policies.types import WorkerGroupID
+from scaler.utility.identifiers import WorkerID
+
+WorkerAdapterLabel = Literal["primary", "secondary"]
+
+
+class FixedElasticScalingController(ScalingController):
+    def __init__(self, primary_adapter_webhook_url: str, secondary_adapter_webhook_url: str):
+        self._primary_webhook = primary_adapter_webhook_url
+        self._secondary_webhook = secondary_adapter_webhook_url
+        self._primary_group_limit = 1
+        self._lower_task_ratio = 1
+        self._upper_task_ratio = 10
+
+        self._worker_groups: Dict[WorkerGroupID, List[WorkerID]] = {}
+        self._worker_group_source: Dict[WorkerGroupID, WorkerAdapterLabel] = {}
+
+    def get_status(self):
+        return ScalingManagerStatus.new_msg(worker_groups=self._worker_groups)
+
+    async def on_snapshot(self, information_snapshot: InformationSnapshot):
+        if not information_snapshot.workers:
+            if information_snapshot.tasks:
+                await self._start_worker_group()
+            return
+
+        task_ratio = len(information_snapshot.tasks) / len(information_snapshot.workers)
+        if task_ratio > self._upper_task_ratio:
+            await self._start_worker_group()
+        elif task_ratio < self._lower_task_ratio:
+            worker_group_task_counts = {
+                worker_group_id: sum(
+                    information_snapshot.workers[worker_id].queued_tasks
+                    for worker_id in worker_ids
+                    if worker_id in information_snapshot.workers
+                )
+                for worker_group_id, worker_ids in self._worker_groups.items()
+            }
+            if not worker_group_task_counts:
+                logging.warning("No worker groups available to shut down.")
+                return
+
+            # Prefer shutting down secondary adapter groups first
+            secondary_groups = [
+                (group_id, task_count)
+                for group_id, task_count in worker_group_task_counts.items()
+                if self._worker_group_source.get(group_id) == "secondary"
+            ]
+            if secondary_groups:
+                worker_group_id = min(secondary_groups, key=lambda item: item[1])[0]
+            else:
+                worker_group_id = min(worker_group_task_counts, key=worker_group_task_counts.get)
+
+            await self._shutdown_worker_group(worker_group_id)
+
+    async def _start_worker_group(self):
+        # Select adapter: use primary if under limit, otherwise use secondary
+        adapter: Optional[WorkerAdapterLabel] = None
+        webhook = None
+
+        if self._primary_webhook:
+            primary_count = sum(source == "primary" for source in self._worker_group_source.values())
+            if self._primary_group_limit is None or primary_count < self._primary_group_limit:
+                adapter = "primary"
+                webhook = self._primary_webhook
+            else:
+                logging.debug(f"Primary adapter worker group limit reached ({self._primary_group_limit}).")
+
+        if adapter is None and self._secondary_webhook:
+            adapter = "secondary"
+            webhook = self._secondary_webhook
+
+        if adapter is None:
+            logging.warning("All worker adapters have reached their capacity; cannot start a new worker group.")
+            return
+
+        response, status = await self._make_request(webhook, {"action": "get_worker_adapter_info"})
+        if status != web.HTTPOk.status_code:
+            logging.warning("Failed to get worker adapter info.")
+            return
+
+        if sum(adapter == "secondary" for adapter in self._worker_group_source.values()) >= response.get(
+            "max_worker_groups", math.inf
+        ):
+            return
+
+        response, status = await self._make_request(webhook, {"action": "start_worker_group"})
+        if status == web.HTTPTooManyRequests.status_code:
+            logging.warning(f"{adapter.capitalize()} adapter capacity exceeded, cannot start new worker group.")
+            return
+        if status == web.HTTPInternalServerError.status_code:
+            logging.error(
+                f"{adapter.capitalize()} adapter failed to start worker group:"
+                f" {response.get('error', 'Unknown error')}"
+            )
+            return
+
+        worker_group_id = response["worker_group_id"].encode()
+        self._worker_groups[worker_group_id] = [WorkerID(worker_id.encode()) for worker_id in response["worker_ids"]]
+        self._worker_group_source[worker_group_id] = adapter
+        logging.info(f"Started worker group {worker_group_id.decode()} on {adapter} adapter.")
+
+    async def _shutdown_worker_group(self, worker_group_id: WorkerGroupID):
+        if worker_group_id not in self._worker_groups:
+            logging.error(f"Worker group with ID {worker_group_id.decode()} does not exist.")
+            return
+
+        adapter = self._worker_group_source.get(worker_group_id)
+        if adapter is None:
+            logging.error(f"Worker group {worker_group_id.decode()} has no associated adapter recorded.")
+            return
+
+        webhook = self._primary_webhook if adapter == "primary" else self._secondary_webhook
+        response, status = await self._make_request(
+            webhook, {"action": "shutdown_worker_group", "worker_group_id": worker_group_id.decode()}
+        )
+        if status == web.HTTPNotFound.status_code:
+            logging.error(f"Worker group with ID {worker_group_id.decode()} not found in {adapter} adapter.")
+            return
+        if status == web.HTTPInternalServerError.status_code:
+            logging.error(
+                f"{adapter.capitalize()} adapter failed to shutdown worker group:"
+                f" {response.get('error', 'Unknown error')}"
+            )
+            return
+
+        self._worker_groups.pop(worker_group_id)
+        self._worker_group_source.pop(worker_group_id)
+        logging.info(f"Shutdown worker group {worker_group_id.decode()} on {adapter} adapter.")
+
+    @staticmethod
+    async def _make_request(webhook_url: str, payload):
+        async with aiohttp.ClientSession() as session:
+            async with session.post(webhook_url, json=payload) as response:
+                return await response.json(), response.status
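
The scale-up path above always prefers the primary adapter until it already hosts _primary_group_limit worker groups, then falls back to the secondary adapter. Below is a reduced sketch of that selection order as a pure function; select_adapter is not a function in the package, and the signature is an assumption made for illustration.

from typing import Dict, Literal, Optional

AdapterLabel = Literal["primary", "secondary"]


def select_adapter(
    primary_webhook: Optional[str],
    secondary_webhook: Optional[str],
    group_sources: Dict[bytes, AdapterLabel],
    primary_group_limit: int = 1,
) -> Optional[AdapterLabel]:
    # The primary adapter is preferred while it hosts fewer than `primary_group_limit` groups.
    primary_count = sum(source == "primary" for source in group_sources.values())
    if primary_webhook and primary_count < primary_group_limit:
        return "primary"
    # Otherwise fall back to the secondary (elastic) adapter, if one is configured.
    if secondary_webhook:
        return "secondary"
    # No adapter can take a new worker group.
    return None
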
scaler/scheduler/controllers/scaling_policies/mixins.py
@@ -0,0 +1,10 @@
+import abc
+
+from scaler.protocol.python.message import InformationSnapshot
+from scaler.utility.mixins import Reporter
+
+
+class ScalingController(Reporter):
+    @abc.abstractmethod
+    async def on_snapshot(self, snapshot: InformationSnapshot):
+        raise NotImplementedError()
scaler/scheduler/controllers/scaling_policies/null.py
@@ -0,0 +1,14 @@
+from scaler.protocol.python.message import InformationSnapshot
+from scaler.protocol.python.status import ScalingManagerStatus
+from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
+
+
+class NullScalingController(ScalingController):
+    def __init__(self):
+        pass
+
+    def get_status(self):
+        return ScalingManagerStatus.new_msg(worker_groups={})
+
+    async def on_snapshot(self, information_snapshot: InformationSnapshot):
+        pass
scaler/scheduler/controllers/scaling_policies/utility.py
@@ -0,0 +1,20 @@
+from typing import Tuple
+
+from scaler.scheduler.controllers.scaling_policies.fixed_elastic import FixedElasticScalingController
+from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
+from scaler.scheduler.controllers.scaling_policies.null import NullScalingController
+from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
+from scaler.scheduler.controllers.scaling_policies.vanilla import VanillaScalingController
+
+
+def create_scaling_controller(
+    scaling_controller_strategy: ScalingControllerStrategy, adapter_webhook_urls: Tuple[str, ...]
+) -> ScalingController:
+    if scaling_controller_strategy == ScalingControllerStrategy.NULL:
+        return NullScalingController(*adapter_webhook_urls)
+    elif scaling_controller_strategy == ScalingControllerStrategy.VANILLA:
+        return VanillaScalingController(*adapter_webhook_urls)
+    elif scaling_controller_strategy == ScalingControllerStrategy.FIXED_ELASTIC:
+        return FixedElasticScalingController(*adapter_webhook_urls)
+
+    raise ValueError(f"unsupported scaling controller strategy: {scaling_controller_strategy}")
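
A hedged usage sketch of the factory above: the strategy members come from ScalingControllerStrategy in scaling_policies/types.py, while the webhook URL is a made-up placeholder. VanillaScalingController takes a single adapter webhook URL, so a one-element tuple is unpacked into it.

from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
from scaler.scheduler.controllers.scaling_policies.utility import create_scaling_controller

# Build a vanilla scaling controller pointed at a hypothetical worker adapter webhook.
controller = create_scaling_controller(
    ScalingControllerStrategy.VANILLA, ("http://127.0.0.1:8080/webhook",)
)
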
scaler/scheduler/controllers/scaling_policies/vanilla.py
@@ -0,0 +1,95 @@
+import logging
+import math
+from typing import Dict, List
+
+import aiohttp
+from aiohttp import web
+
+from scaler.protocol.python.message import InformationSnapshot
+from scaler.protocol.python.status import ScalingManagerStatus
+from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
+from scaler.scheduler.controllers.scaling_policies.types import WorkerGroupID
+from scaler.utility.identifiers import WorkerID
+
+
+class VanillaScalingController(ScalingController):
+    def __init__(self, adapter_webhook_url: str):
+        self._adapter_webhook_url = adapter_webhook_url
+        self._lower_task_ratio = 1
+        self._upper_task_ratio = 10
+
+        self._worker_groups: Dict[WorkerGroupID, List[WorkerID]] = {}
+
+    def get_status(self):
+        return ScalingManagerStatus.new_msg(worker_groups=self._worker_groups)
+
+    async def on_snapshot(self, information_snapshot: InformationSnapshot):
+        if not information_snapshot.workers:
+            if information_snapshot.tasks:
+                await self._start_worker_group()
+            return
+
+        task_ratio = len(information_snapshot.tasks) / len(information_snapshot.workers)
+        if task_ratio > self._upper_task_ratio:
+            await self._start_worker_group()
+        elif task_ratio < self._lower_task_ratio:
+            worker_group_task_counts = {
+                worker_group_id: sum(
+                    information_snapshot.workers[worker_id].queued_tasks
+                    for worker_id in worker_ids
+                    if worker_id in information_snapshot.workers
+                )
+                for worker_group_id, worker_ids in self._worker_groups.items()
+            }
+            if not worker_group_task_counts:
+                logging.warning(
+                    "No worker groups available to shut down. There might be statically provisioned workers."
+                )
+                return
+
+            worker_group_id = min(worker_group_task_counts, key=worker_group_task_counts.get)
+            await self._shutdown_worker_group(worker_group_id)
+
+    async def _start_worker_group(self):
+        response, status = await self._make_request({"action": "get_worker_adapter_info"})
+        if status != web.HTTPOk.status_code:
+            logging.warning("Failed to get worker adapter info.")
+            return
+
+        if len(self._worker_groups) >= response.get("max_worker_groups", math.inf):
+            return
+
+        response, status = await self._make_request({"action": "start_worker_group"})
+        if status == web.HTTPTooManyRequests.status_code:
+            logging.warning("Capacity exceeded, cannot start new worker group.")
+            return
+        if status == web.HTTPInternalServerError.status_code:
+            logging.error(f"Failed to start worker group: {response.get('error', 'Unknown error')}")
+            return
+
+        worker_group_id = response["worker_group_id"].encode()
+        self._worker_groups[worker_group_id] = [WorkerID(worker_id.encode()) for worker_id in response["worker_ids"]]
+        logging.info(f"Started worker group: {worker_group_id.decode()}")
+
+    async def _shutdown_worker_group(self, worker_group_id: WorkerGroupID):
+        if worker_group_id not in self._worker_groups:
+            logging.error(f"Worker group with ID {worker_group_id.decode()} does not exist.")
+            return
+
+        response, status = await self._make_request(
+            {"action": "shutdown_worker_group", "worker_group_id": worker_group_id.decode()}
+        )
+        if status == web.HTTPNotFound.status_code:
+            logging.error(f"Worker group with ID {worker_group_id.decode()} not found in adapter.")
+            return
+        if status == web.HTTPInternalServerError.status_code:
+            logging.error(f"Failed to shutdown worker group: {response.get('error', 'Unknown error')}")
+            return
+
+        self._worker_groups.pop(worker_group_id)
+        logging.info(f"Shutdown worker group: {worker_group_id.decode()}")
+
+    async def _make_request(self, payload):
+        async with aiohttp.ClientSession() as session:
+            async with session.post(self._adapter_webhook_url, json=payload) as response:
+                return await response.json(), response.status
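
For reference, the scaling decision above is driven purely by the ratio of queued tasks to workers, with hard-coded thresholds of 1 and 10. The standalone worked example below restates that arithmetic; scaling_decision is illustrative only and not a function in the package.

def scaling_decision(n_tasks: int, n_workers: int, lower: float = 1, upper: float = 10) -> str:
    # No workers yet: start a group only if there is work queued.
    if n_workers == 0:
        return "start_worker_group" if n_tasks else "no_action"
    ratio = n_tasks / n_workers
    if ratio > upper:
        return "start_worker_group"
    if ratio < lower:
        return "shutdown_least_loaded_group"
    return "no_action"


assert scaling_decision(25, 2) == "start_worker_group"          # 12.5 > 10
assert scaling_decision(1, 2) == "shutdown_least_loaded_group"  # 0.5 < 1
assert scaling_decision(10, 2) == "no_action"                   # 5.0 stays within [1, 10]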