opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/protocol/python/object_storage.py

@@ -0,0 +1,118 @@

import dataclasses
import enum
import struct

from scaler.protocol.capnp._python import _object_storage  # noqa
from scaler.protocol.python.mixins import Message
from scaler.utility.identifiers import ObjectID

OBJECT_ID_FORMAT = "!QQQQ"


@dataclasses.dataclass
class ObjectRequestHeader(Message):
    class ObjectRequestType(enum.Enum):
        SetObject = _object_storage.ObjectRequestHeader.ObjectRequestType.setObject
        GetObject = _object_storage.ObjectRequestHeader.ObjectRequestType.getObject
        DeleteObject = _object_storage.ObjectRequestHeader.ObjectRequestType.deleteObject
        DuplicateObjectID = _object_storage.ObjectRequestHeader.ObjectRequestType.duplicateObjectID

    def __init__(self, msg):
        super().__init__(msg)

    @property
    def object_id(self) -> ObjectID:
        return from_capnp_object_id(self._msg.objectID)

    @property
    def payload_length(self) -> int:
        return self._msg.payloadLength

    @property
    def request_id(self) -> int:
        return self._msg.requestID

    @property
    def request_type(self) -> ObjectRequestType:
        return ObjectRequestHeader.ObjectRequestType(self._msg.requestType.raw)

    @staticmethod
    def new_msg(
        object_id: ObjectID, payload_length: int, request_id: int, request_type: ObjectRequestType
    ) -> "ObjectRequestHeader":
        return ObjectRequestHeader(
            _object_storage.ObjectRequestHeader(
                objectID=to_capnp_object_id(object_id),
                payloadLength=payload_length,
                requestID=request_id,
                requestType=request_type.value,
            )
        )

    def get_message(self):
        return self._msg


@dataclasses.dataclass
class ObjectResponseHeader(Message):
    MESSAGE_LENGTH = 80  # there does not seem to be a way to statically know the size of a pycapnp message

    class ObjectResponseType(enum.Enum):
        SetOK = _object_storage.ObjectResponseHeader.ObjectResponseType.setOK
        GetOK = _object_storage.ObjectResponseHeader.ObjectResponseType.getOK
        DelOK = _object_storage.ObjectResponseHeader.ObjectResponseType.delOK
        DelNotExists = _object_storage.ObjectResponseHeader.ObjectResponseType.delNotExists
        DuplicateOK = _object_storage.ObjectResponseHeader.ObjectResponseType.duplicateOK

    def __init__(self, msg):
        super().__init__(msg)

    @property
    def object_id(self) -> ObjectID:
        return from_capnp_object_id(self._msg.objectID)

    @property
    def payload_length(self) -> int:
        return self._msg.payloadLength

    @property
    def response_id(self) -> int:
        return self._msg.responseID

    @property
    def response_type(self) -> ObjectResponseType:
        return ObjectResponseHeader.ObjectResponseType(self._msg.responseType.raw)

    @staticmethod
    def new_msg(
        object_id: ObjectID, payload_length: int, response_id: int, response_type: ObjectResponseType
    ) -> "ObjectResponseHeader":
        return ObjectResponseHeader(
            _object_storage.ObjectResponseHeader(
                objectID=to_capnp_object_id(object_id),
                payloadLength=payload_length,
                responseID=response_id,
                responseType=response_type.value,
            )
        )

    def get_message(self):
        return self._msg


def to_capnp_object_id(object_id: ObjectID) -> _object_storage.ObjectID:
    field0, field1, field2, field3 = struct.unpack(OBJECT_ID_FORMAT, object_id)

    return _object_storage.ObjectID(field0=field0, field1=field1, field2=field2, field3=field3)


def from_capnp_object_id(capnp_object_id: _object_storage.ObjectID) -> ObjectID:
    return ObjectID(
        struct.pack(
            OBJECT_ID_FORMAT,
            capnp_object_id.field0,
            capnp_object_id.field1,
            capnp_object_id.field2,
            capnp_object_id.field3,
        )
    )
scaler/protocol/python/status.py

@@ -0,0 +1,279 @@

from typing import Dict, List

from scaler.protocol.capnp._python import _status  # noqa
from scaler.protocol.python.common import TaskState
from scaler.protocol.python.mixins import Message
from scaler.utility.identifiers import ClientID, WorkerID

CPU_MAXIMUM = 1000
WorkerGroupID = bytes


class Resource(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def cpu(self) -> int:
        return self._msg.cpu

    @property
    def rss(self) -> int:
        return self._msg.rss

    @staticmethod
    def new_msg(cpu: int, rss: int) -> "Resource":  # type: ignore[override]
        return Resource(_status.Resource(cpu=min(cpu, CPU_MAXIMUM), rss=rss))

    def get_message(self):
        return self._msg


class ObjectManagerStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def number_of_objects(self) -> int:
        return self._msg.numberOfObjects

    @staticmethod
    def new_msg(number_of_objects: int) -> "ObjectManagerStatus":  # type: ignore[override]
        return ObjectManagerStatus(_status.ObjectManagerStatus(numberOfObjects=number_of_objects))

    def get_message(self):
        return self._msg


class ClientManagerStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def client_to_num_of_tasks(self) -> Dict[ClientID, int]:
        return {p.client: p.numTask for p in self._msg.clientToNumOfTask}

    @staticmethod
    def new_msg(client_to_num_of_tasks: Dict[ClientID, int]) -> "ClientManagerStatus":  # type: ignore[override]
        return ClientManagerStatus(
            _status.ClientManagerStatus(
                clientToNumOfTask=[
                    _status.ClientManagerStatus.Pair(client=client_id.decode(), numTask=num_tasks)
                    for client_id, num_tasks in client_to_num_of_tasks.items()
                ]
            )
        )

    def get_message(self):
        return self._msg


class TaskManagerStatus(Message):
    VALUE_SIZE_LIMIT = 2**32

    def __init__(self, msg):
        super().__init__(msg)

    @property
    def state_to_count(self) -> Dict[TaskState, int]:
        return {TaskState(p.state): p.count for p in self._msg.stateToCount}

    @staticmethod
    def new_msg(state_to_count: Dict[TaskState, int]) -> "TaskManagerStatus":  # type: ignore[override]
        return TaskManagerStatus(
            _status.TaskManagerStatus(
                stateToCount=[
                    _status.TaskManagerStatus.Pair(state=p[0].value, count=p[1] % TaskManagerStatus.VALUE_SIZE_LIMIT)
                    for p in state_to_count.items()
                ]
            )
        )

    def get_message(self):
        return self._msg


class ProcessorStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def pid(self) -> int:
        return self._msg.pid

    @property
    def initialized(self) -> int:
        return self._msg.initialized

    @property
    def has_task(self) -> bool:
        return self._msg.hasTask

    @property
    def suspended(self) -> bool:
        return self._msg.suspended

    @property
    def resource(self) -> Resource:
        return Resource(self._msg.resource)

    @staticmethod
    def new_msg(
        pid: int, initialized: int, has_task: bool, suspended: bool, resource: Resource  # type: ignore[override]
    ) -> "ProcessorStatus":
        return ProcessorStatus(
            _status.ProcessorStatus(
                pid=pid, initialized=initialized, hasTask=has_task, suspended=suspended, resource=resource.get_message()
            )
        )

    def get_message(self):
        return self._msg


class WorkerStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def worker_id(self) -> WorkerID:
        return WorkerID(self._msg.workerId)

    @property
    def agent(self) -> Resource:
        return Resource(self._msg.agent)

    @property
    def rss_free(self) -> int:
        return self._msg.rssFree

    @property
    def free(self) -> int:
        return self._msg.free

    @property
    def sent(self) -> int:
        return self._msg.sent

    @property
    def queued(self) -> int:
        return self._msg.queued

    @property
    def suspended(self) -> bool:
        return self._msg.suspended

    @property
    def lag_us(self) -> int:
        return self._msg.lagUS

    @property
    def last_s(self) -> int:
        return self._msg.lastS

    @property
    def itl(self) -> str:
        return self._msg.itl

    @property
    def processor_statuses(self) -> List[ProcessorStatus]:
        return [ProcessorStatus(ps) for ps in self._msg.processorStatuses]

    @staticmethod
    def new_msg(  # type: ignore[override]
        worker_id: WorkerID,
        agent: Resource,
        rss_free: int,
        free: int,
        sent: int,
        queued: int,
        suspended: int,
        lag_us: int,
        last_s: int,
        itl: str,
        processor_statuses: List[ProcessorStatus],
    ) -> "WorkerStatus":
        return WorkerStatus(
            _status.WorkerStatus(
                workerId=bytes(worker_id),
                agent=agent.get_message(),
                rssFree=rss_free,
                free=free,
                sent=sent,
                queued=queued,
                suspended=suspended,
                lagUS=lag_us,
                lastS=last_s,
                itl=itl,
                processorStatuses=[ps.get_message() for ps in processor_statuses],
            )
        )

    def get_message(self):
        return self._msg


class WorkerManagerStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def workers(self) -> List[WorkerStatus]:
        return [WorkerStatus(ws) for ws in self._msg.workers]

    @staticmethod
    def new_msg(workers: List[WorkerStatus]) -> "WorkerManagerStatus":  # type: ignore[override]
        return WorkerManagerStatus(_status.WorkerManagerStatus(workers=[ws.get_message() for ws in workers]))

    def get_message(self):
        return self._msg


class ScalingManagerStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def worker_groups(self) -> Dict[WorkerGroupID, List[WorkerID]]:
        return {wg.workerGroupID: [WorkerID(wid) for wid in wg.workerIDs] for wg in self._msg.workerGroups}

    @staticmethod
    def new_msg(worker_groups: Dict[WorkerGroupID, List[WorkerID]]) -> "ScalingManagerStatus":  # type: ignore[override]
        return ScalingManagerStatus(
            _status.ScalingManagerStatus(
                workerGroups=[
                    _status.ScalingManagerStatus.Pair(
                        workerGroupID=worker_group_id, workerIDs=[bytes(worker_id) for worker_id in worker_ids]
                    )
                    for worker_group_id, worker_ids in worker_groups.items()
                ]
            )
        )

    def get_message(self):
        return self._msg


class BinderStatus(Message):
    def __init__(self, msg):
        super().__init__(msg)

    @property
    def received(self) -> Dict[str, int]:
        return {p.client: p.number for p in self._msg.received}

    @property
    def sent(self) -> Dict[str, int]:
        return {p.client: p.number for p in self._msg.sent}

    @staticmethod
    def new_msg(received: Dict[str, int], sent: Dict[str, int]) -> "BinderStatus":  # type: ignore[override]
        return BinderStatus(
            _status.BinderStatus(
                received=[_status.BinderStatus.Pair(client=p[0], number=p[1]) for p in received.items()],
                sent=[_status.BinderStatus.Pair(client=p[0], number=p[1]) for p in sent.items()],
            )
        )

    def get_message(self):
        return self._msg
scaler/protocol/worker.md

@@ -0,0 +1,228 @@

# Custom Worker Implementation

## Overview

The existing Python worker implementation, which works with the scheduler and the Scaler protocol, lives under `scaler.worker`.

- workers connect to a single TCP port on the scheduler using zmq
- each worker has a fixed-length task queue to store incoming tasks
- a `Task` or `TaskResult` message contains object IDs instead of actual data; once a worker receives a task, it should ask the scheduler for the actual data by sending an `ObjectRequest` message
- once a worker finishes a task, it should send a `TaskResult` message back to the scheduler; the `TaskResult` message should contain the object ID of the result, and the worker should send an `ObjectInstruction` message to the scheduler to store the result
- each worker has a unique ID, set through zmq (see the setup below), by which the scheduler identifies it
- the worker is responsible for sending heartbeats to the scheduler; if the scheduler does not receive a heartbeat from a worker for a period of time, it considers the worker dead and reallocates all of its tasks to other workers. The heartbeat message also carries the worker's resource usage and its queue capacity and usage
- each `Task` is tied to a source, and each source has a dedicated serializer; the source is used to choose the proper serializer for deserializing the function and arguments. The worker needs to retrieve the serializer bytes with an `ObjectRequest` message and deserialize them using cloudpickle; the deserialized serializer implements the interface `scaler.client.serializer.mixins.Serializer`
- when a worker receives a `TaskCancel` message, it should cancel the task with the given task ID regardless of the task's status and send a `TaskResult` back to the scheduler

- a worker can ask the scheduler to balance tasks; the scheduler will reply with the tasks the worker should give up (this will be replaced by `TaskCancel` messages in the future)

## Setup

The zmq `IDENTITY` must be unique. The zmq `SNDHWM` and `RCVHWM` should both be set to 0 to prevent messages from being dropped unexpectedly, and the socket should be of the DEALER type.
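
A minimal connection sketch, assuming a placeholder scheduler address (`tcp://127.0.0.1:2345`) and a made-up worker identity:

```python
import uuid

import zmq

# Placeholder address and identity; real deployments use their own scheduler
# address and a unique per-worker identity.
SCHEDULER_ADDRESS = "tcp://127.0.0.1:2345"
worker_identity = f"worker-{uuid.uuid4().hex}".encode()

context = zmq.Context()
socket = context.socket(zmq.DEALER)               # DEALER socket type, as required above
socket.setsockopt(zmq.IDENTITY, worker_identity)  # must be unique per worker
socket.setsockopt(zmq.SNDHWM, 0)                  # never drop outgoing messages
socket.setsockopt(zmq.RCVHWM, 0)                  # never drop incoming messages
socket.connect(SCHEDULER_ADDRESS)
```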

All messages are lists of bytes represented as frames: the first frame is the message type, and the remaining frames are the message data. One message is the whole list of frames.

To send a single message using pyzmq, use the `send_multipart` method:

```python
# Synchronous API
socket.send_multipart(frames)

# Asynchronous API
await socket.send_multipart(frames)
```

Each message may contain multiple zmq frames. To read a single message using pyzmq, use the `recv_multipart` method:

```python
# Synchronous API
frames = socket.recv_multipart()

# Asynchronous API
frames = await socket.recv_multipart()
```

For all messages below, see `scaler/protocol/python/message.py` for the actual message structures.

## Recv messages

### Task `TK`

| message_type | task_id | source | metadata | func_object_id | arg 1 type | arg 1 data | (...) | arg N type | arg N data |
|:------------:|:-------:|:-------:|:--------:|:--------------:|:----------:|:----------:|:-----:|:----------:|:----------:|
| b"TK" | X bytes | X bytes | X bytes | X bytes | b"R" | X bytes | | b"R" | X bytes |

* task_id: Task ID
* source: Source ID. This is used to choose the proper serializer for deserializing the function and arguments. The serializer object ID is the md5 hash of `source + b"serializer"`, and the serializer object bytes must first be deserialized using cloudpickle. The source is essentially the client ID, indicating which client the task belongs to; see `generate_serializer_object_id` in `scaler.utility.object_utility`
* metadata: The metadata of the task, can be empty bytes like `b""`
* func_object_id: Function object ID
* arg type: Must be type `b"R"` for ObjectID
* arg data: Object ID

After executing the `Task`, the worker should send a `TaskResult` message with the final task result.
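
A hedged sketch of unpacking a `Task` message under the frame layout above; the one-field-per-frame reading, the `fetch_objects` helper, and the assumption that the `Serializer` interface exposes a `deserialize` method are illustrative rather than taken from the package:

```python
import hashlib

import cloudpickle


def handle_task(frames, fetch_objects):
    # Frame layout assumed from the table above: type, task_id, source,
    # metadata, func_object_id, then alternating (arg type, arg data) pairs.
    assert frames[0] == b"TK"
    task_id, source, metadata, func_object_id = frames[1:5]
    arg_object_ids = frames[6::2]  # skip the b"R" type markers

    # The serializer object ID is the md5 hash of source + b"serializer".
    serializer_object_id = hashlib.md5(source + b"serializer").digest()

    # fetch_objects is a hypothetical helper that sends an ObjectRequest and
    # returns {object_id: bytes} from the matching ObjectResponse.
    objects = fetch_objects([serializer_object_id, func_object_id, *arg_object_ids])

    serializer = cloudpickle.loads(objects[serializer_object_id])
    func = serializer.deserialize(objects[func_object_id])    # assumes a deserialize() method
    args = [serializer.deserialize(objects[oid]) for oid in arg_object_ids]
    return task_id, func, args
```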

### TaskCancel `TC`

| message_type | task_id |
|:------------:|:-------:|
| b"TC" | X bytes |

* task_id: Task ID to cancel

When a `TaskCancel` message is received, the worker should cancel the task with the given task ID and send a `TaskResult`.

### ObjectInstruction `OI`

| message_type | source | type | num_object_ids | num_object_names | num_object_bytes | object 1 id | (...) | object N id |
|:------------:|:-------:|:----:|:--------------:|:----------------:|:----------------:|:-----------:|:-----:|:-----------:|
| b"OI" | X bytes | b"D" | unsigned int | unsigned int | unsigned int | X bytes | | X bytes |

* source: Source ID
* type: Must be `b"D"` for Delete
* num_object_ids: Number of object IDs
* num_object_names: Number of object names, value must be zero
* num_object_bytes: Number of object bytes, value must be zero
* object id: Object ID

When a Delete `ObjectInstruction` message is received, the worker should delete the objects with the given object IDs on the worker side.

### ObjectResponse `OA`

| message_type | type | num_object_ids | num_object_names | num_object_bytes | object 1 id | (...) | object N id | object 1 name | (...) | object N name | object 1 bytes | (...) | object N bytes |
|:------------:|:------:|:--------------:|:----------------:|:----------------:|:-----------:|:-----:|:-----------:|:-------------:|:-----:|:-------------:|:--------------:|:-----:|:--------------:|
| b"OA" | 1 byte | unsigned int | unsigned int | unsigned int | X bytes | | X bytes | X bytes | | X bytes | X bytes | | X bytes |

* type: `b"C"` for object found, `b"N"` for object not found
* num_object_ids: Number of object IDs
* num_object_names: Number of object names
* num_object_bytes: Number of object bytes
* object id: Object ID
* object name: Object name
* object bytes: Object bytes
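
One plausible way to turn an `ObjectResponse` into a lookup table; the network-byte-order decoding of the count fields and the one-field-per-frame reading are assumptions, so check the real codec before relying on this:

```python
import struct


def parse_object_response(frames):
    assert frames[0] == b"OA"
    found = frames[1] == b"C"
    num_ids = struct.unpack("!I", frames[2])[0]     # byte order is a guess
    num_names = struct.unpack("!I", frames[3])[0]
    num_payloads = struct.unpack("!I", frames[4])[0]

    body = frames[5:]
    object_ids = body[:num_ids]
    object_names = body[num_ids:num_ids + num_names]
    object_payloads = body[num_ids + num_names:num_ids + num_names + num_payloads]

    # Map each object ID to its bytes, mirroring what a worker-side cache needs.
    return found, dict(zip(object_ids, object_payloads)), object_names
```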

### BalanceRequest `BQ` (will be replaced by TaskCancel in the future, low priority to implement this)

| message_type | num_tasks |
|:------------:|:------------:|
| b"BQ" | unsigned int |

* num_tasks: Number of tasks to give up

When a `BalanceRequest` message is received, the worker should send a `BalanceResponse` message with num_tasks task IDs.
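
A small sketch of answering a `BalanceRequest` with a `BalanceResponse` (format under Send messages below); the pending-task bookkeeping and the count's byte order are assumptions:

```python
import struct


def handle_balance_request(frames, pending_task_ids, socket):
    # pending_task_ids: task IDs queued but not yet started (assumed bookkeeping).
    assert frames[0] == b"BQ"
    num_tasks = struct.unpack("!I", frames[1])[0]  # byte order is a guess

    given_up = [pending_task_ids.pop() for _ in range(min(num_tasks, len(pending_task_ids)))]
    socket.send_multipart([b"BR", *given_up])
    return given_up
```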

### WorkerHeartbeatEcho `HE` (optional)

| message_type | empty |
|:------------:|:-------:|
| b"HE" | 0 bytes |

A `WorkerHeartbeatEcho` message indicates that the scheduler has received the worker's `WorkerHeartbeat` message.

### ClientDisconnect `CS` (optional)

| message_type | type |
|:------------:|:----:|
| b"CS" | b"S" |

* type: Must be type `b"S"` for Shutdown.

When a Shutdown `ClientDisconnect` message is received, the worker should shut down.

## Send messages

### TaskResult `TR`

| message_type | task_id | status | result | metadata |
|:------------:|:-------:|:------:|:-------:|:--------:|
| b"TR" | X bytes | 1 byte | X bytes | X bytes |

* task_id: Task ID
* status: `b"S"` for Success, `b"F"` for Failed, `b"C"` for Canceled, `b"K"` for WorkerDied, `b"W"` for NoWorker, `b"I"` for Inactive, `b"R"` for Running, and `b"X"` for Canceling
* result: Task result object ID
* metadata: Task metadata

The worker must submit a Create `ObjectInstruction` message containing the task result object BEFORE returning a `TaskResult` that references its object ID.
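
A sketch of that required ordering, Create `ObjectInstruction` first and `TaskResult` second; the Create layout is described in the next section, and the count encoding and object name used here are assumptions:

```python
import struct
import uuid


def send_task_result(socket, source, task_id, result_bytes):
    # Store the result first: Create ObjectInstruction (layout in the next section).
    result_object_id = uuid.uuid4().bytes
    socket.send_multipart([
        b"OI",
        source,                 # same source as the originating Task
        b"C",
        struct.pack("!I", 1),   # num_object_ids -- byte order is a guess
        struct.pack("!I", 1),   # num_object_names
        struct.pack("!I", 1),   # num_object_bytes
        result_object_id,
        b"result-" + task_id,   # object name; any label works for this sketch
        result_bytes,
    ])

    # Only then report success, referencing the stored object ID.
    socket.send_multipart([b"TR", task_id, b"S", result_object_id, b""])
```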

### ObjectInstruction `OI`

| message_type | source | type | num_object_ids | num_object_names | num_object_bytes | object 1 id | (...) | object N id | object 1 name | (...) | object N name | object 1 bytes | (...) | object N bytes |
|:------------:|:-------:|:----:|:--------------:|:----------------:|:----------------:|:-----------:|:-----:|:-----------:|:-------------:|:-----:|:-------------:|:--------------:|:-----:|:--------------:|
| b"OI" | X bytes | b"C" | unsigned int | unsigned int | unsigned int | X bytes | | X bytes | X bytes | | X bytes | X bytes | | X bytes |

* source: Source ID (this should be the same as the source in the corresponding `Task` message)
* type: Must be `b"C"` for Create
* num_object_ids: Number of object IDs
* num_object_names: Number of object names
* num_object_bytes: Number of object bytes
* object id: Object ID (please use `uuid.uuid4().bytes` for the object ID)
* object name: Object name
* object bytes: Object bytes

### ObjectRequest `OR`

| message_type | type | object 1 id | (...) | object N id |
|:------------:|:----:|:-----------:|:-----:|:-----------:|
| b"OR" | b"A" | X bytes | | X bytes |

* type: Must be `b"A"` for Get
* object id: Object ID

When a `Task` message is received, the functions and arguments are just object IDs; the worker needs to get the object content by sending an `ObjectRequest` message to the scheduler. The scheduler replies with an `ObjectResponse` message, after which the worker can deserialize the object content and start executing the task.

### BalanceResponse `BR`

| message_type | task 1 id | (...) | task N id |
|:------------:|:---------:|:-----:|:---------:|
| b"BR" | X bytes | | X bytes |

* task id: Task ID

### WorkerHeartbeat `HB`

| message_type | agent_cpu | agent_rss | worker_cpu | worker_rss | rss_free | queued_tasks | latency_us | initialized | has_task | task_lock |
|:------------:|:--------------:|:------------------:|:--------------:|:------------------:|:------------------:|:--------------:|:------------:|:-----------:|:--------:|:---------:|
| b"HB" | unsigned short | unsigned long long | unsigned short | unsigned long long | unsigned long long | unsigned short | unsigned int | _Bool | _Bool | _Bool |

* agent_cpu: Agent CPU usage
* agent_rss: Agent resident set size in bytes
* worker_cpu: Worker CPU usage
* worker_rss: Worker resident set size in bytes
* rss_free: Free memory in bytes
* queued_tasks: Number of queued tasks
* latency_us: Latency in microseconds
* initialized: Worker initialized
* has_task: Worker has task
* task_lock: Worker task lock

The worker must send a `WorkerHeartbeat` message every heartbeat interval (at least 1 second), or else the scheduler will consider the worker dead.
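
A speculative packing of the heartbeat fields, assuming they are struct-packed into a single frame in the order shown in the table, in network byte order; consult `scaler/protocol/python/message.py` for the real encoding:

```python
import struct

import psutil  # used here only to fill in plausible resource numbers


def send_heartbeat(socket, queued_tasks, latency_us, initialized, has_task, task_lock):
    # Field order/packing mirrors the table above; both are guesses.
    agent = psutil.Process()
    payload = struct.pack(
        "!HQHQQHI???",
        int(agent.cpu_percent()),           # agent_cpu
        agent.memory_info().rss,            # agent_rss
        0,                                  # worker_cpu (worker process not modeled in this sketch)
        0,                                  # worker_rss
        psutil.virtual_memory().available,  # rss_free
        queued_tasks,
        latency_us,
        initialized,
        has_task,
        task_lock,
    )
    socket.send_multipart([b"HB", payload])
```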

### DisconnectRequest `DR` (optional)

| message_type | worker |
|:------------:|:-------:|
| b"DR" | X bytes |

* worker: Worker ID

When a `DisconnectRequest` message is sent, the worker should disconnect from the scheduler.
File without changes

File without changes
scaler/scheduler/allocate_policy/allocate_policy.py

@@ -0,0 +1,9 @@

import enum

from scaler.scheduler.allocate_policy.capability_allocate_policy import CapabilityAllocatePolicy
from scaler.scheduler.allocate_policy.even_load_allocate_policy import EvenLoadAllocatePolicy


class AllocatePolicy(enum.Enum):
    even = EvenLoadAllocatePolicy
    capability = CapabilityAllocatePolicy