opengris-scaler 1.12.28__cp313-cp313-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opengris-scaler might be problematic. Click here for more details.
- opengris_scaler-1.12.28.dist-info/METADATA +728 -0
- opengris_scaler-1.12.28.dist-info/RECORD +187 -0
- opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +210 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +658 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +115 -0
- scaler/cluster/combo.py +150 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/defaults.py +94 -0
- scaler/config/loader.py +96 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +55 -0
- scaler/config/section/ecs_worker_adapter.py +85 -0
- scaler/config/section/native_worker_adapter.py +43 -0
- scaler/config/section/object_storage_server.py +8 -0
- scaler/config/section/scheduler.py +54 -0
- scaler/config/section/symphony_worker_adapter.py +47 -0
- scaler/config/section/top.py +13 -0
- scaler/config/section/webui.py +21 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +62 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +133 -0
- scaler/entry_points/object_storage_server.py +45 -0
- scaler/entry_points/scheduler.py +144 -0
- scaler/entry_points/top.py +286 -0
- scaler/entry_points/webui.py +48 -0
- scaler/entry_points/worker_adapter_ecs.py +191 -0
- scaler/entry_points/worker_adapter_native.py +137 -0
- scaler/entry_points/worker_adapter_symphony.py +98 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +247 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/constants.py +9 -0
- scaler/ui/live_display.py +147 -0
- scaler/ui/memory_window.py +146 -0
- scaler/ui/setting_page.py +40 -0
- scaler/ui/task_graph.py +832 -0
- scaler/ui/task_log.py +107 -0
- scaler/ui/utility.py +66 -0
- scaler/ui/webui.py +147 -0
- scaler/ui/worker_processors.py +104 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +107 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +269 -0
- scaler/worker_adapter/native.py +155 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +139 -0
- src/scaler/io/ymq/_ymq.so +0 -0
- src/scaler/object_storage/object_storage_server.so +0 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
@0xaf44f44ea94a4675;
|
|
2
|
+
|
|
3
|
+
using CommonType = import "common.capnp";
|
|
4
|
+
using Status = import "status.capnp";
|
|
5
|
+
|
|
6
|
+
struct Task {
    # A task message. Identifiers and metadata are carried as raw bytes; arguments and
    # capabilities are carried as lists.
    taskId @0 :Data;
    source @1 :Data;
    metadata @2 :Data;
    funcObjectId @3 :Data;  # presumably the object id of the function to call -- TODO confirm
    functionArgs @4 :List(Argument);
    capabilities @5 :List(CommonType.TaskCapability);  # type comes from common.capnp

    struct Argument {
        # One entry of functionArgs: a type tag plus raw bytes.
        type @0 :ArgumentType;
        data @1 :Data;  # presumably interpreted according to `type` -- TODO confirm

        enum ArgumentType {
            # The two argument kinds declared by this schema.
            task @0;
            objectID @1;
        }
    }
}
|
|
24
|
+
|
|
25
|
+
struct TaskCancel {
    # Cancel message: carries a task id and a TaskCancelFlags struct.

    struct TaskCancelFlags {
        # Flags attached to a cancel; `force` is the only flag declared.
        force @0 :Bool;
    }

    taskId @0 :Data;
    flags @1 :TaskCancelFlags;
}
|
|
33
|
+
|
|
34
|
+
struct TaskLog {
    # Log text associated with a task id, tagged with the stream it belongs to.
    taskId @0 :Data;
    logType @1 :LogType;
    content @2 :Text;

    enum LogType {
        # Stream tags available for logType.
        stdout @0;
        stderr @1;
    }
}
|
|
44
|
+
|
|
45
|
+
struct TaskResult {
    # Result message for a task id: a result type, raw metadata bytes and a list of raw results.
    taskId @0 :Data;
    resultType @1 :CommonType.TaskResultType;  # enum declared in common.capnp
    metadata @2 :Data;
    results @3 :List(Data);
}
|
|
51
|
+
|
|
52
|
+
struct TaskCancelConfirm {
    # Confirmation for a cancel of the given task id.
    taskId @0 :Data;
    cancelConfirmType @1 :CommonType.TaskCancelConfirmType;  # enum declared in common.capnp
}
|
|
56
|
+
|
|
57
|
+
struct GraphTask {
    # A graph submission: its own task id and source, a list of target ids, and the Task
    # entries that make up the graph.
    taskId @0 :Data;
    source @1 :Data;
    targets @2 :List(Data);  # presumably task ids whose results are wanted -- TODO confirm
    graph @3 :List(Task);
}
|
|
63
|
+
|
|
64
|
+
struct ClientHeartbeat {
    # Heartbeat payload carrying a Status.Resource reading and a latency value.
    resource @0 :Status.Resource;
    latencyUS @1 :UInt32;  # presumably microseconds, going by the "US" suffix -- TODO confirm
}
|
|
68
|
+
|
|
69
|
+
struct ClientHeartbeatEcho {
    # Echo payload; its only field is an object storage address (type from common.capnp).
    objectStorageAddress @0 :CommonType.ObjectStorageAddress;
}
|
|
72
|
+
|
|
73
|
+
struct WorkerHeartbeat {
    # Heartbeat payload for a worker: resource readings, queue counters, per-processor
    # statuses and the worker's capabilities.
    agent @0 :Status.Resource;
    rssFree @1 :UInt64;
    queueSize @2 :UInt32;
    queuedTasks @3 :UInt32;
    latencyUS @4 :UInt32;  # presumably microseconds, going by the "US" suffix -- TODO confirm
    taskLock @5 :Bool;
    processors @6 :List(Status.ProcessorStatus);
    capabilities @7 :List(CommonType.TaskCapability);
}
|
|
83
|
+
|
|
84
|
+
struct WorkerHeartbeatEcho {
    # Echo payload; same single field as ClientHeartbeatEcho.
    objectStorageAddress @0 :CommonType.ObjectStorageAddress;
}
|
|
87
|
+
|
|
88
|
+
struct ObjectInstruction {
    # An instruction about objects: the instruction type, a user identifier as raw bytes,
    # and the metadata of the objects concerned.
    instructionType @0 :ObjectInstructionType;
    objectUser @1 :Data;
    objectMetadata @2 :CommonType.ObjectMetadata;  # type comes from common.capnp

    enum ObjectInstructionType {
        # Instruction kinds declared by this schema.
        create @0;
        delete @1;
        clear @2;
    }
}
|
|
99
|
+
|
|
100
|
+
struct DisconnectRequest {
    # Disconnect request; `worker` is raw bytes (presumably a worker id -- TODO confirm).
    worker @0 :Data;
}
|
|
103
|
+
|
|
104
|
+
struct DisconnectResponse {
    # Disconnect response; same single raw-bytes field as DisconnectRequest.
    worker @0 :Data;
}
|
|
107
|
+
|
|
108
|
+
struct ClientDisconnect {
    # Client disconnect message; the only field selects one of two disconnect types.
    disconnectType @0 :DisconnectType;

    enum DisconnectType {
        # Disconnect kinds declared by this schema.
        disconnect @0;
        shutdown @1;
    }
}
|
|
116
|
+
|
|
117
|
+
struct ClientShutdownResponse {
    # Response carrying a single boolean.
    accepted @0 :Bool;
}
|
|
120
|
+
|
|
121
|
+
struct StateClient {
    # Declares no fields; it is still selectable as a variant of the Message union.
}
|
|
123
|
+
|
|
124
|
+
struct StateObject {
    # Declares no fields; it is still selectable as a variant of the Message union.
}
|
|
126
|
+
|
|
127
|
+
struct StateBalanceAdvice {
    # Pairs one worker id with a list of task ids (all raw bytes).
    workerId @0 :Data;
    taskIds @1 :List(Data);
}
|
|
131
|
+
|
|
132
|
+
struct StateScheduler {
    # Aggregated status snapshot: one status struct per component, all declared in
    # status.capnp, plus a free-memory counter.
    binder @0 :Status.BinderStatus;
    scheduler @1 :Status.Resource;
    rssFree @2 :UInt64;
    clientManager @3 :Status.ClientManagerStatus;
    objectManager @4 :Status.ObjectManagerStatus;
    taskManager @5 :Status.TaskManagerStatus;
    workerManager @6 :Status.WorkerManagerStatus;
    scalingManager @7 :Status.ScalingManagerStatus;
}
|
|
142
|
+
|
|
143
|
+
struct StateWorker {
    # State message for one worker: its id, a WorkerState value and its capabilities.
    workerId @0 :Data;
    state@1 :CommonType.WorkerState;  # enum declared in common.capnp
    capabilities @2 :List(CommonType.TaskCapability);
}
|
|
148
|
+
|
|
149
|
+
struct StateTask {
    # State message for one task: id, function name, TaskState value, a worker reference,
    # capabilities and raw metadata.
    taskId @0 :Data;
    functionName @1 :Data;
    state @2 :CommonType.TaskState;  # enum declared in common.capnp
    worker @3 :Data;  # presumably the id of the worker handling the task -- TODO confirm
    capabilities @4 :List(CommonType.TaskCapability);
    metadata @5 :Data;
}
|
|
157
|
+
|
|
158
|
+
struct StateGraphTask {
    # State message for one node of a graph task: the graph's id, the node's task id,
    # its node type and the ids of its parent tasks.

    enum NodeTaskType {
        # Node kinds declared by this schema.
        normal @0;
        target @1;
    }

    graphTaskId @0 :Data;
    taskId @1 :Data;
    nodeTaskType @2 :NodeTaskType;
    parentTaskIds @3 :List(Data);
}
|
|
169
|
+
|
|
170
|
+
struct ProcessorInitialized {
    # Declares no fields; the message carries no payload beyond its variant tag in Message.
}
|
|
172
|
+
|
|
173
|
+
struct InformationRequest {
    # Request carrying an opaque byte payload; this schema does not define its encoding.
    request @0 :Data;
}
|
|
176
|
+
|
|
177
|
+
struct InformationResponse {
    # Response carrying an opaque byte payload; this schema does not define its encoding.
    response @0 :Data;
}
|
|
180
|
+
|
|
181
|
+
struct Message {
    # Top-level envelope: an unnamed union with one variant per message struct declared
    # in this file. Ordinals are part of the wire format and must not be renumbered.
    union {
        # task messages
        task @0 :Task;
        taskCancel @1 :TaskCancel;
        taskCancelConfirm @2 :TaskCancelConfirm;
        taskResult @3 :TaskResult;
        taskLog @4 :TaskLog;

        graphTask @5 :GraphTask;

        objectInstruction @6 :ObjectInstruction;

        # client heartbeat and its echo
        clientHeartbeat @7 :ClientHeartbeat;
        clientHeartbeatEcho @8 :ClientHeartbeatEcho;

        # worker heartbeat and its echo
        workerHeartbeat @9 :WorkerHeartbeat;
        workerHeartbeatEcho @10 :WorkerHeartbeatEcho;

        disconnectRequest @11 :DisconnectRequest;
        disconnectResponse @12 :DisconnectResponse;

        # state messages
        stateClient @13 :StateClient;
        stateObject @14 :StateObject;
        stateBalanceAdvice @15 :StateBalanceAdvice;
        stateScheduler @16 :StateScheduler;
        stateWorker @17 :StateWorker;
        stateTask @18 :StateTask;
        stateGraphTask @19 :StateGraphTask;

        clientDisconnect @20 :ClientDisconnect;
        clientShutdownResponse @21 :ClientShutdownResponse;

        processorInitialized @22 :ProcessorInitialized;

        informationRequest @23 :InformationRequest;
        informationResponse @24 :InformationResponse;
    }
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
@0xc2a14174aa42a12a;
|
|
2
|
+
|
|
3
|
+
using Cxx = import "/capnp/c++.capnp";
|
|
4
|
+
$Cxx.namespace("scaler::protocol");
|
|
5
|
+
|
|
6
|
+
struct ObjectRequestHeader {
    # Header of a request to the object storage server: target object id, payload length,
    # a request id and the request type.
    objectID @0: ObjectID; # 32 bytes
    payloadLength @1: UInt64; # 8 bytes
    requestID @2: UInt64; # 8 bytes
    requestType @3: ObjectRequestType; # 2 bytes

    enum ObjectRequestType {
        # Set an object's content to the message's payload.
        # Overrides the object's content if it already exists.
        # Always immediately answers with a setOK message.
        setObject @0;

        # Get an object's content.
        # If the object does not exist, delays the getOK response until the object is created.
        getObject @1;

        # Remove the object.
        deleteObject @2;

        # Creates the provided object ID by linking it to the content of the object ID provided in the payload.
        # Overrides the object content if the new object ID already exists.
        # If the referenced object does not exist, delays the duplicateOK response until the original object is created.
        duplicateObjectID @3;

        # Request the server to give back internal information; the result is returned as payload.
        # Schema: tuple of three uint64_t (number of ids, number of objects (hashes), total actual object size in bytes).
        infoGetTotal @4;
    }
}
|
|
35
|
+
|
|
36
|
+
struct ObjectID {
    # Object identifier stored as four 64-bit words (4 x 8 = 32 bytes in total).
    field0 @0: UInt64;
    field1 @1: UInt64;
    field2 @2: UInt64;
    field3 @3: UInt64;
}
|
|
42
|
+
|
|
43
|
+
struct ObjectResponseHeader {
    # Header of a response from the object storage server; the fields parallel those of
    # ObjectRequestHeader.
    objectID @0: ObjectID;
    payloadLength @1: UInt64;
    responseID @2: UInt64; # 8 bytes
    responseType @3: ObjectResponseType;

    enum ObjectResponseType {
        # Response kinds; setOK, getOK and duplicateOK are the answers named in the
        # ObjectRequestType comments.
        setOK @0;
        getOK @1;
        delOK @2;
        delNotExists @3;
        duplicateOK @4;
        infoGetTotalOK @5;
    }
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
@0xa4dfa1212ad2d0f0;
|
|
2
|
+
|
|
3
|
+
struct Resource {
    # CPU and memory reading.
    cpu @0 :UInt16; # percentage scaled by 10: 99.2% is represented as the integer 992
    rss @1 :UInt64; # 64-bit because a 32-bit value would be capped at 4GB
}
|
|
7
|
+
|
|
8
|
+
struct ObjectManagerStatus {
    # Status of the object manager: a single object counter.
    numberOfObjects @0 :UInt32;
}
|
|
11
|
+
|
|
12
|
+
struct ClientManagerStatus {
    # Status of the client manager: a list of (client, task count) pairs.
    clientToNumOfTask @0 :List(Pair);

    struct Pair {
        # One entry of clientToNumOfTask.
        client @0 :Data;
        numTask @1 :UInt32;
    }
}
|
|
20
|
+
|
|
21
|
+
struct TaskManagerStatus {
    # Status of the task manager: a list of (state, count) pairs.
    stateToCount @0 :List(Pair);

    struct Pair {
        # One entry of stateToCount.
        state @0 :UInt8;  # presumably a numeric TaskState value -- TODO confirm
        count @1 :UInt32;
    }
}
|
|
29
|
+
|
|
30
|
+
struct ProcessorStatus {
    # Status of one processor: its pid, three boolean flags and a Resource reading.
    pid @0 :UInt32;
    initialized @1 :Bool;
    hasTask @2 :Bool;
    suspended @3 :Bool;
    resource @4 :Resource;
}
|
|
37
|
+
|
|
38
|
+
struct WorkerStatus {
    # Status of one worker: id, resource readings, counters and per-processor statuses.
    # NOTE(review): the meaning of free/sent/queued/lastS/itl is not defined in this schema;
    # confirm against the code that fills them in.
    workerId @0 :Data;
    agent @1 :Resource;
    rssFree @2 :UInt64;
    free @3 :UInt32;
    sent @4 :UInt32;
    queued @5 :UInt32;
    suspended @6: UInt8;
    lagUS @7 :UInt64;  # presumably microseconds, going by the "US" suffix -- TODO confirm
    lastS @8 :UInt8;  # presumably seconds, going by the "S" suffix -- TODO confirm
    itl @9 :Text;
    processorStatuses @10 :List(ProcessorStatus);
}
|
|
51
|
+
|
|
52
|
+
struct WorkerManagerStatus {
    # Status of the worker manager: one WorkerStatus per listed worker.
    workers @0 :List(WorkerStatus);
}
|
|
55
|
+
|
|
56
|
+
struct ScalingManagerStatus {
    # Status of the scaling manager: a list of (worker group id, worker ids) pairs.
    workerGroups @0 :List(Pair);

    struct Pair {
        # One entry of workerGroups.
        workerGroupID @0 :Data;
        workerIDs @1 :List(Data);
    }
}
|
|
64
|
+
|
|
65
|
+
struct BinderStatus {
    # Status of the binder: two lists of (client, number) pairs, one for received and one for sent.
    received @0 :List(Pair);
    sent @1 :List(Pair);

    struct Pair {
        # One entry of received/sent.
        client @0 :Text;
        number @1 :UInt32;  # presumably a message count -- TODO confirm
    }
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Roles
|
|
2
|
+
|
|
3
|
+
The communication protocol includes 3 roles: client, scheduler and worker:
|
|
4
|
+
|
|
5
|
+
- client is upstream of scheduler, scheduler is upstream of worker
|
|
6
|
+
- worker is downstream of scheduler, scheduler is downstream of client
|
|
7
|
+
|
|
8
|
+
```plaintext
|
|
9
|
+
+--------------+
|
|
10
|
+
+-----------------+ TCP | |
|
|
11
|
+
| +------------+ worker |
|
|
12
|
+
+-----------+ | | | |
|
|
13
|
+
| | TCP | | +--------------+
|
|
14
|
+
| client +---------+ |
|
|
15
|
+
| | | | +--------------+
|
|
16
|
+
+-----------+ | | TCP | |
|
|
17
|
+
| scheduler +------------+ worker |
|
|
18
|
+
+-----------+ | (object store) | | |
|
|
19
|
+
| | TCP | | +--------------+
|
|
20
|
+
| client +---------+ |
|
|
21
|
+
| | | | +--------------+
|
|
22
|
+
+-----------+ | | TCP | |
|
|
23
|
+
| +------------+ worker |
|
|
24
|
+
+-----------------+ | |
|
|
25
|
+
+--------------+
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Each client and each worker maintains only 1 TCP connection to the scheduler.
|
|
30
|
+
|
|
31
|
+
# Message format
|
|
32
|
+
|
|
33
|
+
Scaler uses the capnp library to serialize/deserialize messages and uses zmq to communicate between client, scheduler and
|
|
34
|
+
worker
|
|
35
|
+
|
|
36
|
+
# Message Type Category
|
|
37
|
+
|
|
38
|
+
In general, there are 2 categories of message types: object and task.
|
|
39
|
+
|
|
40
|
+
An object normally has an object id associated with the actual object data. Object data is immutable bytes, serialized by
|
|
41
|
+
client/worker, and deserialized by client/worker. The protocol does not define how to serialize it; it's up to the
|
|
42
|
+
client/worker to decide
|
|
43
|
+
|
|
44
|
+
A task is a function call. It has a task id associated with the actual function call, and the function call contains
|
|
45
|
+
a function and a series of arguments. The task message doesn't contain the actual function and arguments; instead it
|
|
46
|
+
contains object ids. Workers must fetch the function/argument data from the scheduler, then deserialize and
|
|
47
|
+
execute the function call.
|
|
48
|
+
|
|
49
|
+
## Object Channel
|
|
50
|
+
|
|
51
|
+
Scheduler is the center of the object storage, client and worker are identical and can push
|
|
52
|
+
|
|
53
|
+
```plaintext
|
|
54
|
+
ObjectInstruction
|
|
55
|
+
ObjectResponse +--------------+
|
|
56
|
+
+------------------>| |
|
|
57
|
+
| | Worker |
|
|
58
|
+
+---------+ +-----------+ | +----------------+ |
|
|
59
|
+
| | ObjectRequest | +----+ | ObjectRequest +--------------+
|
|
60
|
+
| | ObjectInstruction | | |
|
|
61
|
+
| +----------------------->| |<------+
|
|
62
|
+
| Client | | Scheduler |
|
|
63
|
+
| |<-----------------------+ +-------+
|
|
64
|
+
| | ObjectResponse | | | ObjectInstruction
|
|
65
|
+
| | | +<---+ | ObjectResponse +--------------+
|
|
66
|
+
+---------+ +-----------+ | +--------------->| |
|
|
67
|
+
| | Worker |
|
|
68
|
+
+-------------------+ |
|
|
69
|
+
ObjectRequest +--------------+
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
ObjectInstruction = b"OI"
|
|
74
|
+
A client can send an object instruction to the scheduler; the scheduler can send an object instruction to a worker.
|
|
75
|
+
It has 2 subtypes: create b"C" and delete b"D".
|
|
76
|
+
When the subtype is create, it has to include:
|
|
77
|
+
|
|
78
|
+
- list of object id (type bytes)
|
|
79
|
+
- list of object names (type bytes)
|
|
80
|
+
- list of object bytes (type bytes)
|
|
81
|
+
All 3 lists above must contain the same number of items.
|
|
82
|
+
|
|
83
|
+
ObjectRequest = b"OR"
|
|
84
|
+
ObjectResponse = b"OA"
|
|
85
|
+
|
|
86
|
+
## Task Channel
|
|
87
|
+
|
|
88
|
+
```plaintext
|
|
89
|
+
Task
|
|
90
|
+
TaskCancel +--------------+
|
|
91
|
+
+-------------------+ |
|
|
92
|
+
| | Worker |
|
|
93
|
+
+---------+ Task +-----------+ | +----------------+ |
|
|
94
|
+
| | TaskCancel | +----+ | TaskResult +--------------+
|
|
95
|
+
| | GraphTask | | |
|
|
96
|
+
| +------------------------+ |<------+
|
|
97
|
+
| Client | | Scheduler |
|
|
98
|
+
| |<-----------------------+ +-------+
|
|
99
|
+
| | TaskEcho | | | Task
|
|
100
|
+
| | TaskResult | +----+ | TaskCancel +--------------+
|
|
101
|
+
+---------+ +-----------+ | +--------------->| |
|
|
102
|
+
| | Worker |
|
|
103
|
+
+-------------------+ |
|
|
104
|
+
TaskResult +--------------+
|
|
105
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import enum
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
|
|
5
|
+
from scaler.protocol.capnp._python import _common # noqa
|
|
6
|
+
from scaler.protocol.python.mixins import Message
|
|
7
|
+
from scaler.utility.identifiers import ObjectID
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TaskResultType(enum.Enum):
    """Result categories for a submitted task; values are taken from the capnp ``TaskResultType`` enum."""

    Success = _common.TaskResultType.success  # the submitted task finished and a result is available
    Failed = _common.TaskResultType.failed  # the submitted task failed on the worker
    FailedWorkerDied = _common.TaskResultType.failedWorkerDied  # the worker died after the task was submitted
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TaskCancelConfirmType(enum.Enum):
    """Outcomes of a cancel request; values are taken from the capnp ``TaskCancelConfirmType`` enum."""

    Canceled = _common.TaskCancelConfirmType.canceled  # the cancel succeeded
    CancelFailed = _common.TaskCancelConfirmType.cancelFailed  # the cancel failed because the task is running
    CancelNotFound = _common.TaskCancelConfirmType.cancelNotFound  # the task to cancel was not found
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TaskTransition(enum.Enum):
    """Task transition kinds; values are taken from the capnp ``TaskTransition`` enum.

    NOTE(review): presumably these are the inputs of the scheduler's task state machine -- confirm
    against the code that consumes them.
    """

    HasCapacity = _common.TaskTransition.hasCapacity
    # one transition per task result type
    TaskResultSuccess = _common.TaskTransition.taskResultSuccess
    TaskResultFailed = _common.TaskTransition.taskResultFailed
    TaskResultWorkerDied = _common.TaskTransition.taskResultWorkerDied
    # cancel request, then one transition per cancel-confirm type
    TaskCancel = _common.TaskTransition.taskCancel
    TaskCancelConfirmCanceled = _common.TaskTransition.taskCancelConfirmCanceled
    TaskCancelConfirmFailed = _common.TaskTransition.taskCancelConfirmFailed
    TaskCancelConfirmNotFound = _common.TaskTransition.taskCancelConfirmNotFound
    BalanceTaskCancel = _common.TaskTransition.balanceTaskCancel
    WorkerDisconnect = _common.TaskTransition.workerDisconnect
    SchedulerHasTask = _common.TaskTransition.schedulerHasTask
    SchedulerHasNoTask = _common.TaskTransition.schedulerHasNoTask
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TaskState(enum.Enum):
    """States a task can be in; values are taken from the capnp ``TaskState`` enum."""

    Inactive = _common.TaskState.inactive  # task is scheduled but not yet allocated to a worker
    Running = _common.TaskState.running  # task is running on a worker
    Canceling = _common.TaskState.canceling  # task is in the canceling state
    BalanceCanceling = _common.TaskState.balanceCanceling  # task is in the balance-canceling state
    Success = _common.TaskState.success  # task finished properly
    Failed = _common.TaskState.failed  # task finished but an exception happened
    FailedWorkerDied = _common.TaskState.failedWorkerDied  # task failed because the worker died
    Canceled = _common.TaskState.canceled  # task is canceled (a task cancel confirm was received)
    CanceledNotFound = _common.TaskState.canceledNotFound  # task was not found when trying to cancel it
    WorkerDisconnecting = _common.TaskState.workerDisconnecting  # task is lost because the worker is disconnecting
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class WorkerState(enum.Enum):
    """Connection states of a worker; values are taken from the capnp ``WorkerState`` enum."""

    Connected = _common.WorkerState.connected
    Disconnected = _common.WorkerState.disconnected
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclasses.dataclass
class TaskCapability(Message):
    """Wrapper around a capnp ``TaskCapability`` message exposing its ``name`` and ``value``.

    No dataclass fields are declared here; the accessors read everything from ``self._msg``.
    """

    def __init__(self, msg):
        # Message.__init__ is expected to keep ``msg`` as ``self._msg`` (the accessors below read it).
        super().__init__(msg)

    @staticmethod
    def new_msg(name: str, value: int) -> "TaskCapability":
        """Create a capnp ``TaskCapability`` from ``name`` and ``value`` and wrap it."""
        capnp_message = _common.TaskCapability(name=name, value=value)
        return TaskCapability(capnp_message)

    def get_message(self):
        """Return the wrapped capnp message."""
        return self._msg

    @property
    def name(self) -> str:
        """The ``name`` field of the wrapped message."""
        return self._msg.name

    @property
    def value(self) -> int:
        """The ``value`` field of the wrapped message."""
        return self._msg.value
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclasses.dataclass
class ObjectMetadata(Message):
    """Wrapper around a capnp ``ObjectMetadata`` message: object ids, content types and names."""

    class ObjectContentType(enum.Enum):
        """Content type of an object, stored under the enum's string name."""

        # FIXME: Pycapnp cannot assign raw enum values when the enum is declared inside a list,
        # but assigning the enum's string value works.
        # See https://github.com/capnproto/pycapnp/issues/374

        Serializer = "serializer"
        Object = "object"

    def __init__(self, msg):
        super().__init__(msg)

    @staticmethod
    def new_msg(
        object_ids: Tuple[ObjectID, ...],
        object_types: Tuple[ObjectContentType, ...] = tuple(),
        object_names: Tuple[bytes, ...] = tuple(),
    ) -> "ObjectMetadata":
        """Build a capnp ``ObjectMetadata`` from the given tuples and wrap it.

        Object ids are converted with ``bytes()``, and content types are passed as their string values.
        """
        raw_ids = list(map(bytes, object_ids))
        type_names = [content_type.value for content_type in object_types]
        names = list(object_names)

        capnp_message = _common.ObjectMetadata(objectIds=raw_ids, objectTypes=type_names, objectNames=names)
        return ObjectMetadata(capnp_message)

    def get_message(self):
        """Return the wrapped capnp message."""
        return self._msg

    @property
    def object_ids(self) -> Tuple[ObjectID, ...]:
        """The message's ``objectIds``, each wrapped in ``ObjectID``."""
        return tuple(map(ObjectID, self._msg.objectIds))

    @property
    def object_types(self) -> Tuple[ObjectContentType, ...]:
        """The message's ``objectTypes``, converted to ``ObjectContentType`` via their string names."""
        content_type_cls = ObjectMetadata.ObjectContentType
        return tuple(content_type_cls(raw_type._as_str()) for raw_type in self._msg.objectTypes)

    @property
    def object_names(self) -> Tuple[bytes, ...]:
        """The message's ``objectNames`` as a tuple."""
        return tuple(self._msg.objectNames)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclasses.dataclass
class ObjectStorageAddress(Message):
    """Wrapper around a capnp ``ObjectStorageAddress`` message holding a host and a port."""

    def __init__(self, msg):
        super().__init__(msg)

    def __repr__(self) -> str:
        # Rendered as a tcp:// URL built from the host and port accessors.
        return f"tcp://{self.host}:{self.port}"

    @staticmethod
    def new_msg(host: str, port: int) -> "ObjectStorageAddress":
        """Create a capnp ``ObjectStorageAddress`` from ``host`` and ``port`` and wrap it."""
        capnp_message = _common.ObjectStorageAddress(host=host, port=port)
        return ObjectStorageAddress(capnp_message)

    def get_message(self):
        """Return the wrapped capnp message."""
        return self._msg

    @property
    def host(self) -> str:
        """The ``host`` field of the wrapped message."""
        return self._msg.host

    @property
    def port(self) -> int:
        """The ``port`` field of the wrapped message."""
        return self._msg.port
|