opengris-scaler 1.12.28__cp313-cp313-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opengris-scaler might be problematic.
- opengris_scaler-1.12.28.dist-info/METADATA +728 -0
- opengris_scaler-1.12.28.dist-info/RECORD +187 -0
- opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +210 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +658 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +115 -0
- scaler/cluster/combo.py +150 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/defaults.py +94 -0
- scaler/config/loader.py +96 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +55 -0
- scaler/config/section/ecs_worker_adapter.py +85 -0
- scaler/config/section/native_worker_adapter.py +43 -0
- scaler/config/section/object_storage_server.py +8 -0
- scaler/config/section/scheduler.py +54 -0
- scaler/config/section/symphony_worker_adapter.py +47 -0
- scaler/config/section/top.py +13 -0
- scaler/config/section/webui.py +21 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +62 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +133 -0
- scaler/entry_points/object_storage_server.py +45 -0
- scaler/entry_points/scheduler.py +144 -0
- scaler/entry_points/top.py +286 -0
- scaler/entry_points/webui.py +48 -0
- scaler/entry_points/worker_adapter_ecs.py +191 -0
- scaler/entry_points/worker_adapter_native.py +137 -0
- scaler/entry_points/worker_adapter_symphony.py +98 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +247 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/constants.py +9 -0
- scaler/ui/live_display.py +147 -0
- scaler/ui/memory_window.py +146 -0
- scaler/ui/setting_page.py +40 -0
- scaler/ui/task_graph.py +832 -0
- scaler/ui/task_log.py +107 -0
- scaler/ui/utility.py +66 -0
- scaler/ui/webui.py +147 -0
- scaler/ui/worker_processors.py +104 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +107 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +269 -0
- scaler/worker_adapter/native.py +155 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +139 -0
- src/scaler/io/ymq/_ymq.so +0 -0
- src/scaler/object_storage/object_storage_server.so +0 -0
scaler/client/future.py
ADDED
@@ -0,0 +1,252 @@
+import concurrent.futures
+from typing import Any, Callable, Optional
+
+from scaler.client.serializer.mixins import Serializer
+from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
+from scaler.protocol.python.common import TaskState
+from scaler.protocol.python.message import Task, TaskCancel
+from scaler.utility.event_list import EventList
+from scaler.utility.identifiers import ObjectID, TaskID
+from scaler.utility.metadata.profile_result import ProfileResult
+from scaler.utility.serialization import deserialize_failure
+
+
+class ScalerFuture(concurrent.futures.Future):
+    """
+    A drop-in replacement for Python's `concurrent.futures.Future`.
+
+    This class is designed to be compatible with Python's Future API, but with some key differences:
+
+    - Delayed futures (`is_delayed` set to `True`) might not fetch the result data when the future is done.
+      Instead, the result is lazily fetched when `result()` or `exception()` is called, or when a callback or
+      waiter is added. That is, `result()` might temporarily block even if `done()` is `True`.
+
+    - `cancel()` may block until a cancellation confirmation is received from Scaler's scheduler.
+    """
+
+    def __init__(
+        self,
+        task: Task,
+        is_delayed: bool,
+        group_task_id: Optional[TaskID],
+        serializer: Serializer,
+        connector_agent: SyncConnector,
+        connector_storage: SyncObjectStorageConnector,
+    ):
+        super().__init__()
+
+        self._waiters = EventList(self._waiters)  # type: ignore[assignment]
+        self._waiters.add_update_callback(self._on_waiters_updated)  # type: ignore[attr-defined]
+
+        self._task_id: TaskID = task.task_id
+        self._is_delayed: bool = is_delayed
+        self._group_task_id: Optional[TaskID] = group_task_id
+        self._serializer: Serializer = serializer
+        self._connector_agent: SyncConnector = connector_agent
+        self._connector_storage: SyncObjectStorageConnector = connector_storage
+
+        self._result_object_id: Optional[ObjectID] = None
+        self._result_received = False
+        self._task_state: Optional[TaskState] = None
+        self._cancel_requested: bool = False
+
+        self._profiling_info: Optional[ProfileResult] = None
+
+    @property
+    def task_id(self) -> TaskID:
+        return self._task_id
+
+    def profiling_info(self) -> ProfileResult:
+        with self._condition:  # type: ignore[attr-defined]
+            if self._profiling_info is None:
+                raise ValueError(f"didn't receive profiling info for {self} yet")
+
+            return self._profiling_info
+
+    def set_result_ready(
+        self, object_id: Optional[ObjectID], task_state: TaskState, profile_result: Optional[ProfileResult] = None
+    ) -> None:
+        with self._condition:  # type: ignore[attr-defined]
+            if self.done():
+                raise concurrent.futures.InvalidStateError(f"invalid future state: {self._state}")
+
+            self._state = "FINISHED"
+
+            self._result_object_id = object_id
+
+            self._task_state = task_state
+
+            if profile_result is not None:
+                self._profiling_info = profile_result
+
+            # if it's not a delayed future, or if there is any listener (waiter or callback), get the result immediately
+            if not self._is_delayed or self._has_result_listeners():
+                self._get_result_object()
+
+            self._condition.notify_all()  # type: ignore[attr-defined]
+
+    def set_canceled(self):
+        with self._condition:
+            if self.done():
+                return
+
+            self._state = "CANCELLED_AND_NOTIFIED"
+            self._result_received = True
+            self._cancel_requested = True
+
+            for waiter in self._waiters:
+                waiter.add_cancelled(self)
+
+            self._condition.notify_all()  # type: ignore[attr-defined]
+
+        self._invoke_callbacks()  # type: ignore[attr-defined]
+
+    def _set_result_or_exception(
+        self,
+        result: Optional[Any] = None,
+        exception: Optional[BaseException] = None,
+        profiling_info: Optional[ProfileResult] = None,
+    ) -> None:
+        with self._condition:  # type: ignore[attr-defined]
+            if self.cancelled():
+                raise concurrent.futures.InvalidStateError(f"invalid future state: {self._state}")
+
+            if self._result_received:
+                raise concurrent.futures.InvalidStateError("future already received object data.")
+
+            if profiling_info is not None:
+                if self._profiling_info is not None:
+                    raise concurrent.futures.InvalidStateError("cannot set profiling info twice.")
+
+                self._profiling_info = profiling_info
+
+            self._state = "FINISHED"
+            self._result_received = True
+
+            if exception is not None:
+                assert result is None
+                self._exception = exception
+                for waiter in self._waiters:
+                    waiter.add_exception(self)
+            else:
+                self._result = result
+                for waiter in self._waiters:
+                    waiter.add_result(self)
+
+            self._condition.notify_all()  # type: ignore[attr-defined]
+
+        self._invoke_callbacks()  # type: ignore[attr-defined]
+
+    def set_result(self, result: Any, profiling_info: Optional[ProfileResult] = None) -> None:
+        self._set_result_or_exception(result=result, profiling_info=profiling_info)
+
+    def set_exception(self, exception: Optional[BaseException], profiling_info: Optional[ProfileResult] = None) -> None:
+        self._set_result_or_exception(exception=exception, profiling_info=profiling_info)
+
+    def result(self, timeout: Optional[float] = None) -> Any:
+        with self._condition:  # type: ignore[attr-defined]
+            self._wait_result_ready(timeout)
+
+            # if it's a delayed future, get the result when future.result() gets called
+            if self._is_delayed:
+                self._get_result_object()
+
+            return super().result()
+
+    def exception(self, timeout: Optional[float] = None) -> Optional[BaseException]:
+        with self._condition:  # type: ignore[attr-defined]
+            self._wait_result_ready(timeout)
+
+            # if it's a delayed future, get the result when future.exception() gets called
+            if self._is_delayed:
+                self._get_result_object()
+
+            return super().exception()
+
+    def cancel(self, timeout: Optional[float] = None) -> bool:
+        with self._condition:  # type: ignore[attr-defined]
+            if self.cancelled():
+                return True
+
+            if self.done():
+                return False
+
+            if not self._cancel_requested:
+                # Send cancellation request to the server
+                cancel_flags = TaskCancel.TaskCancelFlags(force=True)
+
+                if self._group_task_id is not None:
+                    self._connector_agent.send(TaskCancel.new_msg(self._group_task_id, flags=cancel_flags))
+                else:
+                    self._connector_agent.send(TaskCancel.new_msg(self._task_id, flags=cancel_flags))
+
+                self._cancel_requested = True
+
+            # Wait for the answer from the server, which can either be a cancel confirmation, or the results if the
+            # task finished while being canceled.
+            self._wait_result_ready(timeout)
+
+            return self.cancelled()
+
+    def add_done_callback(self, fn: Callable[["ScalerFuture"], Any]) -> None:
+        with self._condition:
+            if self.done():
+                self._get_result_object()
+            else:
+                self._done_callbacks.append(fn)  # type: ignore[attr-defined]
+                return
+
+        try:
+            fn(self)
+        except Exception:
+            concurrent.futures._base.LOGGER.exception(f"exception calling callback for {self!r}")
+            raise
+
+    def _on_waiters_updated(self, waiters: EventList):
+        with self._condition:  # type: ignore[attr-defined]
+            # if it's a delayed future, get the result when a waiter gets added
+            if self._is_delayed and len(self._waiters) > 0:
+                self._get_result_object()
+
+    def _has_result_listeners(self) -> bool:
+        return len(self._done_callbacks) > 0 or len(self._waiters) > 0  # type: ignore[attr-defined]
+
+    def _get_result_object(self):
+        with self._condition:  # type: ignore[attr-defined]
+            if self._result_object_id is None or self.cancelled() or self._result_received:
+                return
+
+            object_bytes = self._connector_storage.get_object(self._result_object_id)
+
+            if self._is_simple_task():
+                # immediately delete non-graph result objects
+                # TODO: graph task results could also be deleted if these are not required by another task of the graph.
+                self._connector_storage.delete_object(self._result_object_id)
+
+            if self._task_state == TaskState.Success:
+                self.set_result(self._serializer.deserialize(object_bytes))
+            elif self._task_state == TaskState.Failed:
+                self.set_exception(deserialize_failure(object_bytes))
+            else:
+                raise ValueError(f"unexpected task state: {self._task_state}")
+
+    def _wait_result_ready(self, timeout: Optional[float] = None):
+        """
+        Blocks until the future is done (either successfully, or on failure/cancellation).
+
+        Raises a `TimeoutError` if it blocks for more than `timeout` seconds.
+        """
+        if not self.done() and not self._condition.wait(timeout):
+            raise concurrent.futures.TimeoutError()
+
+    def _is_simple_task(self):
+        return self._group_task_id is None and self._task_id is not None
+
+    def __task_type(self) -> str:
+        if self._group_task_id is None:
+            return "SimpleTask"
+
+        if self._group_task_id == self._task_id:
+            return "GraphUmbrellaTask"
+        else:
+            return "GraphSubTask"
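The docstring above is the client-facing contract: a delayed future can report `done()` before its payload has been downloaded, and `result()` fetches it lazily. A minimal usage sketch, assuming a scheduler reachable at tcp://127.0.0.1:2345 and a top-level `Client` whose `submit()` returns a `ScalerFuture` (both taken from the upstream Scaler documentation, not from this diff):

import math

from scaler import Client  # assumed top-level export

# Hypothetical sketch: requires a running Scaler scheduler at this address.
with Client(address="tcp://127.0.0.1:2345") as client:
    future = client.submit(math.sqrt, 16.0)  # a ScalerFuture

    # For a delayed future, done() can become True before the result payload
    # has been pulled from object storage, so result() may still block briefly.
    print(future.result())  # 4.0

    # cancel() sends TaskCancel to the scheduler and may block until the
    # cancellation is confirmed (or until the task finishes first).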
scaler/client/object_buffer.py
ADDED
@@ -0,0 +1,129 @@
+import dataclasses
+import pickle
+from typing import Any, Callable, List, Optional, Set
+
+import cloudpickle
+
+from scaler.client.serializer.mixins import Serializer
+from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
+from scaler.protocol.python.common import ObjectMetadata
+from scaler.protocol.python.message import ObjectInstruction
+from scaler.utility.identifiers import ClientID, ObjectID
+
+
+@dataclasses.dataclass
+class ObjectCache:
+    object_id: ObjectID
+    object_type: ObjectMetadata.ObjectContentType
+    object_name: bytes
+    object_payload: bytes
+
+
+class ObjectBuffer:
+    def __init__(
+        self,
+        identity: ClientID,
+        serializer: Serializer,
+        connector_agent: SyncConnector,
+        connector_storage: SyncObjectStorageConnector,
+    ):
+        self._identity = identity
+        self._serializer = serializer
+
+        self._connector_agent = connector_agent
+        self._connector_storage = connector_storage
+
+        self._valid_object_ids: Set[ObjectID] = set()
+        self._pending_objects: List[ObjectCache] = list()
+
+        self._serializer_object_id = self.__send_serializer()
+
+    def buffer_send_function(self, fn: Callable) -> ObjectCache:
+        return self.__buffer_send_serialized_object(self.__construct_function(fn))
+
+    def buffer_send_object(self, obj: Any, name: Optional[str] = None) -> ObjectCache:
+        return self.__buffer_send_serialized_object(self.__construct_object(obj, name))
+
+    def commit_send_objects(self):
+        if not self._pending_objects:
+            return
+
+        object_instructions_to_send = [
+            (obj_cache.object_id, obj_cache.object_type, obj_cache.object_name) for obj_cache in self._pending_objects
+        ]
+
+        self._connector_agent.send(
+            ObjectInstruction.new_msg(
+                ObjectInstruction.ObjectInstructionType.Create,
+                self._identity,
+                ObjectMetadata.new_msg(*zip(*object_instructions_to_send)),
+            )
+        )
+
+        for obj_cache in self._pending_objects:
+            self._connector_storage.set_object(obj_cache.object_id, obj_cache.object_payload)
+
+        self._pending_objects.clear()
+
+    def clear(self):
+        """
+        Remove all committed and pending objects.
+        """
+
+        self._pending_objects.clear()
+
+        # the Clear instruction does not clear the serializer.
+        self._valid_object_ids.clear()
+        self._valid_object_ids.add(self._serializer_object_id)
+
+        self._connector_agent.send(
+            ObjectInstruction.new_msg(
+                ObjectInstruction.ObjectInstructionType.Clear, self._identity, ObjectMetadata.new_msg(tuple())
+            )
+        )
+
+    def is_valid_object_id(self, object_id: ObjectID) -> bool:
+        return object_id in self._valid_object_ids
+
+    def __construct_serializer(self) -> ObjectCache:
+        serializer_payload = cloudpickle.dumps(self._serializer, protocol=pickle.HIGHEST_PROTOCOL)
+        object_id = ObjectID.generate_serializer_object_id(self._identity)
+        serializer_cache = ObjectCache(
+            object_id, ObjectMetadata.ObjectContentType.Serializer, b"serializer", serializer_payload
+        )
+
+        return serializer_cache
+
+    def __construct_function(self, fn: Callable) -> ObjectCache:
+        function_payload = self._serializer.serialize(fn)
+        object_id = ObjectID.generate_object_id(self._identity)
+        function_cache = ObjectCache(
+            object_id,
+            ObjectMetadata.ObjectContentType.Object,
+            getattr(fn, "__name__", f"<func {repr(object_id)}>").encode(),
+            function_payload,
+        )
+
+        return function_cache
+
+    def __construct_object(self, obj: Any, name: Optional[str] = None) -> ObjectCache:
+        object_payload = self._serializer.serialize(obj)
+        object_id = ObjectID.generate_object_id(self._identity)
+        name_bytes = name.encode() if name else f"<obj {repr(object_id)}>".encode()
+        object_cache = ObjectCache(object_id, ObjectMetadata.ObjectContentType.Object, name_bytes, object_payload)
+
+        return object_cache
+
+    def __buffer_send_serialized_object(self, object_cache: ObjectCache) -> ObjectCache:
+        if object_cache.object_id not in self._valid_object_ids:
+            self._pending_objects.append(object_cache)
+            self._valid_object_ids.add(object_cache.object_id)
+
+        return object_cache
+
+    def __send_serializer(self) -> ObjectID:
+        serialized_serializer = self.__construct_serializer()
+        self.__buffer_send_serialized_object(serialized_serializer)
+        self.commit_send_objects()
+
+        return serialized_serializer.object_id
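`ObjectBuffer` splits sending into two phases: `buffer_send_function` / `buffer_send_object` only stage an `ObjectCache` and record its ID for deduplication, while `commit_send_objects` announces every pending ID to the scheduler in one `ObjectInstruction.Create` message and then uploads each payload to object storage. A hedged sketch of that call pattern; `buffer` and `my_func` are placeholders, since constructing the real connectors is outside this diff:

# Hypothetical call pattern against an already-constructed ObjectBuffer.
fn_cache = buffer.buffer_send_function(my_func)            # staged only
arg_cache = buffer.buffer_send_object(42, name="answer")   # staged, deduplicated by ObjectID

buffer.commit_send_objects()  # one Create instruction for all pending IDs,
                              # then one set_object() upload per payload

buffer.clear()  # drops everything except the serializer object, which survives Clear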
scaler/client/object_reference.py
ADDED
@@ -0,0 +1,25 @@
+import dataclasses
+
+from scaler.utility.identifiers import ObjectID
+
+
+@dataclasses.dataclass
+class ObjectReference:
+    name: bytes
+    size: int
+    object_id: ObjectID
+
+    def __repr__(self):
+        return f"ObjectReference(name={self.name!r}, size={self.size} bytes, id={self.object_id!r})"
+
+    def __hash__(self):
+        return hash(self.object_id)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ObjectReference):
+            return NotImplemented
+
+        return self.object_id == other.object_id
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
scaler/client/serializer/__init__.py
File without changes
scaler/client/serializer/default.py
ADDED
@@ -0,0 +1,16 @@
+import pickle
+from typing import Any
+
+import cloudpickle
+
+from scaler.client.serializer.mixins import Serializer
+
+
+class DefaultSerializer(Serializer):
+    @staticmethod
+    def serialize(obj: Any) -> bytes:
+        return cloudpickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+
+    @staticmethod
+    def deserialize(payload: bytes) -> Any:
+        return cloudpickle.loads(payload)
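Because `DefaultSerializer` uses cloudpickle rather than plain pickle, it can round-trip lambdas and closures, which is what lets clients submit ad-hoc functions. A small self-contained check:

from scaler.client.serializer.default import DefaultSerializer

offset = 10
payload = DefaultSerializer.serialize(lambda x: x + offset)  # cloudpickle captures the closure
restored = DefaultSerializer.deserialize(payload)
assert restored(5) == 15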
scaler/client/serializer/mixins.py
ADDED
@@ -0,0 +1,38 @@
+import abc
+from typing import Any
+
+
+class Serializer(metaclass=abc.ABCMeta):
+    @staticmethod
+    @abc.abstractmethod
+    def serialize(obj: Any) -> bytes:
+        """
+        Serialize an object to bytes. This method is used for the function object, EACH argument
+        object, and the function result object. For example:
+
+            def add(a, b):
+                return a + b
+
+            client.submit(add, 1, 2)
+
+        The add function and the arguments 1 and 2 are each serialized and sent to the worker; the result of a + b
+        is serialized, sent back to the client, and deserialized with the deserialize() method below.
+
+        :param obj: the object to be serialized; can be a function object, an argument object, or a function result object
+        :return: the serialized bytes of the object
+        """
+
+        raise NotImplementedError()
+
+    @staticmethod
+    @abc.abstractmethod
+    def deserialize(payload: bytes) -> Any:
+        """
+        Deserialize bytes back into the original object. This method is used to deserialize the function
+        object bytes and EACH serialized argument and serialized function result.
+
+        :param payload: the serialized bytes of the object; can be a function object, an argument object, or a
+            function result object
+        :return: the deserialized object
+        """
+        raise NotImplementedError()
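The docstrings spell out the contract: one serializer handles the function object, each argument, and the result. A minimal sketch of a custom implementation; JSON is used only to illustrate the interface, since it cannot encode arbitrary function objects and so is not a drop-in replacement for DefaultSerializer:

import json
from typing import Any

from scaler.client.serializer.mixins import Serializer


class JsonSerializer(Serializer):
    # Illustrative only: suitable for plain-data arguments and results,
    # not for serializing the function object itself.
    @staticmethod
    def serialize(obj: Any) -> bytes:
        return json.dumps(obj).encode()

    @staticmethod
    def deserialize(payload: bytes) -> Any:
        return json.loads(payload.decode())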
scaler/cluster/__init__.py
File without changes
scaler/cluster/cluster.py
ADDED
@@ -0,0 +1,115 @@
+import logging
+import multiprocessing
+import os
+import signal
+from typing import Dict, List, Optional, Tuple
+
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.zmq import ZMQConfig
+from scaler.utility.logging.utility import setup_logger
+from scaler.worker.worker import Worker
+
+
+class Cluster(multiprocessing.get_context("spawn").Process):  # type: ignore[misc]
+
+    def __init__(
+        self,
+        address: ZMQConfig,
+        object_storage_address: Optional[ObjectStorageConfig],
+        preload: Optional[str],
+        worker_io_threads: int,
+        worker_names: List[str],
+        per_worker_capabilities: Dict[str, int],
+        per_worker_task_queue_size: int,
+        heartbeat_interval_seconds: int,
+        task_timeout_seconds: int,
+        death_timeout_seconds: int,
+        garbage_collect_interval_seconds: int,
+        trim_memory_threshold_bytes: int,
+        hard_processor_suspend: bool,
+        event_loop: str,
+        logging_paths: Tuple[str, ...],
+        logging_config_file: Optional[str],
+        logging_level: str,
+    ):
+        multiprocessing.Process.__init__(self, name="WorkerMaster")
+
+        self._address = address
+        self._object_storage_address = object_storage_address
+        self._preload = preload
+        self._worker_io_threads = worker_io_threads
+        self._worker_names = worker_names
+        self._per_worker_capabilities = per_worker_capabilities
+
+        self._per_worker_task_queue_size = per_worker_task_queue_size
+        self._heartbeat_interval_seconds = heartbeat_interval_seconds
+        self._task_timeout_seconds = task_timeout_seconds
+        self._death_timeout_seconds = death_timeout_seconds
+        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
+        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
+        self._hard_processor_suspend = hard_processor_suspend
+        self._event_loop = event_loop
+
+        self._logging_paths = logging_paths
+        self._logging_config_file = logging_config_file
+        self._logging_level = logging_level
+
+        self._workers: List[Worker] = []
+
+    def run(self):
+        setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
+        self.__register_signal()
+        self.__start_workers_and_run_forever()
+
+    def __destroy(self, *args):
+        assert args is not None
+        logging.info(f"{self.__get_prefix()} received signal, shutting down")
+        for worker in self._workers:
+            logging.info(f"{self.__get_prefix()} shutting down {worker.identity!r}")
+            os.kill(worker.pid, signal.SIGINT)
+
+    def __register_signal(self):
+        signal.signal(signal.SIGINT, self.__destroy)
+        signal.signal(signal.SIGTERM, self.__destroy)
+
+    def __start_workers_and_run_forever(self):
+        logging.info(
+            f"{self.__get_prefix()} starting {len(self._worker_names)} workers, heartbeat_interval_seconds="
+            f"{self._heartbeat_interval_seconds}, task_timeout_seconds={self._task_timeout_seconds}"
+        )
+
+        self._workers = [
+            Worker(
+                event_loop=self._event_loop,
+                name=name,
+                address=self._address,
+                object_storage_address=self._object_storage_address,
+                capabilities=self._per_worker_capabilities,
+                preload=self._preload,
+                io_threads=self._worker_io_threads,
+                task_queue_size=self._per_worker_task_queue_size,
+                heartbeat_interval_seconds=self._heartbeat_interval_seconds,
+                garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
+                trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
+                task_timeout_seconds=self._task_timeout_seconds,
+                death_timeout_seconds=self._death_timeout_seconds,
+                hard_processor_suspend=self._hard_processor_suspend,
+                logging_paths=self._logging_paths,
+                logging_level=self._logging_level,
+            )
+            for name in self._worker_names
+        ]
+
+        for worker in self._workers:
+            worker.start()
+
+        for worker in self._workers:
+            logging.info(f"{worker.identity!r} started")
+
+        for worker in self._workers:
+            worker.join()
+
+        logging.info(f"{self.__get_prefix()} shutdown")
+
+    def __get_prefix(self):
+        return f"{self.__class__.__name__}:"
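`Cluster.__destroy` does not kill workers outright; it forwards the received signal to each child as SIGINT so every worker can unwind cleanly, and `run()` simply joins them. A stripped-down, self-contained sketch of the same pattern in generic Python (not Scaler's API; POSIX-only, like the os.kill call above):

import multiprocessing
import os
import signal
import time


def _worker() -> None:
    # A child that exits cleanly on SIGINT (raised as KeyboardInterrupt).
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass


class Master(multiprocessing.get_context("spawn").Process):  # type: ignore[misc]
    def run(self) -> None:
        ctx = multiprocessing.get_context("spawn")
        children = [ctx.Process(target=_worker) for _ in range(2)]

        def _forward(*_args) -> None:
            # Forward the signal to every child, as Cluster.__destroy does.
            for child in children:
                os.kill(child.pid, signal.SIGINT)

        signal.signal(signal.SIGINT, _forward)
        signal.signal(signal.SIGTERM, _forward)

        for child in children:
            child.start()
        for child in children:
            child.join()  # returns once the forwarded signal stops each child


if __name__ == "__main__":
    master = Master()
    master.start()
    time.sleep(2)
    os.kill(master.pid, signal.SIGTERM)  # triggers the forwarding handler in run()
    master.join()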