opengris_scaler-1.12.37-cp38-cp38-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/client/client.py
ADDED
@@ -0,0 +1,672 @@
import dataclasses
import functools
import logging
import threading
import uuid
from collections import Counter
from inspect import signature
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import zmq

from scaler.client.agent.client_agent import ClientAgent
from scaler.client.agent.future_manager import ClientFutureManager
from scaler.client.future import ScalerFuture
from scaler.client.object_buffer import ObjectBuffer
from scaler.client.object_reference import ObjectReference
from scaler.client.serializer.default import DefaultSerializer
from scaler.client.serializer.mixins import Serializer
from scaler.config.defaults import DEFAULT_CLIENT_TIMEOUT_SECONDS, DEFAULT_HEARTBEAT_INTERVAL_SECONDS
from scaler.config.types.zmq import ZMQConfig, ZMQType
from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
from scaler.io.sync_connector import ZMQSyncConnector
from scaler.io.utility import create_sync_object_storage_connector
from scaler.protocol.python.message import ClientDisconnect, ClientShutdownResponse, GraphTask, Task
from scaler.utility.exceptions import ClientQuitException, MissingObjects
from scaler.utility.graph.optimization import cull_graph
from scaler.utility.graph.topological_sorter import TopologicalSorter
from scaler.utility.identifiers import ClientID, ObjectID, TaskID
from scaler.utility.metadata.profile_result import ProfileResult
from scaler.utility.metadata.task_flags import TaskFlags, retrieve_task_flags_from_task
from scaler.worker.agent.processor.processor import Processor


@dataclasses.dataclass
class _CallNode:
    func: Callable
    args: Tuple[str, ...]

    def __post_init__(self):
        if not callable(self.func):
            raise TypeError(f"the first item of the tuple must be a function, got {self.func}")

        if not isinstance(self.args, tuple):
            raise TypeError(f"arguments must be a tuple, got {self.args}")

        for arg in self.args:
            if not isinstance(arg, str):
                raise TypeError(f"argument `{arg}` must be a string and the string has to be in the graph")


class Client:
    def __init__(
        self,
        address: Optional[str] = None,
        profiling: bool = False,
        timeout_seconds: int = DEFAULT_CLIENT_TIMEOUT_SECONDS,
        heartbeat_interval_seconds: int = DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
        serializer: Serializer = DefaultSerializer(),
        stream_output: bool = False,
        object_storage_address: Optional[str] = None,
    ):
        """
        The Scaler Client used to send tasks to a scheduler.

        :param address: Address of Scheduler to submit work to. If None, will attempt to auto-detect it
            when running inside a worker context.
        :type address: Optional[str]
        :param profiling: If True, the returned futures will have the `task_duration()` property enabled.
        :type profiling: bool
        :param timeout_seconds: Seconds until the heartbeat times out
        :type timeout_seconds: int
        :param heartbeat_interval_seconds: Frequency of the heartbeat to the scheduler, in seconds
        :type heartbeat_interval_seconds: int
        :param stream_output: If True, stdout/stderr will be streamed to the client during task execution
        :type stream_output: bool
        :param object_storage_address: Override for the object storage address (e.g., for Docker/Kubernetes port
            mapping). If None, will use the address received from the scheduler.
        :type object_storage_address: Optional[str]
        """
        address = self._resolve_scheduler_address(address)
        self.__initialize__(
            address,
            profiling,
            timeout_seconds,
            heartbeat_interval_seconds,
            serializer,
            stream_output,
            object_storage_address,
        )

    def __initialize__(
        self,
        address: str,
        profiling: bool,
        timeout_seconds: int,
        heartbeat_interval_seconds: int,
        serializer: Serializer = DefaultSerializer(),
        stream_output: bool = False,
        object_storage_address: Optional[str] = None,
    ):
        self._serializer = serializer

        self._profiling = profiling
        self._stream_output = stream_output
        self._identity = ClientID.generate_client_id()

        self._client_agent_address = ZMQConfig(ZMQType.inproc, host=f"scaler_client_{uuid.uuid4().hex}")
        self._scheduler_address = ZMQConfig.from_string(address)
        self._timeout_seconds = timeout_seconds
        self._heartbeat_interval_seconds = heartbeat_interval_seconds

        self._stop_event = threading.Event()
        self._context = zmq.Context()
        self._connector_agent: SyncConnector = ZMQSyncConnector(
            context=self._context, socket_type=zmq.PAIR, address=self._client_agent_address, identity=self._identity
        )

        self._future_manager = ClientFutureManager(self._serializer)
        self._agent = ClientAgent(
            identity=self._identity,
            client_agent_address=self._client_agent_address,
            scheduler_address=ZMQConfig.from_string(address),
            context=self._context,
            future_manager=self._future_manager,
            stop_event=self._stop_event,
            timeout_seconds=self._timeout_seconds,
            heartbeat_interval_seconds=self._heartbeat_interval_seconds,
            serializer=self._serializer,
            object_storage_address=object_storage_address,
        )
        self._agent.start()

        logging.info(f"ScalerClient: connect to scheduler at {self._scheduler_address}")

        # Blocks until the agent receives the object storage address
        self._object_storage_address = self._agent.get_object_storage_address()

        logging.info(f"ScalerClient: connect to object storage at {self._object_storage_address}")
        self._connector_storage: SyncObjectStorageConnector = create_sync_object_storage_connector(
            self._object_storage_address.host, self._object_storage_address.port
        )

        self._object_buffer = ObjectBuffer(
            self._identity, self._serializer, self._connector_agent, self._connector_storage
        )
        self._future_factory = functools.partial(
            ScalerFuture,
            serializer=self._serializer,
            connector_agent=self._connector_agent,
            connector_storage=self._connector_storage,
        )

    @property
    def identity(self) -> ClientID:
        return self._identity

    def __del__(self):
        self.disconnect()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.disconnect()

    def __getstate__(self) -> dict:
        """
        Serializes the client object's state.

        Client serialization is useful when a client reference is used within a remote task:


        .. code:: python

            client = Client(...)

            def fibonacci(client: Client, n: int):
                if n == 0:
                    return 0
                elif n == 1:
                    return 1
                else:
                    a = client.submit(fibonacci, client, n - 1)
                    b = client.submit(fibonacci, client, n - 2)
                    return a.result() + b.result()

            print(client.submit(fibonacci, client, 7).result())


        Serializing the client only saves the address parameters. When deserialized, a new client object connecting
        to the same scheduler and remote logger will be instantiated.
        """

        return {
            "address": self._scheduler_address.to_address(),
            "profiling": self._profiling,
            "stream_output": self._stream_output,
            "timeout_seconds": self._timeout_seconds,
            "heartbeat_interval_seconds": self._heartbeat_interval_seconds,
        }

    def __setstate__(self, state: dict) -> None:
        # TODO: fix copying the serializer
        self.__initialize__(
            address=state["address"],
            profiling=state["profiling"],
            stream_output=state["stream_output"],
            timeout_seconds=state["timeout_seconds"],
            heartbeat_interval_seconds=state["heartbeat_interval_seconds"],
        )

    def submit(self, fn: Callable, *args, **kwargs) -> ScalerFuture:
        """
        Submit a single task (function with arguments) to the scheduler, and return a future.

        See `submit_verbose()` for additional parameters.

        :param fn: function to be executed remotely
        :type fn: Callable
        :param args: positional arguments that will be passed to the function
        :param kwargs: keyword arguments that will be passed to the function
        :return: future of the submitted task
        :rtype: ScalerFuture
        """

        return self.submit_verbose(fn, args, kwargs)

    def submit_verbose(
        self, fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any], capabilities: Optional[Dict[str, int]] = None
    ) -> ScalerFuture:
        """
        Submit a single task (function with arguments) to the scheduler and return a future, optionally routing the
        task to specific workers.

        :param fn: function to be executed remotely
        :type fn: Callable
        :param args: positional arguments that will be passed to the function
        :param kwargs: keyword arguments that will be passed to the function
        :param capabilities: capabilities used for routing the tasks, e.g. `{"gpu": 2, "memory": 1_000_000_000}`.
        :type capabilities: Optional[Dict[str, int]]
        :return: future of the submitted task
        :rtype: ScalerFuture
        """

        self.__assert_client_not_stopped()

        function_object_id = self._object_buffer.buffer_send_function(fn).object_id
        all_args = Client.__convert_kwargs_to_args(fn, args, kwargs)

        task, future = self.__submit(function_object_id, all_args, delayed=True, capabilities=capabilities)

        self._object_buffer.commit_send_objects()
        self._connector_agent.send(task)
        return future

    def map(
        self, fn: Callable, iterable: Iterable[Tuple[Any, ...]], capabilities: Optional[Dict[str, int]] = None
    ) -> List[Any]:
        if not all(isinstance(args, (tuple, list)) for args in iterable):
            raise TypeError("iterable should be a list of argument tuples (or lists) for the function")

        self.__assert_client_not_stopped()

        function_object_id = self._object_buffer.buffer_send_function(fn).object_id
        tasks, futures = zip(
            *[self.__submit(function_object_id, args, delayed=False, capabilities=capabilities) for args in iterable]
        )

        self._object_buffer.commit_send_objects()
        for task in tasks:
            self._connector_agent.send(task)

        try:
            results = [fut.result() for fut in futures]
        except Exception as e:
            logging.exception(f"error happened during scaler client.map:\n{e}")
            self.disconnect()
            raise e

        return results

    def get(
        self,
        graph: Dict[str, Union[Any, Tuple[Union[Callable, str], ...]]],
        keys: List[str],
        block: bool = True,
        capabilities: Optional[Dict[str, int]] = None,
    ) -> Dict[str, Union[Any, ScalerFuture]]:
        """
        Compute a task graph and return the results (or futures) for the requested keys.

        .. code-block:: python
           :linenos:

           graph = {
               "a": 1,
               "b": 2,
               "c": (inc, "a"),
               "d": (inc, "b"),
               "e": (add, "c", "d")
           }

        :param graph: dictionary representation of the task graph
        :type graph: Dict[str, Union[Any, Tuple[Union[Callable, str], ...]]]
        :param keys: list of keys to get results for from the computed graph
        :type keys: List[str]
        :param block: if True, it will directly return a dictionary that maps from keys to results
        :type block: bool
        :param capabilities: capabilities used for routing the tasks, e.g. `{"gpu": 2, "memory": 1_000_000_000}`.
        :type capabilities: Optional[Dict[str, int]]
        :return: dictionary mapping keys to futures, or to results if block=True is specified
        :rtype: Dict[str, Union[Any, ScalerFuture]]
        """

        self.__assert_client_not_stopped()

        capabilities = capabilities or {}

        graph = cull_graph(graph, keys)

        node_name_to_argument, call_graph = self.__split_data_and_graph(graph)
        self.__check_graph(node_name_to_argument, call_graph, keys)

        graph_task, compute_futures, finished_futures = self.__construct_graph(
            node_name_to_argument, call_graph, keys, block, capabilities
        )
        self._object_buffer.commit_send_objects()
        self._connector_agent.send(graph_task)

        self._future_manager.add_future(
            self._future_factory(
                task=Task.new_msg(
                    task_id=graph_task.task_id,
                    source=self._identity,
                    metadata=b"",
                    func_object_id=None,
                    function_args=[],
                    capabilities=capabilities,
                ),
                is_delayed=not block,
                group_task_id=graph_task.task_id,
            )
        )
        for future in compute_futures.values():
            self._future_manager.add_future(future)

        # preserve the future insertion order based on the inputted keys
        futures = {}
        for key in keys:
            if key in compute_futures:
                futures[key] = compute_futures[key]
            else:
                futures[key] = finished_futures[key]

        if not block:
            # just return the futures
            return futures

        try:
            results = {k: v.result() for k, v in futures.items()}
        except Exception as e:
            logging.exception(f"error happened during scaler client.get:\n{e}")
            self.disconnect()
            raise e

        return results

    def send_object(self, obj: Any, name: Optional[str] = None) -> ObjectReference:
        """
        Send an object to the scheduler; this can be used to cache very large data on the scheduler and reuse it in
        multiple tasks.

        :param obj: object to send; it will be serialized and sent to the scheduler
        :type obj: Any
        :param name: name to give to the cached argument
        :type name: Optional[str]
        :return: object reference
        :rtype: ObjectReference
        """

        self.__assert_client_not_stopped()

        cache = self._object_buffer.buffer_send_object(obj, name)
        return ObjectReference(cache.object_name, len(cache.object_payload), cache.object_id)

    def clear(self):
        """
        Clear all resources used by the client; this will cancel all running futures and invalidate all existing
        object references.
        """

        # It's important to ensure that all running futures are cancelled/finished before clearing objects, or else
        # we might end up with tasks indefinitely waiting on no-longer-existing objects.
        self._future_manager.cancel_all_futures()

        self._object_buffer.clear()

    def disconnect(self):
        """
        Disconnect from the connected scheduler; this will not shut down the scheduler.
        """

        # Handle the case where the client wasn't fully initialized
        if not hasattr(self, "_stop_event"):
            return

        if self._stop_event.is_set():
            self.__destroy()
            return

        logging.info(f"ScalerClient: disconnect from {self._scheduler_address.to_address()}")

        self._future_manager.cancel_all_futures()

        self._connector_agent.send(ClientDisconnect.new_msg(ClientDisconnect.DisconnectType.Disconnect))

        self.__destroy()

    def __receive_shutdown_response(self):
        message: Optional[ClientShutdownResponse] = None
        while not isinstance(message, ClientShutdownResponse):
            message = self._connector_agent.receive()

        if not message.accepted:
            raise ValueError("Scheduler is in protected mode. Can't shut down")

    def shutdown(self):
        """
        Shut down all workers connected to the scheduler this client connects to; this will cancel all other clients'
        ongoing tasks. Be aware that the shutdown might not succeed if the scheduler is configured in protected mode,
        in which case neither the scheduler nor the workers can be shut down.
        """

        if not self._agent.is_alive():
            self.__destroy()
            return

        logging.info(f"ScalerClient: request shutdown for {self._scheduler_address.to_address()}")

        self._future_manager.cancel_all_futures()

        self._connector_agent.send(ClientDisconnect.new_msg(ClientDisconnect.DisconnectType.Shutdown))
        try:
            self.__receive_shutdown_response()
        finally:
            self.__destroy()

    def __submit(
        self,
        function_object_id: ObjectID,
        args: Tuple[Any, ...],
        delayed: bool,
        capabilities: Optional[Dict[str, int]] = None,
    ) -> Tuple[Task, ScalerFuture]:
        task_id = TaskID.generate_task_id()

        capabilities = capabilities or {}

        function_args: List[Union[ObjectID, TaskID]] = []
        for arg in args:
            if isinstance(arg, ObjectReference):
                if not self._object_buffer.is_valid_object_id(arg.object_id):
                    raise MissingObjects(f"unknown object: {arg.object_id!r}.")

                function_args.append(arg.object_id)
            else:
                function_args.append(self._object_buffer.buffer_send_object(arg).object_id)

        task_flags_bytes = self.__get_task_flags().serialize()

        task = Task.new_msg(
            task_id=task_id,
            source=self._identity,
            metadata=task_flags_bytes,
            func_object_id=function_object_id,
            function_args=function_args,
            capabilities=capabilities,
        )

        future = self._future_factory(task=task, is_delayed=delayed, group_task_id=None)
        self._future_manager.add_future(future)
        return task, future

    @staticmethod
    def __convert_kwargs_to_args(fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Tuple[Any, ...]:
        all_params = [p for p in signature(fn).parameters.values()]

        params = [p for p in all_params if p.kind in {p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD}]

        if len(args) >= len(params):
            return args

        number_of_required = len([p for p in params if p.default is p.empty])

        args_list = list(args)
        kwargs = kwargs.copy()
        kwargs.update({p.name: p.default for p in all_params if p.kind == p.KEYWORD_ONLY if p.default != p.empty})

        for p in params[len(args_list) : number_of_required]:
            try:
                args_list.append(kwargs.pop(p.name))
            except KeyError:
                missing = tuple(p.name for p in params[len(args_list) : number_of_required])
                raise TypeError(f"{fn} missing {len(missing)} arguments: {missing}")

        for p in params[len(args_list) :]:
            args_list.append(kwargs.pop(p.name, p.default))

        return tuple(args_list)

    def __split_data_and_graph(
        self, graph: Dict[str, Union[Any, Tuple[Union[Callable, str], ...]]]
    ) -> Tuple[Dict[str, Tuple[ObjectID, Any]], Dict[str, _CallNode]]:
        call_graph = {}
        node_name_to_argument: Dict[str, Tuple[ObjectID, Union[Any, Tuple[Union[Callable, Any], ...]]]] = dict()

        for node_name, node in graph.items():
            if isinstance(node, tuple) and len(node) > 0 and callable(node[0]):
                call_graph[node_name] = _CallNode(func=node[0], args=node[1:])  # type: ignore[arg-type]
                continue

            if isinstance(node, ObjectReference):
                object_id = node.object_id
            else:
                object_id = self._object_buffer.buffer_send_object(node, name=node_name).object_id

            node_name_to_argument[node_name] = (object_id, node)

        return node_name_to_argument, call_graph

    @staticmethod
    def __check_graph(
        node_to_argument: Dict[str, Tuple[ObjectID, Any]], call_graph: Dict[str, _CallNode], keys: List[str]
    ):
        duplicate_keys = [key for key, count in dict(Counter(keys)).items() if count > 1]
        if duplicate_keys:
            raise KeyError(f"duplicate key detected in argument keys: {duplicate_keys}")

        # sanity check the graph
        for key in keys:
            if key not in call_graph and key not in node_to_argument:
                raise KeyError(f"key {key} has to be in the graph")

        sorter: TopologicalSorter[str] = TopologicalSorter()
        for node_name, node in call_graph.items():
            for arg in node.args:
                if arg not in node_to_argument and arg not in call_graph:
                    raise KeyError(f"argument {arg} in node '{node_name}': {node} is not defined in the graph")

            sorter.add(node_name, *node.args)

        # check for cyclic dependencies
        sorter.prepare()

    def __construct_graph(
        self,
        node_name_to_arguments: Dict[str, Tuple[ObjectID, Any]],
        call_graph: Dict[str, _CallNode],
        keys: List[str],
        block: bool,
        capabilities: Dict[str, int],
    ) -> Tuple[GraphTask, Dict[str, ScalerFuture], Dict[str, ScalerFuture]]:
        graph_task_id = TaskID.generate_task_id()

        node_name_to_task_id = {node_name: TaskID.generate_task_id() for node_name in call_graph.keys()}

        task_flags_bytes = self.__get_task_flags().serialize()

        task_id_to_tasks = dict()

        for node_name, node in call_graph.items():
            task_id = node_name_to_task_id[node_name]
            function_cache = self._object_buffer.buffer_send_function(node.func)

            arguments: List[Union[TaskID, ObjectID]] = []
            for arg in node.args:
                assert arg in call_graph or arg in node_name_to_arguments

                if arg in call_graph:
                    arguments.append(TaskID(node_name_to_task_id[arg]))
                elif arg in node_name_to_arguments:
                    argument, _ = node_name_to_arguments[arg]
                    arguments.append(argument)
                else:
                    raise ValueError("Not possible")

            task_id_to_tasks[task_id] = Task.new_msg(
                task_id=task_id,
                source=self._identity,
                metadata=task_flags_bytes,
                func_object_id=function_cache.object_id,
                function_args=arguments,
                capabilities=capabilities,
            )

        result_task_ids = [node_name_to_task_id[key] for key in keys if key in call_graph]
        graph_task = GraphTask.new_msg(graph_task_id, self._identity, result_task_ids, list(task_id_to_tasks.values()))

        compute_futures = {}
        ready_futures = {}
        for key in keys:
            if key in call_graph:
                compute_futures[key] = self._future_factory(
                    task=task_id_to_tasks[node_name_to_task_id[key]], is_delayed=not block, group_task_id=graph_task_id
                )

            elif key in node_name_to_arguments:
                argument, data = node_name_to_arguments[key]
                future: ScalerFuture = self._future_factory(
                    task=Task.new_msg(
                        task_id=TaskID.generate_task_id(),
                        source=self._identity,
                        metadata=b"",
                        func_object_id=None,
                        function_args=[],
                        capabilities={},
                    ),
                    is_delayed=False,
                    group_task_id=graph_task_id,
                )
                future.set_result(data, ProfileResult())
                ready_futures[key] = future

            else:
                raise ValueError(f"cannot find {key=} in graph")

        return graph_task, compute_futures, ready_futures

    def __get_task_flags(self) -> TaskFlags:
        parent_task_priority = self.__get_parent_task_priority()

        if parent_task_priority is not None:
            task_priority = parent_task_priority + 1
        else:
            task_priority = 0

        return TaskFlags(profiling=self._profiling, priority=task_priority, stream_output=self._stream_output)

    def __assert_client_not_stopped(self):
        if self._stop_event.is_set():
            raise ClientQuitException("client is already stopped.")

    def __destroy(self):
        self._agent.join()
        self._context.destroy(linger=1)

    @staticmethod
    def __get_parent_task_priority() -> Optional[int]:
        """If the client is running inside a Scaler processor, returns the priority of the associated task."""

        current_processor = Processor.get_current_processor()

        if current_processor is None:
            return None

        current_task = current_processor.current_task()
        assert current_task is not None

        return retrieve_task_flags_from_task(current_task).priority

    def _resolve_scheduler_address(self, address: Optional[str]) -> str:
        """Resolve the scheduler address based on the provided address and the worker context."""
        # A provided address always takes precedence
        if address is not None:
            return address

        # No address provided; check whether we're running inside a worker context
        current_processor = Processor.get_current_processor()
        if current_processor is None:
            raise ValueError(
                "No scheduler address provided and not running inside a worker context. "
                "Please provide a scheduler address when creating the Client outside of a worker."
            )

        # Return the scheduler address from the current processor
        return current_processor.scheduler_address().to_address()
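
For orientation, below is a minimal usage sketch of the Client API defined in the file above, pieced together from its docstrings and signatures. The scheduler address, the inc/add helper functions, and the printed results are illustrative assumptions, not contents of the package; a scheduler must actually be reachable at the given address for this to run.

from scaler.client.client import Client  # Client as defined in scaler/client/client.py above


def inc(x):
    return x + 1


def add(x, y):
    return x + y


# assumed example address; __initialize__ parses it with ZMQConfig.from_string()
with Client(address="tcp://127.0.0.1:2345") as client:
    # single task: submit() buffers the function and arguments, then returns a ScalerFuture
    future = client.submit(inc, 1)
    print(future.result())  # 2

    # batch of tasks: map() takes one argument tuple per task and blocks for the results
    print(client.map(inc, [(1,), (2,), (3,)]))  # [2, 3, 4]

    # large shared data: send_object() caches it once and returns an ObjectReference
    # that can be reused as an argument across many tasks
    numbers = client.send_object(list(range(1001)), name="numbers")
    print(client.submit(sum, numbers).result())  # 500500

    # task graph: get() culls the graph to the requested keys and resolves them
    graph = {"a": 1, "b": 2, "c": (inc, "a"), "d": (inc, "b"), "e": (add, "c", "d")}
    print(client.get(graph, keys=["e"]))  # {"e": 5}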