opengris-scaler 1.12.37 (cp38-cp38-musllinux_1_2_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/client/client.py
@@ -0,0 +1,672 @@
+ import dataclasses
+ import functools
+ import logging
+ import threading
+ import uuid
+ from collections import Counter
+ from inspect import signature
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+ import zmq
+
+ from scaler.client.agent.client_agent import ClientAgent
+ from scaler.client.agent.future_manager import ClientFutureManager
+ from scaler.client.future import ScalerFuture
+ from scaler.client.object_buffer import ObjectBuffer
+ from scaler.client.object_reference import ObjectReference
+ from scaler.client.serializer.default import DefaultSerializer
+ from scaler.client.serializer.mixins import Serializer
+ from scaler.config.defaults import DEFAULT_CLIENT_TIMEOUT_SECONDS, DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+ from scaler.config.types.zmq import ZMQConfig, ZMQType
+ from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
+ from scaler.io.sync_connector import ZMQSyncConnector
+ from scaler.io.utility import create_sync_object_storage_connector
+ from scaler.protocol.python.message import ClientDisconnect, ClientShutdownResponse, GraphTask, Task
+ from scaler.utility.exceptions import ClientQuitException, MissingObjects
+ from scaler.utility.graph.optimization import cull_graph
+ from scaler.utility.graph.topological_sorter import TopologicalSorter
+ from scaler.utility.identifiers import ClientID, ObjectID, TaskID
+ from scaler.utility.metadata.profile_result import ProfileResult
+ from scaler.utility.metadata.task_flags import TaskFlags, retrieve_task_flags_from_task
+ from scaler.worker.agent.processor.processor import Processor
+
+
+ @dataclasses.dataclass
+ class _CallNode:
+     func: Callable
+     args: Tuple[str, ...]
+
+     def __post_init__(self):
+         if not callable(self.func):
+             raise TypeError(f"the first item of the tuple must be function, get {self.func}")
+
+         if not isinstance(self.args, tuple):
+             raise TypeError(f"arguments must be tuple, get {self.args}")
+
+         for arg in self.args:
+             if not isinstance(arg, str):
+                 raise TypeError(f"argument `{arg}` must be a string and the string has to be in the graph")
+
+
+ class Client:
+     def __init__(
+         self,
+         address: Optional[str] = None,
+         profiling: bool = False,
+         timeout_seconds: int = DEFAULT_CLIENT_TIMEOUT_SECONDS,
+         heartbeat_interval_seconds: int = DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
+         serializer: Serializer = DefaultSerializer(),
+         stream_output: bool = False,
+         object_storage_address: Optional[str] = None,
+     ):
+         """
+         The Scaler Client used to send tasks to a scheduler.
+
+         :param address: Address of Scheduler to submit work to. If None, will attempt to auto-detect
+             when running inside a worker context.
+         :type address: Optional[str]
+         :param profiling: If True, the returned futures will have the `task_duration()` property enabled.
+         :type profiling: bool
+         :param timeout_seconds: Seconds until heartbeat times out
+         :type timeout_seconds: int
+         :param heartbeat_interval_seconds: Frequency of heartbeat to scheduler in seconds
+         :type heartbeat_interval_seconds: int
+         :param stream_output: If True, stdout/stderr will be streamed to client during task execution
+         :type stream_output: bool
+         :param object_storage_address: Override object storage address (e.g., for Docker/Kubernetes port mapping).
+             If None, will use address received from scheduler.
+         :type object_storage_address: Optional[str]
+         """
+         address = self._resolve_scheduler_address(address)
+         self.__initialize__(
+             address,
+             profiling,
+             timeout_seconds,
+             heartbeat_interval_seconds,
+             serializer,
+             stream_output,
+             object_storage_address,
+         )
+
+     def __initialize__(
+         self,
+         address: str,
+         profiling: bool,
+         timeout_seconds: int,
+         heartbeat_interval_seconds: int,
+         serializer: Serializer = DefaultSerializer(),
+         stream_output: bool = False,
+         object_storage_address: Optional[str] = None,
+     ):
+         self._serializer = serializer
+
+         self._profiling = profiling
+         self._stream_output = stream_output
+         self._identity = ClientID.generate_client_id()
+
+         self._client_agent_address = ZMQConfig(ZMQType.inproc, host=f"scaler_client_{uuid.uuid4().hex}")
+         self._scheduler_address = ZMQConfig.from_string(address)
+         self._timeout_seconds = timeout_seconds
+         self._heartbeat_interval_seconds = heartbeat_interval_seconds
+
+         self._stop_event = threading.Event()
+         self._context = zmq.Context()
+         self._connector_agent: SyncConnector = ZMQSyncConnector(
+             context=self._context, socket_type=zmq.PAIR, address=self._client_agent_address, identity=self._identity
+         )
+
+         self._future_manager = ClientFutureManager(self._serializer)
+         self._agent = ClientAgent(
+             identity=self._identity,
+             client_agent_address=self._client_agent_address,
+             scheduler_address=ZMQConfig.from_string(address),
+             context=self._context,
+             future_manager=self._future_manager,
+             stop_event=self._stop_event,
+             timeout_seconds=self._timeout_seconds,
+             heartbeat_interval_seconds=self._heartbeat_interval_seconds,
+             serializer=self._serializer,
+             object_storage_address=object_storage_address,
+         )
+         self._agent.start()
+
+         logging.info(f"ScalerClient: connect to scheduler at {self._scheduler_address}")
+
+         # Blocks until the agent receives the object storage address
+         self._object_storage_address = self._agent.get_object_storage_address()
+
+         logging.info(f"ScalerClient: connect to object storage at {self._object_storage_address}")
+         self._connector_storage: SyncObjectStorageConnector = create_sync_object_storage_connector(
+             self._object_storage_address.host, self._object_storage_address.port
+         )
+
+         self._object_buffer = ObjectBuffer(
+             self._identity, self._serializer, self._connector_agent, self._connector_storage
+         )
+         self._future_factory = functools.partial(
+             ScalerFuture,
+             serializer=self._serializer,
+             connector_agent=self._connector_agent,
+             connector_storage=self._connector_storage,
+         )
+
+     @property
+     def identity(self) -> ClientID:
+         return self._identity
+
+     def __del__(self):
+         self.disconnect()
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.disconnect()
+
+     def __getstate__(self) -> dict:
+         """
+         Serializes the client object's state.
+
+         Client serialization is useful when a client reference is used within a remote task:
+
+
+         .. code:: python
+
+             client = Client(...)
+
+             def fibonacci(client: Client, n: int):
+                 if n == 0:
+                     return 0
+                 elif n == 1:
+                     return 1
+                 else:
+                     a = client.submit(fibonacci, n - 1)
+                     b = client.submit(fibonacci, n - 2)
+                     return a.result() + b.result()
+
+             print(client.submit(fibonacci, client, 7).result())
+
+
+         When serializing the client, only saves the address parameters. When deserialized, a new client object
+         connecting to the same scheduler and remote logger will be instantiated.
+         """
+
+         return {
+             "address": self._scheduler_address.to_address(),
+             "profiling": self._profiling,
+             "stream_output": self._stream_output,
+             "timeout_seconds": self._timeout_seconds,
+             "heartbeat_interval_seconds": self._heartbeat_interval_seconds,
+         }
+
+     def __setstate__(self, state: dict) -> None:
+         # TODO: fix copy the serializer
+         self.__initialize__(
+             address=state["address"],
+             profiling=state["profiling"],
+             stream_output=state["stream_output"],
+             timeout_seconds=state["timeout_seconds"],
+             heartbeat_interval_seconds=state["heartbeat_interval_seconds"],
+         )
+
+     def submit(self, fn: Callable, *args, **kwargs) -> ScalerFuture:
+         """
+         Submit a single task (function with arguments) to the scheduler, and return a future.
+
+         See `submit_verbose()` for additional parameters.
+
+         :param fn: function to be executed remotely
+         :type fn: Callable
+         :param args: positional arguments will be passed to function
+         :param kwargs: keyword arguments will be passed to function
+         :return: future of the submitted task
+         :rtype: ScalerFuture
+         """
+
+         return self.submit_verbose(fn, args, kwargs)
+
+     def submit_verbose(
+         self, fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any], capabilities: Optional[Dict[str, int]] = None
+     ) -> ScalerFuture:
+         """
+         Submit a single task (function with arguments) to the scheduler, and return a future. Possibly route the task to
+         specific workers.
+
+         :param fn: function to be executed remotely
+         :type fn: Callable
+         :param args: positional arguments will be passed to function
+         :param kwargs: keyword arguments will be passed to function
+         :param capabilities: capabilities used for routing the tasks, e.g. `{"gpu": 2, "memory": 1_000_000_000}`.
+         :type capabilities: Optional[Dict[str, int]]
+         :return: future of the submitted task
+         :rtype: ScalerFuture
+         """
+
+         self.__assert_client_not_stopped()
+
+         function_object_id = self._object_buffer.buffer_send_function(fn).object_id
+         all_args = Client.__convert_kwargs_to_args(fn, args, kwargs)
+
+         task, future = self.__submit(function_object_id, all_args, delayed=True, capabilities=capabilities)
+
+         self._object_buffer.commit_send_objects()
+         self._connector_agent.send(task)
+         return future
+
+     def map(
+         self, fn: Callable, iterable: Iterable[Tuple[Any, ...]], capabilities: Optional[Dict[str, int]] = None
+     ) -> List[Any]:
+         if not all(isinstance(args, (tuple, list)) for args in iterable):
+             raise TypeError("iterable should be list of arguments(list or tuple-like) of function")
+
+         self.__assert_client_not_stopped()
+
+         function_object_id = self._object_buffer.buffer_send_function(fn).object_id
+         tasks, futures = zip(
+             *[self.__submit(function_object_id, args, delayed=False, capabilities=capabilities) for args in iterable]
+         )
+
+         self._object_buffer.commit_send_objects()
+         for task in tasks:
+             self._connector_agent.send(task)
+
+         try:
+             results = [fut.result() for fut in futures]
+         except Exception as e:
+             logging.exception(f"error happened when do scaler client.map:\n{e}")
+             self.disconnect()
+             raise e
+
+         return results
+
+     def get(
+         self,
+         graph: Dict[str, Union[Any, Tuple[Union[Callable, str], ...]]],
+         keys: List[str],
+         block: bool = True,
+         capabilities: Optional[Dict[str, int]] = None,
+     ) -> Dict[str, Union[Any, ScalerFuture]]:
+         """
+         .. code-block:: python
+            :linenos:
+             graph = {
+                 "a": 1,
+                 "b": 2,
+                 "c": (inc, "a"),
+                 "d": (inc, "b"),
+                 "e": (add, "c", "d")
+             }
+
+         :param graph: dictionary presentation of task graphs
+         :type graph: Dict[str, Union[Any, Tuple[Union[Callable, Any]]
+         :param keys: list of keys want to get results from computed graph
+         :type keys: List[str]
+         :param block: if True, it will directly return a dictionary that maps from keys to results
+         :return: dictionary of mapping keys to futures, or map to results if block=True is specified
+         :param capabilities: capabilities used for routing the tasks, e.g. `{"gpu": 2, "memory": 1_000_000_000}`.
+         :type capabilities: Optional[Dict[str, int]]
+         :rtype: Dict[ScalerFuture]
+         """
+
+         self.__assert_client_not_stopped()
+
+         capabilities = capabilities or {}
+
+         graph = cull_graph(graph, keys)
+
+         node_name_to_argument, call_graph = self.__split_data_and_graph(graph)
+         self.__check_graph(node_name_to_argument, call_graph, keys)
+
+         graph_task, compute_futures, finished_futures = self.__construct_graph(
+             node_name_to_argument, call_graph, keys, block, capabilities
+         )
+         self._object_buffer.commit_send_objects()
+         self._connector_agent.send(graph_task)
+
+         self._future_manager.add_future(
+             self._future_factory(
+                 task=Task.new_msg(
+                     task_id=graph_task.task_id,
+                     source=self._identity,
+                     metadata=b"",
+                     func_object_id=None,
+                     function_args=[],
+                     capabilities=capabilities,
+                 ),
+                 is_delayed=not block,
+                 group_task_id=graph_task.task_id,
+             )
+         )
+         for future in compute_futures.values():
+             self._future_manager.add_future(future)
+
+         # preserve the future insertion order based on inputted keys
+         futures = {}
+         for key in keys:
+             if key in compute_futures:
+                 futures[key] = compute_futures[key]
+             else:
+                 futures[key] = finished_futures[key]
+
+         if not block:
+             # just return futures
+             return futures
+
+         try:
+             results = {k: v.result() for k, v in futures.items()}
+         except Exception as e:
+             logging.exception(f"error happened when do scaler client.get:\n{e}")
+             self.disconnect()
+             raise e
+
+         return results
+
+     def send_object(self, obj: Any, name: Optional[str] = None) -> ObjectReference:
+         """
+         send object to scheduler, this can be used to cache very large data to scheduler, and reuse it in multiple
+         tasks
+
+         :param obj: object to send, it will be serialized and send to scheduler
+         :type obj: Any
+         :param name: give a name to the cached argument
+         :type name: Optional[str]
+         :return: object reference
+         :rtype ObjectReference
+         """
+
+         self.__assert_client_not_stopped()
+
+         cache = self._object_buffer.buffer_send_object(obj, name)
+         return ObjectReference(cache.object_name, len(cache.object_payload), cache.object_id)
+
+     def clear(self):
+         """
+         clear all resources used by the client, this will cancel all running futures and invalidate all existing object
+         references
+         """
+
+         # It's important to be ensure that all running futures are cancelled/finished before clearing object, or else we
+         # might end up with tasks indefinitely waiting on no longer existing objects.
+         self._future_manager.cancel_all_futures()
+
+         self._object_buffer.clear()
+
+     def disconnect(self):
+         """
+         disconnect from connected scheduler, this will not shut down the scheduler
+         """
+
+         # Handle case where client wasn't fully initialized
+         if not hasattr(self, "_stop_event"):
+             return
+
+         if self._stop_event.is_set():
+             self.__destroy()
+             return
+
+         logging.info(f"ScalerClient: disconnect from {self._scheduler_address.to_address()}")
+
+         self._future_manager.cancel_all_futures()
+
+         self._connector_agent.send(ClientDisconnect.new_msg(ClientDisconnect.DisconnectType.Disconnect))
+
+         self.__destroy()
+
+     def __receive_shutdown_response(self):
+         message: Optional[ClientShutdownResponse] = None
+         while not isinstance(message, ClientShutdownResponse):
+             message = self._connector_agent.receive()
+
+         if not message.accepted:
+             raise ValueError("Scheduler is in protected mode. Can't shutdown")
+
+     def shutdown(self):
+         """
+         shutdown all workers that connected to the scheduler this client connects to, it will cancel all other
+         clients' ongoing tasks, please be aware shutdown might not success if scheduler is configured as protected mode,
+         then it cannot shut down scheduler and the workers
+         """
+
+         if not self._agent.is_alive():
+             self.__destroy()
+             return
+
+         logging.info(f"ScalerClient: request shutdown for {self._scheduler_address.to_address()}")
+
+         self._future_manager.cancel_all_futures()
+
+         self._connector_agent.send(ClientDisconnect.new_msg(ClientDisconnect.DisconnectType.Shutdown))
+         try:
+             self.__receive_shutdown_response()
+         finally:
+             self.__destroy()
+
+     def __submit(
+         self,
+         function_object_id: ObjectID,
+         args: Tuple[Any, ...],
+         delayed: bool,
+         capabilities: Optional[Dict[str, int]] = None,
+     ) -> Tuple[Task, ScalerFuture]:
+         task_id = TaskID.generate_task_id()
+
+         capabilities = capabilities or {}
+
+         function_args: List[Union[ObjectID, TaskID]] = []
+         for arg in args:
+             if isinstance(arg, ObjectReference):
+                 if not self._object_buffer.is_valid_object_id(arg.object_id):
+                     raise MissingObjects(f"unknown object: {arg.object_id!r}.")
+
+                 function_args.append(arg.object_id)
+             else:
+                 function_args.append(self._object_buffer.buffer_send_object(arg).object_id)
+
+         task_flags_bytes = self.__get_task_flags().serialize()
+
+         task = Task.new_msg(
+             task_id=task_id,
+             source=self._identity,
+             metadata=task_flags_bytes,
+             func_object_id=function_object_id,
+             function_args=function_args,
+             capabilities=capabilities,
+         )
+
+         future = self._future_factory(task=task, is_delayed=delayed, group_task_id=None)
+         self._future_manager.add_future(future)
+         return task, future
+
+     @staticmethod
+     def __convert_kwargs_to_args(fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Tuple[Any, ...]:
+         all_params = [p for p in signature(fn).parameters.values()]
+
+         params = [p for p in all_params if p.kind in {p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD}]
+
+         if len(args) >= len(params):
+             return args
+
+         number_of_required = len([p for p in params if p.default is p.empty])
+
+         args_list = list(args)
+         kwargs = kwargs.copy()
+         kwargs.update({p.name: p.default for p in all_params if p.kind == p.KEYWORD_ONLY if p.default != p.empty})
+
+         for p in params[len(args_list) : number_of_required]:
+             try:
+                 args_list.append(kwargs.pop(p.name))
+             except KeyError:
+                 missing = tuple(p.name for p in params[len(args_list) : number_of_required])
+                 raise TypeError(f"{fn} missing {len(missing)} arguments: {missing}")
+
+         for p in params[len(args_list) :]:
+             args_list.append(kwargs.pop(p.name, p.default))
+
+         return tuple(args_list)
+
+     def __split_data_and_graph(
+         self, graph: Dict[str, Union[Any, Tuple[Union[Callable, str], ...]]]
+     ) -> Tuple[Dict[str, Tuple[ObjectID, Any]], Dict[str, _CallNode]]:
+         call_graph = {}
+         node_name_to_argument: Dict[str, Tuple[ObjectID, Union[Any, Tuple[Union[Callable, Any], ...]]]] = dict()
+
+         for node_name, node in graph.items():
+             if isinstance(node, tuple) and len(node) > 0 and callable(node[0]):
+                 call_graph[node_name] = _CallNode(func=node[0], args=node[1:])  # type: ignore[arg-type]
+                 continue
+
+             if isinstance(node, ObjectReference):
+                 object_id = node.object_id
+             else:
+                 object_id = self._object_buffer.buffer_send_object(node, name=node_name).object_id
+
+             node_name_to_argument[node_name] = (object_id, node)
+
+         return node_name_to_argument, call_graph
+
+     @staticmethod
+     def __check_graph(
+         node_to_argument: Dict[str, Tuple[ObjectID, Any]], call_graph: Dict[str, _CallNode], keys: List[str]
+     ):
+         duplicate_keys = [key for key, count in dict(Counter(keys)).items() if count > 1]
+         if duplicate_keys:
+             raise KeyError(f"duplicate key detected in argument keys: {duplicate_keys}")
+
+         # sanity check graph
+         for key in keys:
+             if key not in call_graph and key not in node_to_argument:
+                 raise KeyError(f"key {key} has to be in graph")
+
+         sorter: TopologicalSorter[str] = TopologicalSorter()
+         for node_name, node in call_graph.items():
+             for arg in node.args:
+                 if arg not in node_to_argument and arg not in call_graph:
+                     raise KeyError(f"argument {arg} in node '{node_name}': {node} is not defined in graph")
+
+             sorter.add(node_name, *node.args)
+
+         # check cyclic dependencies
+         sorter.prepare()
+
+     def __construct_graph(
+         self,
+         node_name_to_arguments: Dict[str, Tuple[ObjectID, Any]],
+         call_graph: Dict[str, _CallNode],
+         keys: List[str],
+         block: bool,
+         capabilities: Dict[str, int],
+     ) -> Tuple[GraphTask, Dict[str, ScalerFuture], Dict[str, ScalerFuture]]:
+         graph_task_id = TaskID.generate_task_id()
+
+         node_name_to_task_id = {node_name: TaskID.generate_task_id() for node_name in call_graph.keys()}
+
+         task_flags_bytes = self.__get_task_flags().serialize()
+
+         task_id_to_tasks = dict()
+
+         for node_name, node in call_graph.items():
+             task_id = node_name_to_task_id[node_name]
+             function_cache = self._object_buffer.buffer_send_function(node.func)
+
+             arguments: List[Union[TaskID, ObjectID]] = []
+             for arg in node.args:
+                 assert arg in call_graph or arg in node_name_to_arguments
+
+                 if arg in call_graph:
+                     arguments.append(TaskID(node_name_to_task_id[arg]))
+                 elif arg in node_name_to_arguments:
+                     argument, _ = node_name_to_arguments[arg]
+                     arguments.append(argument)
+                 else:
+                     raise ValueError("Not possible")
+
+             task_id_to_tasks[task_id] = Task.new_msg(
+                 task_id=task_id,
+                 source=self._identity,
+                 metadata=task_flags_bytes,
+                 func_object_id=function_cache.object_id,
+                 function_args=arguments,
+                 capabilities=capabilities,
+             )
+
+         result_task_ids = [node_name_to_task_id[key] for key in keys if key in call_graph]
+         graph_task = GraphTask.new_msg(graph_task_id, self._identity, result_task_ids, list(task_id_to_tasks.values()))
+
+         compute_futures = {}
+         ready_futures = {}
+         for key in keys:
+             if key in call_graph:
+                 compute_futures[key] = self._future_factory(
+                     task=task_id_to_tasks[node_name_to_task_id[key]], is_delayed=not block, group_task_id=graph_task_id
+                 )
+
+             elif key in node_name_to_arguments:
+                 argument, data = node_name_to_arguments[key]
+                 future: ScalerFuture = self._future_factory(
+                     task=Task.new_msg(
+                         task_id=TaskID.generate_task_id(),
+                         source=self._identity,
+                         metadata=b"",
+                         func_object_id=None,
+                         function_args=[],
+                         capabilities={},
+                     ),
+                     is_delayed=False,
+                     group_task_id=graph_task_id,
+                 )
+                 future.set_result(data, ProfileResult())
+                 ready_futures[key] = future
+
+             else:
+                 raise ValueError(f"cannot find {key=} in graph")
+
+         return graph_task, compute_futures, ready_futures
+
+     def __get_task_flags(self) -> TaskFlags:
+         parent_task_priority = self.__get_parent_task_priority()
+
+         if parent_task_priority is not None:
+             task_priority = parent_task_priority + 1
+         else:
+             task_priority = 0
+
+         return TaskFlags(profiling=self._profiling, priority=task_priority, stream_output=self._stream_output)
+
+     def __assert_client_not_stopped(self):
+         if self._stop_event.is_set():
+             raise ClientQuitException("client is already stopped.")
+
+     def __destroy(self):
+         self._agent.join()
+         self._context.destroy(linger=1)
+
+     @staticmethod
+     def __get_parent_task_priority() -> Optional[int]:
+         """If the client is running inside a Scaler processor, returns the priority of the associated task."""
+
+         current_processor = Processor.get_current_processor()
+
+         if current_processor is None:
+             return None
+
+         current_task = current_processor.current_task()
+         assert current_task is not None
+
+         return retrieve_task_flags_from_task(current_task).priority
+
+     def _resolve_scheduler_address(self, address: Optional[str]) -> str:
+         """Resolve the scheduler address based on the provided address and worker context."""
+         # Provided address always takes precedence
+         if address is not None:
+             return address
+
+         # No address provided, check if we're running inside a worker context
+         current_processor = Processor.get_current_processor()
+         if current_processor is None:
+             raise ValueError(
+                 "No scheduler address provided and not running inside a worker context. "
+                 "Please provide a scheduler address when creating the Client outside of a worker."
+             )
+
+         # Return the scheduler address from the current processor
+         return current_processor.scheduler_address().to_address()
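
For orientation, the following is a minimal usage sketch of the Client API introduced in this file. It is based only on the docstrings above; the scheduler address tcp://127.0.0.1:2345 is a placeholder, the `from scaler import Client` import assumes the top-level package re-exports Client (see scaler/__init__.py in the file list), and a running scheduler plus object storage server are assumed:

    from scaler import Client  # assumption: Client is re-exported by the scaler package

    def inc(x):
        return x + 1

    def add(a, b):
        return a + b

    with Client(address="tcp://127.0.0.1:2345") as client:  # placeholder scheduler address
        # submit() sends a single task and returns a ScalerFuture
        print(client.submit(inc, 41).result())  # 42

        # map() takes an iterable of argument tuples and blocks until all results are in
        print(client.map(add, [(1, 2), (3, 4)]))  # [3, 7]

        # get() executes a task graph: plain values are data nodes, tuples are (callable, *node_keys)
        graph = {"a": 1, "b": 2, "c": (inc, "a"), "d": (inc, "b"), "e": (add, "c", "d")}
        print(client.get(graph, keys=["e"]))  # {"e": 5}

Per the constructor docstring, `address` may be omitted when the Client is created inside a running worker, in which case the scheduler address is auto-detected from the current processor.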