opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,218 @@
1
+ import asyncio
2
+ import logging
3
+ import sys
4
+ import threading
5
+ from concurrent.futures import Future
6
+ from typing import Optional
7
+
8
+ import zmq.asyncio
9
+
10
+ from scaler.client.agent.disconnect_manager import ClientDisconnectManager
11
+ from scaler.client.agent.future_manager import ClientFutureManager
12
+ from scaler.client.agent.heartbeat_manager import ClientHeartbeatManager
13
+ from scaler.client.agent.object_manager import ClientObjectManager
14
+ from scaler.client.agent.task_manager import ClientTaskManager
15
+ from scaler.client.serializer.mixins import Serializer
16
+ from scaler.config.types.zmq import ZMQConfig
17
+ from scaler.io.async_connector import ZMQAsyncConnector
18
+ from scaler.io.mixins import AsyncConnector
19
+ from scaler.protocol.python.common import ObjectStorageAddress
20
+ from scaler.protocol.python.message import (
21
+ ClientDisconnect,
22
+ ClientHeartbeatEcho,
23
+ ClientShutdownResponse,
24
+ GraphTask,
25
+ ObjectInstruction,
26
+ Task,
27
+ TaskCancel,
28
+ TaskCancelConfirm,
29
+ TaskLog,
30
+ TaskResult,
31
+ )
32
+ from scaler.protocol.python.mixins import Message
33
+ from scaler.utility.event_loop import create_async_loop_routine
34
+ from scaler.utility.exceptions import ClientCancelledException, ClientQuitException, ClientShutdownException
35
+ from scaler.utility.identifiers import ClientID
36
+
37
+
38
class ClientAgent(threading.Thread):
    """Daemon thread relaying messages between a client and the scheduler.

    The agent owns two connectors: an internal ``zmq.PAIR`` socket bound to
    ``client_agent_address`` (messages coming from the client) and an external
    ``zmq.DEALER`` socket connected to ``scheduler_address``.  Received messages
    are dispatched to the disconnect, heartbeat, object and task managers, which
    are created when the thread starts running.
    """

    def __init__(
        self,
        identity: ClientID,
        client_agent_address: ZMQConfig,
        scheduler_address: ZMQConfig,
        context: zmq.Context,
        future_manager: ClientFutureManager,
        stop_event: threading.Event,
        timeout_seconds: int,
        heartbeat_interval_seconds: int,
        serializer: Serializer,
        object_storage_address: Optional[str] = None,
    ):
        """Store the configuration and create both connectors.

        Args:
            identity: identity used for the external (scheduler) connector and the object manager.
            client_agent_address: address the internal PAIR socket binds to.
            scheduler_address: address the external DEALER socket connects to.
            context: ZeroMQ context, shadowed by an asyncio context for each connector.
            future_manager: manager whose pending futures are failed when the agent stops with an error.
            stop_event: event set as soon as the agent's loops stop.
            timeout_seconds: passed to the heartbeat manager as its death timeout.
            heartbeat_interval_seconds: interval between two runs of the heartbeat routine.
            serializer: serializer instance kept on the agent.
            object_storage_address: optional address string; when given, it is parsed and returned by
                :meth:`get_object_storage_address` instead of the address received from the scheduler.
        """
        threading.Thread.__init__(self, daemon=True)

        self._stop_event = stop_event
        self._timeout_seconds = timeout_seconds
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._serializer = serializer

        self._identity = identity
        self._client_agent_address = client_agent_address
        self._scheduler_address = scheduler_address
        self._context = context

        # Resolved by the heartbeat manager, or failed in __get_loops() if the agent stops first.
        self._object_storage_address: Future[ObjectStorageAddress] = Future()
        if object_storage_address is not None:
            manual_config = ZMQConfig.from_string(object_storage_address)
            self._object_storage_address_override = ObjectStorageAddress.new_msg(manual_config.host, manual_config.port)
        else:
            self._object_storage_address_override = None

        self._future_manager = future_manager

        self._connector_internal: AsyncConnector = ZMQAsyncConnector(
            context=zmq.asyncio.Context.shadow(self._context),
            name="client_agent_internal",
            socket_type=zmq.PAIR,
            bind_or_connect="bind",
            address=self._client_agent_address,
            callback=self.__on_receive_from_client,
            identity=None,
        )
        self._connector_external: AsyncConnector = ZMQAsyncConnector(
            context=zmq.asyncio.Context.shadow(self._context),
            name="client_agent_external",
            socket_type=zmq.DEALER,
            address=self._scheduler_address,
            bind_or_connect="connect",
            callback=self.__on_receive_from_scheduler,
            identity=self._identity,
        )

        # The managers are only created once the thread runs, see __initialize().
        self._disconnect_manager: Optional[ClientDisconnectManager] = None
        self._heartbeat_manager: Optional[ClientHeartbeatManager] = None
        self._object_manager: Optional[ClientObjectManager] = None
        self._task_manager: Optional[ClientTaskManager] = None

    def __initialize(self):
        """Create the managers and wire them to the connectors and to each other."""
        self._disconnect_manager = ClientDisconnectManager()
        self._heartbeat_manager = ClientHeartbeatManager(
            death_timeout_seconds=self._timeout_seconds, storage_address_future=self._object_storage_address
        )
        self._object_manager = ClientObjectManager(identity=self._identity)
        self._task_manager = ClientTaskManager()

        # register all managers
        self._disconnect_manager.register(
            connector_internal=self._connector_internal, connector_external=self._connector_external
        )
        self._object_manager.register(
            connector_internal=self._connector_internal, connector_external=self._connector_external
        )
        self._task_manager.register(
            connector_external=self._connector_external,
            object_manager=self._object_manager,
            future_manager=self._future_manager,
        )
        self._heartbeat_manager.register(connector_external=self._connector_external)

    def __run_loop(self):
        """Run the agent's coroutine to completion on a fresh event loop."""
        self._loop = asyncio.new_event_loop()
        try:
            self._task = self._loop.create_task(self.__get_loops())
            self._loop.run_until_complete(self._task)
        finally:
            # Close the loop even when the main task raises, so its resources are released.
            self._loop.close()

    def run(self):
        """Thread entry point: build the managers, then run the event loop."""
        self.__initialize()
        self.__run_loop()

    def get_object_storage_address(self) -> ObjectStorageAddress:
        """Returns the object storage address, or block until it receives it."""
        if self._object_storage_address_override is not None:
            return self._object_storage_address_override
        return self._object_storage_address.result()

    async def __on_receive_from_client(self, message: Message):
        """Dispatch a message received on the internal connector.

        Raises:
            TypeError: if the message type is not handled.
        """
        if isinstance(message, ClientDisconnect):
            await self._disconnect_manager.on_client_disconnect(message)
            return

        if isinstance(message, ObjectInstruction):
            await self._object_manager.on_object_instruction(message)
            return

        if isinstance(message, Task):
            await self._task_manager.on_new_task(message)
            return

        if isinstance(message, TaskCancel):
            await self._task_manager.on_cancel_task(message)
            return

        if isinstance(message, GraphTask):
            await self._task_manager.on_new_graph_task(message)
            return

        raise TypeError(f"Unknown {message=}")

    async def __on_receive_from_scheduler(self, message: Message):
        """Dispatch a message received on the external connector.

        Raises:
            TypeError: if the message type is not handled.
        """
        if isinstance(message, ClientShutdownResponse):
            await self._disconnect_manager.on_client_shutdown_response(message)
            return

        if isinstance(message, ClientHeartbeatEcho):
            await self._heartbeat_manager.on_heartbeat_echo(message)
            return

        if isinstance(message, TaskLog):
            # Task logs are echoed verbatim on the matching local stream.
            log_type = sys.stdout if message.log_type == TaskLog.LogType.Stdout else sys.stderr
            print(message.content, file=log_type, end="")
            return

        if isinstance(message, TaskResult):
            await self._task_manager.on_task_result(message)
            return

        if isinstance(message, TaskCancelConfirm):
            await self._task_manager.on_task_cancel_confirm(message)
            return

        raise TypeError(f"Unknown {message=}")

    async def __get_loops(self):
        """Run the connector and heartbeat routines until one of them stops, then clean up.

        After the routines stop, the stop event is set, the sent objects are cleared and both
        connectors are destroyed.  If the routines stopped because of an exception, the object
        storage address future (when still pending) and all pending task futures are failed.

        Raises:
            BaseException: any exception other than cancellation, client quit/shutdown or
                ``TimeoutError`` is re-raised after the pending futures have been failed.
        """
        await self._heartbeat_manager.send_heartbeat()

        loops = [
            create_async_loop_routine(self._connector_external.routine, 0),
            create_async_loop_routine(self._connector_internal.routine, 0),
            create_async_loop_routine(self._heartbeat_manager.routine, self._heartbeat_interval_seconds),
        ]

        exception = None
        try:
            await asyncio.gather(*loops)
        except BaseException as e:
            exception = e
        finally:
            self._stop_event.set()  # always set the stop event before setting futures' exceptions

            try:
                await self._object_manager.clear_all_objects(clear_serializer=True)
            finally:
                # Destroy the connectors even if clearing the objects failed.
                self._connector_external.destroy()
                self._connector_internal.destroy()

        if exception is None:
            return

        if not self._object_storage_address.done():
            self._object_storage_address.set_exception(exception)

        if isinstance(exception, asyncio.CancelledError):
            logging.error("ClientAgent: async. loop cancelled")
            self._future_manager.set_all_futures_with_exception(ClientCancelledException("client cancelled"))
        elif isinstance(exception, (ClientQuitException, ClientShutdownException)):
            logging.info("ClientAgent: client quitting")
            self._future_manager.set_all_futures_with_exception(exception)
        elif isinstance(exception, TimeoutError):
            logging.error(f"ClientAgent: client timeout when connecting to {self._scheduler_address.to_address()}")
            self._future_manager.set_all_futures_with_exception(exception)
        else:
            # Unexpected failure: the loops have stopped, so fail the pending futures as well
            # instead of leaving them unresolved, then let the error propagate.
            self._future_manager.set_all_futures_with_exception(exception)
            raise exception
@@ -0,0 +1,27 @@
1
+ from typing import Optional
2
+
3
+ from scaler.client.agent.mixins import DisconnectManager
4
+ from scaler.io.mixins import AsyncConnector
5
+ from scaler.protocol.python.message import ClientDisconnect, ClientShutdownResponse
6
+ from scaler.utility.exceptions import ClientQuitException, ClientShutdownException
7
+
8
+
9
class ClientDisconnectManager(DisconnectManager):
    """Relays disconnect and shutdown messages, then raises to stop the client agent."""

    def __init__(self):
        # Both connectors stay unset until register() is called.
        self._connector_internal: Optional[AsyncConnector] = None
        self._connector_external: Optional[AsyncConnector] = None

    def register(self, connector_internal: AsyncConnector, connector_external: AsyncConnector):
        """Attach the internal and external connectors used to relay messages."""
        self._connector_internal, self._connector_external = connector_internal, connector_external

    async def on_client_disconnect(self, disconnect: ClientDisconnect):
        """Forward the disconnect message on the external connector.

        Raises:
            ClientQuitException: when the disconnect type is ``Disconnect``.
        """
        await self._connector_external.send(disconnect)

        is_plain_disconnect = disconnect.disconnect_type == ClientDisconnect.DisconnectType.Disconnect
        if not is_plain_disconnect:
            # Other disconnect types are only forwarded.
            return

        raise ClientQuitException("client disconnecting")

    async def on_client_shutdown_response(self, response: ClientShutdownResponse):
        """Forward the shutdown response on the internal connector, then stop.

        Raises:
            ClientShutdownException: always, once the response has been forwarded.
        """
        await self._connector_internal.send(response)

        raise ClientShutdownException("cluster shutting down")
@@ -0,0 +1,112 @@
1
+ import logging
2
+ import threading
3
+ from concurrent.futures import Future, InvalidStateError
4
+ from typing import Dict, Optional
5
+
6
+ from scaler.client.agent.mixins import FutureManager
7
+ from scaler.client.future import ScalerFuture
8
+ from scaler.client.serializer.mixins import Serializer
9
+ from scaler.protocol.python.common import TaskCancelConfirmType, TaskResultType, TaskState
10
+ from scaler.protocol.python.message import TaskCancelConfirm, TaskResult
11
+ from scaler.utility.exceptions import WorkerDiedError
12
+ from scaler.utility.identifiers import ObjectID, TaskID
13
+ from scaler.utility.metadata.profile_result import retrieve_profiling_result_from_task_result
14
+
15
+
16
class ClientFutureManager(FutureManager):
    """Tracks the pending ``ScalerFuture`` of each task and completes it from scheduler messages.

    All accesses to the task-id-to-future mapping are guarded by a re-entrant lock.
    """

    def __init__(self, serializer: Serializer):
        self._lock = threading.RLock()
        self._serializer = serializer

        self._task_id_to_future: Dict[TaskID, ScalerFuture] = dict()

    def add_future(self, future: Future):
        """Mark ``future`` as running and start tracking it under its task ID."""
        assert isinstance(future, ScalerFuture)
        with self._lock:
            future.set_running_or_notify_cancel()
            self._task_id_to_future[future.task_id] = future

    def cancel_all_futures(self):
        """Call ``cancel()`` on every future tracked at the time of the call."""
        with self._lock:
            futures_to_cancel = list(self._task_id_to_future.values())

        # Actually cancelling the futures should occur without holding the future manager's lock. That's because
        # `cancel()` is blocking, and requires the manager to process result and cancel confirm messages.

        logging.info(f"canceling {len(futures_to_cancel)} task(s)")
        for future in futures_to_cancel:
            future.cancel()

    def set_all_futures_with_exception(self, exception: Exception):
        """Fail every tracked future with ``exception`` and stop tracking all of them."""
        with self._lock:
            for future in self._task_id_to_future.values():
                try:
                    future.set_exception(exception)
                except InvalidStateError:
                    continue  # Future got canceled

            self._task_id_to_future.clear()

    def on_task_result(self, result: TaskResult):
        """Complete the future matching ``result``; results for untracked tasks are ignored.

        The future is only removed from the tracking map once the result has been validated, so
        that a malformed message does not leave it untracked and unresolved.

        Raises:
            TypeError: if the result type is unknown.
            ValueError: if the result carries more than one result object.
        """
        with self._lock:
            task_id = result.task_id
            future = self._task_id_to_future.get(task_id)
            if future is None:
                return

            assert result.task_id == future.task_id

            profile_result = retrieve_profiling_result_from_task_result(result)
            result_type = result.result_type

            if result_type == TaskResultType.FailedWorkerDied:
                del self._task_id_to_future[task_id]
                future.set_exception(
                    WorkerDiedError(f"worker died when processing task: {task_id.hex()}"), profile_result
                )

            elif result_type in (TaskResultType.Success, TaskResultType.Failed):
                final_state = TaskState.Success if result_type == TaskResultType.Success else TaskState.Failed
                # May raise: done before untracking so the future stays reachable on error.
                result_object_id = self.__get_result_object_id(result)
                del self._task_id_to_future[task_id]
                future.set_result_ready(result_object_id, final_state, profile_result)

            else:
                raise TypeError(f"{result.task_id.hex()}: Unknown task status: {result.result_type}")

    def on_task_cancel_confirm(self, cancel_confirm: TaskCancelConfirm):
        """Apply a cancel confirmation; confirmations for untracked tasks are ignored.

        A future whose cancellation failed stays tracked, as does a future whose confirmation
        has an unknown type.

        Raises:
            TypeError: if the cancel confirm type is unknown.
        """
        with self._lock:
            task_id = cancel_confirm.task_id
            future = self._task_id_to_future.get(task_id)
            if future is None:
                return

            assert cancel_confirm.task_id == future.task_id

            confirm_type = cancel_confirm.cancel_confirm_type

            if confirm_type == TaskCancelConfirmType.Canceled:
                del self._task_id_to_future[task_id]
                future.set_canceled()

            elif confirm_type == TaskCancelConfirmType.CancelNotFound:
                del self._task_id_to_future[task_id]
                logging.error(f"{task_id!r}: task to cancel not found")
                future.set_canceled()

            elif confirm_type == TaskCancelConfirmType.CancelFailed:
                # The task was not canceled: keep tracking its future.
                logging.error(f"{task_id!r}: task cancel failed")

            else:
                raise TypeError(f"{task_id}: unknown task cancel confirm type: {cancel_confirm.cancel_confirm_type}")

    @staticmethod
    def __get_result_object_id(result: TaskResult) -> Optional[ObjectID]:
        """Return the single result object ID of ``result``, or ``None`` when it has no result object.

        Raises:
            ValueError: if ``result`` carries more than one result object.
        """
        result_count = len(result.results)

        if result_count == 1:
            return ObjectID(result.results[0])

        if result_count == 0:
            # this will happen only if umbrella task is done
            return None

        raise ValueError(f"{result.task_id!r}: received multiple objects for the results: {len(result.results)=}")
@@ -0,0 +1,74 @@
1
+ import time
2
+ from concurrent.futures import Future
3
+ from typing import Optional
4
+
5
+ import psutil
6
+
7
+ from scaler.client.agent.mixins import HeartbeatManager, ObjectManager
8
+ from scaler.io.mixins import AsyncConnector
9
+ from scaler.protocol.python.common import ObjectStorageAddress
10
+ from scaler.protocol.python.message import ClientHeartbeat, ClientHeartbeatEcho
11
+ from scaler.protocol.python.status import Resource
12
+ from scaler.utility.mixins import Looper
13
+
14
+
15
class ClientHeartbeatManager(Looper, HeartbeatManager):
    """Sends client heartbeats to the scheduler and watches for its echoes.

    Elapsed times (death timeout and latency) are measured with the monotonic clock so that
    adjustments of the system wall clock cannot cause spurious timeouts or bogus latencies.
    """

    def __init__(self, death_timeout_seconds: int, storage_address_future: Future):
        """
        Args:
            death_timeout_seconds: maximum time without a heartbeat echo before ``routine()`` raises.
            storage_address_future: future resolved with the object storage address taken from a
                heartbeat echo.
        """
        self._death_timeout_seconds = death_timeout_seconds
        self._object_storage_address = storage_address_future

        self._process = psutil.Process()

        self._last_scheduler_contact = time.monotonic()
        # Monotonic timestamp of the heartbeat awaiting its echo; 0 means no heartbeat is awaiting one.
        self._start_timestamp_ns = 0
        self._latency_us = 0
        self._connected = False

        self._connector_external: Optional[AsyncConnector] = None
        self._object_manager: Optional[ObjectManager] = None

    def register(self, connector_external: AsyncConnector):
        """Attach the connector used to send heartbeats."""
        self._connector_external = connector_external

    async def send_heartbeat(self):
        """Send a heartbeat carrying this process' CPU and memory usage and the last measured latency."""
        await self._connector_external.send(
            ClientHeartbeat.new_msg(
                # CPU usage is sent as the percentage multiplied by 10, memory as the resident set size.
                Resource.new_msg(int(self._process.cpu_percent() * 10), self._process.memory_info().rss),
                self._latency_us,
            )
        )

    async def on_heartbeat_echo(self, heartbeat: ClientHeartbeatEcho):
        """Record the scheduler contact, update the latency and resolve the storage address future."""
        self._connected = True

        self._last_scheduler_contact = time.monotonic()
        if self._start_timestamp_ns == 0:
            # not handling echo if we didn't send out heartbeat
            return

        # Half of the round-trip time, converted from nanoseconds to microseconds.
        self._latency_us = int(((time.monotonic_ns() - self._start_timestamp_ns) / 2) // 1_000)
        self._start_timestamp_ns = 0

        if self._object_storage_address.done():
            return

        self._object_storage_address.set_result(heartbeat.object_storage_address())

    async def routine(self):
        """Check the death timeout, then send a heartbeat unless one is still awaiting its echo.

        Raises:
            TimeoutError: if no echo was received within the death timeout.
        """
        if time.monotonic() - self._last_scheduler_contact > self._death_timeout_seconds:
            raise TimeoutError(
                f"Timeout when connecting to scheduler {self._connector_external.address} "
                f"in {self._death_timeout_seconds} seconds"
            )

        if self._start_timestamp_ns != 0:
            # already sent heartbeat, expecting heartbeat echo, so not sending
            return

        await self.send_heartbeat()
        self._start_timestamp_ns = time.monotonic_ns()

    def get_object_storage_address(self) -> ObjectStorageAddress:
        """Returns the object storage configuration, or block until it receives it."""
        return self._object_storage_address.result()
@@ -0,0 +1,89 @@
1
+ import abc
2
+ from concurrent.futures import Future
3
+
4
+ from scaler.protocol.python.message import (
5
+ ClientDisconnect,
6
+ ClientHeartbeatEcho,
7
+ ClientShutdownResponse,
8
+ GraphTask,
9
+ ObjectInstruction,
10
+ Task,
11
+ TaskCancelConfirm,
12
+ TaskResult,
13
+ )
14
+
15
+
16
class HeartbeatManager(metaclass=abc.ABCMeta):
    """Interface of the component exchanging heartbeats with the scheduler."""

    @abc.abstractmethod
    async def send_heartbeat(self):
        """Send one heartbeat."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def on_heartbeat_echo(self, heartbeat: ClientHeartbeatEcho):
        """Handle a heartbeat echo message."""
        raise NotImplementedError()
24
+
25
+
26
class TimeoutManager(metaclass=abc.ABCMeta):
    """Interface of the component recording when the peer was last seen."""

    @abc.abstractmethod
    def update_last_seen_time(self):
        """Record that the peer has just been seen."""
        raise NotImplementedError()
30
+
31
+
32
class ObjectManager(metaclass=abc.ABCMeta):
    """Interface of the component handling object instructions on the client side."""

    @abc.abstractmethod
    async def on_object_instruction(self, object_instruction: ObjectInstruction):
        """Handle an object instruction message."""
        raise NotImplementedError()

    @abc.abstractmethod
    def on_task_result(self, result: TaskResult):
        """Handle a task result message (synchronous)."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def clear_all_objects(self, clear_serializer: bool):
        """Clear the managed objects; ``clear_serializer`` tells whether the serializer is cleared too."""
        raise NotImplementedError()
44
+
45
+
46
class TaskManager(metaclass=abc.ABCMeta):
    """Interface of the component handling task messages on the client side."""

    @abc.abstractmethod
    async def on_new_task(self, task: Task):
        """Handle a new task message."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def on_new_graph_task(self, task: GraphTask):
        """Handle a new graph task message."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def on_task_result(self, result: TaskResult):
        """Handle a task result message."""
        raise NotImplementedError()
58
+
59
+
60
class FutureManager(metaclass=abc.ABCMeta):
    """Interface of the component owning the futures of submitted tasks."""

    @abc.abstractmethod
    def add_future(self, future: Future):
        """Register a future with the manager."""
        raise NotImplementedError()

    @abc.abstractmethod
    def cancel_all_futures(self):
        """Cancel all the managed futures."""
        raise NotImplementedError()

    @abc.abstractmethod
    def set_all_futures_with_exception(self, exception: Exception):
        """Set ``exception`` on all the managed futures."""
        raise NotImplementedError()

    @abc.abstractmethod
    def on_task_result(self, result: TaskResult):
        """Handle a task result message."""
        raise NotImplementedError()

    @abc.abstractmethod
    def on_task_cancel_confirm(self, cancel_confirm: TaskCancelConfirm):
        """Handle a task cancel confirmation message."""
        raise NotImplementedError()
80
+
81
+
82
class DisconnectManager(metaclass=abc.ABCMeta):
    """Interface of the component handling client disconnection and cluster shutdown."""

    @abc.abstractmethod
    async def on_client_disconnect(self, disconnect: ClientDisconnect):
        """Handle a client disconnect message."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def on_client_shutdown_response(self, response: ClientShutdownResponse):
        """Handle a client shutdown response message."""
        raise NotImplementedError()
@@ -0,0 +1,98 @@
1
+ from typing import Optional, Set
2
+
3
+ from scaler.client.agent.mixins import ObjectManager
4
+ from scaler.io.mixins import AsyncConnector
5
+ from scaler.protocol.python.common import ObjectMetadata
6
+ from scaler.protocol.python.message import ObjectInstruction, TaskResult
7
+ from scaler.utility.identifiers import ClientID, ObjectID
8
+
9
+
10
class ClientObjectManager(ObjectManager):
    """Keeps track of the object IDs already sent and relays object instructions on the external connector."""

    def __init__(self, identity: ClientID):
        self._sent_object_ids: Set[ObjectID] = set()
        # ID of the serializer object once it has been sent, None before that.
        self._sent_serializer_id: Optional[ObjectID] = None

        self._identity = identity

        self._connector_internal: Optional[AsyncConnector] = None
        self._connector_external: Optional[AsyncConnector] = None

    def register(self, connector_internal: AsyncConnector, connector_external: AsyncConnector):
        """Attach the internal and external connectors."""
        self._connector_internal, self._connector_external = connector_internal, connector_external

    async def on_object_instruction(self, instruction: ObjectInstruction):
        """Route a Create, Delete or Clear instruction; any other instruction type is ignored."""
        kind = instruction.instruction_type
        instruction_types = ObjectInstruction.ObjectInstructionType

        if kind == instruction_types.Create:
            await self.__send_object_creation(instruction)
        elif kind == instruction_types.Delete:
            await self.__delete_objects(instruction)
        elif kind == instruction_types.Clear:
            await self.clear_all_objects(clear_serializer=False)

    def on_task_result(self, task_result: TaskResult):
        """Record the result object IDs of ``task_result`` as sent."""
        self._sent_object_ids.update(map(ObjectID, task_result.results))

    async def clear_all_objects(self, clear_serializer):
        """Forget the tracked object IDs and send one Delete instruction listing them.

        When ``clear_serializer`` is false and a serializer has been sent, the serializer ID is
        kept (neither forgotten nor deleted).
        """
        ids_to_clear = self._sent_object_ids.copy()

        if clear_serializer:
            self._sent_serializer_id = None
        elif self._sent_serializer_id is not None:
            ids_to_clear.remove(self._sent_serializer_id)

        self._sent_object_ids.difference_update(ids_to_clear)

        delete_instruction = ObjectInstruction.new_msg(
            ObjectInstruction.ObjectInstructionType.Delete,
            self._identity,
            ObjectMetadata.new_msg(tuple(ids_to_clear)),
        )
        await self._connector_external.send(delete_instruction)

    async def __send_object_creation(self, instruction: ObjectInstruction):
        """Forward a Create instruction restricted to the objects that were not sent yet.

        Raises:
            ValueError: if the instruction contains a serializer while one was already sent.
        """
        assert instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Create

        metadata = instruction.object_metadata

        new_object_ids = set(metadata.object_ids) - self._sent_object_ids
        if not new_object_ids:
            return

        serializer_type = ObjectMetadata.ObjectContentType.Serializer
        if serializer_type in metadata.object_types:
            if self._sent_serializer_id is not None:
                raise ValueError("trying to send multiple serializers.")

            serializer_index = metadata.object_types.index(serializer_type)
            self._sent_serializer_id = metadata.object_ids[serializer_index]

        # Keep the (id, type, name) triples of the new objects, then split them back into columns.
        new_packs = [
            object_pack
            for object_pack in zip(metadata.object_ids, metadata.object_types, metadata.object_names)
            if object_pack[0] in new_object_ids
        ]
        new_object_content = ObjectMetadata.new_msg(*zip(*new_packs))

        self._sent_object_ids.update(set(new_object_content.object_ids))

        creation = ObjectInstruction.new_msg(
            ObjectInstruction.ObjectInstructionType.Create, instruction.object_user, new_object_content
        )
        await self._connector_external.send(creation)

    async def __delete_objects(self, instruction: ObjectInstruction):
        """Forget the listed object IDs and forward the Delete instruction unchanged.

        Raises:
            ValueError: if the instruction lists the serializer that was sent.
        """
        assert instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete

        ids_to_delete = instruction.object_metadata.object_ids
        if self._sent_serializer_id in ids_to_delete:
            raise ValueError("trying to delete serializer.")

        self._sent_object_ids.difference_update(ids_to_delete)

        await self._connector_external.send(instruction)
@@ -0,0 +1,64 @@
1
+ from typing import Optional, Set
2
+
3
+ from scaler.client.agent.future_manager import ClientFutureManager
4
+ from scaler.client.agent.mixins import ObjectManager, TaskManager
5
+ from scaler.io.mixins import AsyncConnector
6
+ from scaler.protocol.python.message import GraphTask, Task, TaskCancel, TaskCancelConfirm, TaskResult
7
+
8
+
9
class ClientTaskManager(TaskManager):
    """Keeps the set of task ids currently tracked by the client agent and relays task messages.

    Outgoing messages go through the external connector; incoming results and cancel
    confirmations are handed to the object manager and the future manager. All three
    collaborators are supplied through ``register()``.
    """

    def __init__(self):
        # Collaborators stay unset until register() is called.
        self._connector_external: Optional[AsyncConnector] = None
        self._object_manager: Optional[ObjectManager] = None
        self._future_manager: Optional[ClientFutureManager] = None

        # Ids of the tasks (and graph targets) this manager is still tracking.
        self._task_ids: Set[bytes] = set()

    def register(
        self, connector_external: AsyncConnector, object_manager: ObjectManager, future_manager: ClientFutureManager
    ):
        """Store the connector and the sibling managers this manager delegates to."""
        self._future_manager = future_manager
        self._object_manager = object_manager
        self._connector_external = connector_external

    async def on_new_task(self, task: Task):
        """Start tracking ``task`` and send it through the external connector."""
        self._task_ids.add(task.task_id)
        await self._connector_external.send(task)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        """Send a cancel request through the external connector, but only for a task that is still tracked."""
        # A cancel request can arrive for a task that already finished:
        # - The scheduler sends a TaskResult message to the client agent.
        # - The client, not yet aware the task finished, sends a TaskCancel message to the client agent.
        # - The client agent processes the TaskResult message and removes the task from self._task_ids.
        # - The client agent then processes the TaskCancel message that was queued before the TaskResult
        #   was processed; by then the task_id is no longer tracked, so the request is dropped here.
        if task_cancel.task_id in self._task_ids:
            await self._connector_external.send(task_cancel)

    async def on_new_graph_task(self, task: GraphTask):
        """Start tracking the graph task id and all of its target ids, then send the graph task."""
        self._task_ids.update({task.task_id}, task.targets)
        await self._connector_external.send(task)

    async def on_task_result(self, result: TaskResult):
        """Pass a task result to the object manager and, if the task is tracked, to the future manager."""
        # Every task result must reach the object manager, even when the task is no longer tracked
        # (e.g. because it got cancelled). Otherwise the result objects could be lost track of and
        # never properly cleared.
        self._object_manager.on_task_result(result)

        if result.task_id in self._task_ids:
            self._task_ids.remove(result.task_id)
            self._future_manager.on_task_result(result)

    async def on_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
        """Stop tracking a task whose cancel was confirmed and notify the future manager."""
        if task_cancel_confirm.task_id in self._task_ids:
            self._task_ids.remove(task_cancel_confirm.task_id)
            self._future_manager.on_task_cancel_confirm(task_cancel_confirm)
+ self._future_manager.on_task_cancel_confirm(task_cancel_confirm)