opengris-scaler 1.12.7__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic. Click here for more details.

Files changed (232) hide show
  1. opengris_scaler-1.12.7.dist-info/METADATA +729 -0
  2. opengris_scaler-1.12.7.dist-info/RECORD +232 -0
  3. opengris_scaler-1.12.7.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.7.dist-info/entry_points.txt +9 -0
  5. opengris_scaler-1.12.7.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.7.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.7.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-b787335c.1.0.so +0 -0
  9. opengris_scaler.libs/libkj-1-094aa318.1.0.so +0 -0
  10. scaler/CMakeLists.txt +11 -0
  11. scaler/__init__.py +14 -0
  12. scaler/about.py +5 -0
  13. scaler/client/__init__.py +0 -0
  14. scaler/client/agent/__init__.py +0 -0
  15. scaler/client/agent/client_agent.py +210 -0
  16. scaler/client/agent/disconnect_manager.py +27 -0
  17. scaler/client/agent/future_manager.py +112 -0
  18. scaler/client/agent/heartbeat_manager.py +74 -0
  19. scaler/client/agent/mixins.py +89 -0
  20. scaler/client/agent/object_manager.py +98 -0
  21. scaler/client/agent/task_manager.py +64 -0
  22. scaler/client/client.py +635 -0
  23. scaler/client/future.py +252 -0
  24. scaler/client/object_buffer.py +129 -0
  25. scaler/client/object_reference.py +25 -0
  26. scaler/client/serializer/__init__.py +0 -0
  27. scaler/client/serializer/default.py +16 -0
  28. scaler/client/serializer/mixins.py +38 -0
  29. scaler/cluster/__init__.py +0 -0
  30. scaler/cluster/cluster.py +115 -0
  31. scaler/cluster/combo.py +148 -0
  32. scaler/cluster/object_storage_server.py +45 -0
  33. scaler/cluster/scheduler.py +83 -0
  34. scaler/config/__init__.py +0 -0
  35. scaler/config/defaults.py +87 -0
  36. scaler/config/loader.py +95 -0
  37. scaler/config/mixins.py +15 -0
  38. scaler/config/section/__init__.py +0 -0
  39. scaler/config/section/cluster.py +56 -0
  40. scaler/config/section/native_worker_adapter.py +44 -0
  41. scaler/config/section/object_storage_server.py +7 -0
  42. scaler/config/section/scheduler.py +53 -0
  43. scaler/config/section/symphony_worker_adapter.py +47 -0
  44. scaler/config/section/top.py +13 -0
  45. scaler/config/section/webui.py +16 -0
  46. scaler/config/types/__init__.py +0 -0
  47. scaler/config/types/object_storage_server.py +45 -0
  48. scaler/config/types/worker.py +57 -0
  49. scaler/config/types/zmq.py +79 -0
  50. scaler/entry_points/__init__.py +0 -0
  51. scaler/entry_points/cluster.py +133 -0
  52. scaler/entry_points/object_storage_server.py +41 -0
  53. scaler/entry_points/scheduler.py +135 -0
  54. scaler/entry_points/top.py +286 -0
  55. scaler/entry_points/webui.py +26 -0
  56. scaler/entry_points/worker_adapter_native.py +137 -0
  57. scaler/entry_points/worker_adapter_symphony.py +102 -0
  58. scaler/io/__init__.py +0 -0
  59. scaler/io/async_binder.py +85 -0
  60. scaler/io/async_connector.py +95 -0
  61. scaler/io/async_object_storage_connector.py +185 -0
  62. scaler/io/mixins.py +154 -0
  63. scaler/io/sync_connector.py +68 -0
  64. scaler/io/sync_object_storage_connector.py +185 -0
  65. scaler/io/sync_subscriber.py +83 -0
  66. scaler/io/utility.py +31 -0
  67. scaler/io/ymq/CMakeLists.txt +98 -0
  68. scaler/io/ymq/__init__.py +0 -0
  69. scaler/io/ymq/_ymq.pyi +96 -0
  70. scaler/io/ymq/_ymq.so +0 -0
  71. scaler/io/ymq/bytes.h +114 -0
  72. scaler/io/ymq/common.h +29 -0
  73. scaler/io/ymq/configuration.h +60 -0
  74. scaler/io/ymq/epoll_context.cpp +185 -0
  75. scaler/io/ymq/epoll_context.h +85 -0
  76. scaler/io/ymq/error.h +132 -0
  77. scaler/io/ymq/event_loop.h +55 -0
  78. scaler/io/ymq/event_loop_thread.cpp +64 -0
  79. scaler/io/ymq/event_loop_thread.h +46 -0
  80. scaler/io/ymq/event_manager.h +81 -0
  81. scaler/io/ymq/file_descriptor.h +203 -0
  82. scaler/io/ymq/interruptive_concurrent_queue.h +169 -0
  83. scaler/io/ymq/io_context.cpp +98 -0
  84. scaler/io/ymq/io_context.h +44 -0
  85. scaler/io/ymq/io_socket.cpp +299 -0
  86. scaler/io/ymq/io_socket.h +121 -0
  87. scaler/io/ymq/iocp_context.cpp +102 -0
  88. scaler/io/ymq/iocp_context.h +83 -0
  89. scaler/io/ymq/logging.h +163 -0
  90. scaler/io/ymq/message.h +15 -0
  91. scaler/io/ymq/message_connection.h +16 -0
  92. scaler/io/ymq/message_connection_tcp.cpp +672 -0
  93. scaler/io/ymq/message_connection_tcp.h +96 -0
  94. scaler/io/ymq/network_utils.h +179 -0
  95. scaler/io/ymq/pymod_ymq/bytes.h +113 -0
  96. scaler/io/ymq/pymod_ymq/exception.h +124 -0
  97. scaler/io/ymq/pymod_ymq/gil.h +15 -0
  98. scaler/io/ymq/pymod_ymq/io_context.h +166 -0
  99. scaler/io/ymq/pymod_ymq/io_socket.h +285 -0
  100. scaler/io/ymq/pymod_ymq/message.h +99 -0
  101. scaler/io/ymq/pymod_ymq/python.h +153 -0
  102. scaler/io/ymq/pymod_ymq/ymq.cpp +23 -0
  103. scaler/io/ymq/pymod_ymq/ymq.h +357 -0
  104. scaler/io/ymq/readme.md +114 -0
  105. scaler/io/ymq/simple_interface.cpp +80 -0
  106. scaler/io/ymq/simple_interface.h +24 -0
  107. scaler/io/ymq/tcp_client.cpp +367 -0
  108. scaler/io/ymq/tcp_client.h +75 -0
  109. scaler/io/ymq/tcp_operations.h +41 -0
  110. scaler/io/ymq/tcp_server.cpp +410 -0
  111. scaler/io/ymq/tcp_server.h +79 -0
  112. scaler/io/ymq/third_party/concurrentqueue.h +3747 -0
  113. scaler/io/ymq/timed_queue.h +272 -0
  114. scaler/io/ymq/timestamp.h +102 -0
  115. scaler/io/ymq/typedefs.h +20 -0
  116. scaler/io/ymq/utils.h +34 -0
  117. scaler/io/ymq/ymq.py +130 -0
  118. scaler/object_storage/CMakeLists.txt +50 -0
  119. scaler/object_storage/__init__.py +0 -0
  120. scaler/object_storage/constants.h +11 -0
  121. scaler/object_storage/defs.h +14 -0
  122. scaler/object_storage/io_helper.cpp +44 -0
  123. scaler/object_storage/io_helper.h +9 -0
  124. scaler/object_storage/message.cpp +56 -0
  125. scaler/object_storage/message.h +130 -0
  126. scaler/object_storage/object_manager.cpp +126 -0
  127. scaler/object_storage/object_manager.h +52 -0
  128. scaler/object_storage/object_storage_server.cpp +359 -0
  129. scaler/object_storage/object_storage_server.h +126 -0
  130. scaler/object_storage/object_storage_server.so +0 -0
  131. scaler/object_storage/pymod_object_storage_server.cpp +104 -0
  132. scaler/protocol/__init__.py +0 -0
  133. scaler/protocol/capnp/__init__.py +0 -0
  134. scaler/protocol/capnp/_python.py +6 -0
  135. scaler/protocol/capnp/common.capnp +63 -0
  136. scaler/protocol/capnp/message.capnp +216 -0
  137. scaler/protocol/capnp/object_storage.capnp +52 -0
  138. scaler/protocol/capnp/status.capnp +73 -0
  139. scaler/protocol/introduction.md +105 -0
  140. scaler/protocol/python/__init__.py +0 -0
  141. scaler/protocol/python/common.py +135 -0
  142. scaler/protocol/python/message.py +726 -0
  143. scaler/protocol/python/mixins.py +13 -0
  144. scaler/protocol/python/object_storage.py +118 -0
  145. scaler/protocol/python/status.py +279 -0
  146. scaler/protocol/worker.md +228 -0
  147. scaler/scheduler/__init__.py +0 -0
  148. scaler/scheduler/allocate_policy/__init__.py +0 -0
  149. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  150. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  151. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  152. scaler/scheduler/allocate_policy/mixins.py +55 -0
  153. scaler/scheduler/controllers/__init__.py +0 -0
  154. scaler/scheduler/controllers/balance_controller.py +65 -0
  155. scaler/scheduler/controllers/client_controller.py +131 -0
  156. scaler/scheduler/controllers/config_controller.py +31 -0
  157. scaler/scheduler/controllers/graph_controller.py +424 -0
  158. scaler/scheduler/controllers/information_controller.py +81 -0
  159. scaler/scheduler/controllers/mixins.py +201 -0
  160. scaler/scheduler/controllers/object_controller.py +147 -0
  161. scaler/scheduler/controllers/scaling_controller.py +86 -0
  162. scaler/scheduler/controllers/task_controller.py +373 -0
  163. scaler/scheduler/controllers/worker_controller.py +168 -0
  164. scaler/scheduler/object_usage/__init__.py +0 -0
  165. scaler/scheduler/object_usage/object_tracker.py +131 -0
  166. scaler/scheduler/scheduler.py +253 -0
  167. scaler/scheduler/task/__init__.py +0 -0
  168. scaler/scheduler/task/task_state_machine.py +92 -0
  169. scaler/scheduler/task/task_state_manager.py +61 -0
  170. scaler/ui/__init__.py +0 -0
  171. scaler/ui/constants.py +9 -0
  172. scaler/ui/live_display.py +118 -0
  173. scaler/ui/memory_window.py +146 -0
  174. scaler/ui/setting_page.py +47 -0
  175. scaler/ui/task_graph.py +370 -0
  176. scaler/ui/task_log.py +83 -0
  177. scaler/ui/utility.py +35 -0
  178. scaler/ui/webui.py +125 -0
  179. scaler/ui/worker_processors.py +85 -0
  180. scaler/utility/__init__.py +0 -0
  181. scaler/utility/debug.py +19 -0
  182. scaler/utility/event_list.py +63 -0
  183. scaler/utility/event_loop.py +58 -0
  184. scaler/utility/exceptions.py +42 -0
  185. scaler/utility/formatter.py +44 -0
  186. scaler/utility/graph/__init__.py +0 -0
  187. scaler/utility/graph/optimization.py +27 -0
  188. scaler/utility/graph/topological_sorter.py +11 -0
  189. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  190. scaler/utility/identifiers.py +105 -0
  191. scaler/utility/logging/__init__.py +0 -0
  192. scaler/utility/logging/decorators.py +25 -0
  193. scaler/utility/logging/scoped_logger.py +33 -0
  194. scaler/utility/logging/utility.py +183 -0
  195. scaler/utility/many_to_many_dict.py +123 -0
  196. scaler/utility/metadata/__init__.py +0 -0
  197. scaler/utility/metadata/profile_result.py +31 -0
  198. scaler/utility/metadata/task_flags.py +30 -0
  199. scaler/utility/mixins.py +13 -0
  200. scaler/utility/network_util.py +7 -0
  201. scaler/utility/one_to_many_dict.py +72 -0
  202. scaler/utility/queues/__init__.py +0 -0
  203. scaler/utility/queues/async_indexed_queue.py +37 -0
  204. scaler/utility/queues/async_priority_queue.py +70 -0
  205. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  206. scaler/utility/queues/indexed_queue.py +114 -0
  207. scaler/utility/serialization.py +9 -0
  208. scaler/version.txt +1 -0
  209. scaler/worker/__init__.py +0 -0
  210. scaler/worker/agent/__init__.py +0 -0
  211. scaler/worker/agent/heartbeat_manager.py +107 -0
  212. scaler/worker/agent/mixins.py +137 -0
  213. scaler/worker/agent/processor/__init__.py +0 -0
  214. scaler/worker/agent/processor/object_cache.py +107 -0
  215. scaler/worker/agent/processor/processor.py +279 -0
  216. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  217. scaler/worker/agent/processor_holder.py +145 -0
  218. scaler/worker/agent/processor_manager.py +365 -0
  219. scaler/worker/agent/profiling_manager.py +109 -0
  220. scaler/worker/agent/task_manager.py +150 -0
  221. scaler/worker/agent/timeout_manager.py +19 -0
  222. scaler/worker/preload.py +84 -0
  223. scaler/worker/worker.py +264 -0
  224. scaler/worker_adapter/__init__.py +0 -0
  225. scaler/worker_adapter/native.py +154 -0
  226. scaler/worker_adapter/symphony/__init__.py +0 -0
  227. scaler/worker_adapter/symphony/callback.py +45 -0
  228. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  229. scaler/worker_adapter/symphony/message.py +24 -0
  230. scaler/worker_adapter/symphony/task_manager.py +288 -0
  231. scaler/worker_adapter/symphony/worker.py +205 -0
  232. scaler/worker_adapter/symphony/worker_adapter.py +142 -0
@@ -0,0 +1,288 @@
1
+ import asyncio
2
+ import logging
3
+ from concurrent.futures import Future
4
+ from typing import Dict, Optional, Set, cast
5
+
6
+ import cloudpickle
7
+ from bidict import bidict
8
+
9
+ from scaler import Serializer
10
+ from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
11
+ from scaler.protocol.python.common import ObjectMetadata, ObjectStorageAddress, TaskCancelConfirmType, TaskResultType
12
+ from scaler.protocol.python.message import ObjectInstruction, Task, TaskCancel, TaskCancelConfirm, TaskResult
13
+ from scaler.utility.identifiers import ObjectID, TaskID
14
+ from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
15
+ from scaler.utility.mixins import Looper
16
+ from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
17
+ from scaler.utility.serialization import serialize_failure
18
+ from scaler.worker.agent.mixins import HeartbeatManager, TaskManager
19
+ from scaler.worker_adapter.symphony.callback import SessionCallback
20
+ from scaler.worker_adapter.symphony.message import SoamMessage
21
+
22
+ try:
23
+ import soamapi
24
+ except ImportError:
25
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
26
+
27
+
28
class SymphonyTaskManager(Looper, TaskManager):
    """Task manager that submits Scaler tasks to an IBM Spectrum Symphony (SOAM) session.

    New tasks are either queued by priority or, when they outrank every task currently holding the executor
    semaphore, submitted immediately. Two externally driven loops do the work: ``process_task`` dequeues and
    submits tasks, ``resolve_tasks`` collects finished futures and publishes their results.
    """

    def __init__(self, base_concurrency: int, service_name: str):
        """Set up the bookkeeping structures and open the Symphony connection and session.

        :param base_concurrency: initial value of the executor semaphore; must be a positive integer.
        :param service_name: name passed to ``soamapi.connect``.
        :raises ValueError: if ``base_concurrency`` is not a positive integer.
        """
        # Reject anything that is not a positive int here, so a bad value fails with a clear message instead of
        # failing later inside asyncio.Semaphore.
        if not isinstance(base_concurrency, int) or base_concurrency <= 0:
            raise ValueError(f"base_concurrency must be a positive integer, got {base_concurrency}")

        self._base_concurrency = base_concurrency
        self._service_name = service_name

        self._executor_semaphore = asyncio.Semaphore(value=self._base_concurrency)

        self._task_id_to_task: Dict[TaskID, Task] = dict()
        self._task_id_to_future: bidict[TaskID, asyncio.Future] = bidict()

        self._serializers: Dict[bytes, Serializer] = dict()

        self._queued_task_id_queue = AsyncSortedPriorityQueue()
        self._queued_task_ids: Set[TaskID] = set()

        self._acquiring_task_ids: Set[TaskID] = set()  # tasks contesting the semaphore
        self._processing_task_ids: Set[TaskID] = set()
        self._canceled_task_ids: Set[TaskID] = set()

        self._storage_address: Optional[ObjectStorageAddress] = None

        # collaborators, injected later through register()
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._heartbeat_manager: Optional[HeartbeatManager] = None

        # SOAM specific code
        soamapi.initialize()

        self._session_callback = SessionCallback()

        self._ibm_soam_connection = soamapi.connect(
            self._service_name, soamapi.DefaultSecurityCallback("Guest", "Guest")
        )
        logging.info(f"established IBM Spectrum Symphony connection {self._ibm_soam_connection.get_id()}")

        ibm_soam_session_attr = soamapi.SessionCreationAttributes()
        ibm_soam_session_attr.set_session_type("RecoverableAllHistoricalData")
        ibm_soam_session_attr.set_session_name("ScalerSession")
        ibm_soam_session_attr.set_session_flags(soamapi.SessionFlags.PARTIAL_ASYNC)
        ibm_soam_session_attr.set_session_callback(self._session_callback)
        self._ibm_soam_session = self._ibm_soam_connection.create_session(ibm_soam_session_attr)
        logging.info(f"established IBM Spectrum Symphony session {self._ibm_soam_session.get_id()}")

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        heartbeat_manager: HeartbeatManager,
    ):
        """Store the connectors and the heartbeat manager this task manager works with."""
        self._connector_external = connector_external
        self._connector_storage = connector_storage
        self._heartbeat_manager = heartbeat_manager

    async def routine(self):  # SymphonyTaskManager has two loops
        """Do nothing; the actual work happens in ``process_task`` and ``resolve_tasks``."""
        pass

    async def on_object_instruction(self, instruction: ObjectInstruction):
        """Handle an object instruction; only ``Delete`` is supported, anything else is logged as an error."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            for object_id in instruction.object_metadata.object_ids:
                self._serializers.pop(object_id, None)  # we only cache serializers

            return

        logging.error(f"worker received unknown object instruction type {instruction=}")

    async def on_task_new(self, task: Task):
        """Accept a new task: submit it immediately if it outranks all acquiring tasks, otherwise queue it.

        :raises ValueError: if the task carries a negative priority.
        """
        task_priority = self.__get_task_priority(task)

        # if semaphore is locked, check if task is higher priority than all acquired tasks
        # if so, bypass acquiring and execute the task immediately
        if self._executor_semaphore.locked():
            for acquired_task_id in self._acquiring_task_ids:
                acquired_task = self._task_id_to_task[acquired_task_id]
                acquired_task_priority = self.__get_task_priority(acquired_task)
                if task_priority <= acquired_task_priority:
                    break
            else:
                # The loop finished without a break: no acquiring task has an equal or higher priority.
                # The task is not added to _acquiring_task_ids, so resolving it will not release the semaphore.
                self._task_id_to_task[task.task_id] = task
                self._processing_task_ids.add(task.task_id)
                self._task_id_to_future[task.task_id] = await self.__execute_task(task)
                return

        self._task_id_to_task[task.task_id] = task
        # priorities are negated before being put on the sorted queue
        self._queued_task_id_queue.put_nowait((-task_priority, task.task_id))
        self._queued_task_ids.add(task.task_id)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        """Cancel a queued or processing task and send the matching ``TaskCancelConfirm``.

        Unknown tasks are answered with ``CancelNotFound``; processing tasks are only canceled when the
        ``force`` flag is set, otherwise ``CancelFailed`` is sent.
        """
        task_queued = task_cancel.task_id in self._queued_task_ids
        task_processing = task_cancel.task_id in self._processing_task_ids

        if not task_queued and not task_processing:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
                )
            )
            return

        if task_processing and not task_cancel.flags.force:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
                )
            )
            return

        if task_queued:
            self._queued_task_ids.remove(task_cancel.task_id)
            self._queued_task_id_queue.remove(task_cancel.task_id)

            # task can be discarded because task was never submitted
            self._task_id_to_task.pop(task_cancel.task_id)

        if task_processing:
            future = self._task_id_to_future[task_cancel.task_id]
            future.cancel()

            # regardless of the future being canceled, the task is considered canceled and cleanup will occur later
            self._processing_task_ids.remove(task_cancel.task_id)
            self._canceled_task_ids.add(task_cancel.task_id)

        result = TaskCancelConfirm.new_msg(
            task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled
        )
        await self._connector_external.send(result)

    async def on_task_result(self, result: TaskResult):
        """Drop the task from the local bookkeeping and forward ``result`` through the external connector.

        :raises KeyError: if the task is not currently tracked as processing.
        """
        if result.task_id in self._queued_task_ids:
            self._queued_task_ids.remove(result.task_id)
            self._queued_task_id_queue.remove(result.task_id)

        self._processing_task_ids.remove(result.task_id)
        self._task_id_to_task.pop(result.task_id)

        await self._connector_external.send(result)

    def get_queued_size(self):
        """Return the number of entries in the queued-task priority queue."""
        return self._queued_task_id_queue.qsize()

    def can_accept_task(self):
        """Return True while the executor semaphore is not locked."""
        return not self._executor_semaphore.locked()

    async def resolve_tasks(self):
        """Wait for at least one submitted task to finish, then publish results and clean up.

        For each finished future of a processing task, the result (or the serialized failure) is stored in the
        object storage and announced with an ``ObjectInstruction`` followed by a ``TaskResult``. Futures of
        canceled tasks are only cleaned up.

        :raises ValueError: if a finished future belongs to a task that is neither processing nor canceled.
        """
        if not self._task_id_to_future:
            return

        done, _ = await asyncio.wait(self._task_id_to_future.values(), return_when=asyncio.FIRST_COMPLETED)
        for future in done:
            task_id = self._task_id_to_future.inv.pop(future)
            task = self._task_id_to_task[task_id]

            if task_id in self._processing_task_ids:
                self._processing_task_ids.remove(task_id)

                if future.exception() is None:
                    serializer_id = ObjectID.generate_serializer_object_id(task.source)
                    serializer = self._serializers[serializer_id]
                    result_bytes = serializer.serialize(future.result())
                    result_type = TaskResultType.Success
                else:
                    result_bytes = serialize_failure(cast(Exception, future.exception()))
                    result_type = TaskResultType.Failed

                result_object_id = ObjectID.generate_object_id(task.source)

                # store the result first, then announce the object, then report the task result
                await self._connector_storage.set_object(result_object_id, result_bytes)
                await self._connector_external.send(
                    ObjectInstruction.new_msg(
                        ObjectInstruction.ObjectInstructionType.Create,
                        task.source,
                        ObjectMetadata.new_msg(
                            object_ids=(result_object_id,),
                            object_types=(ObjectMetadata.ObjectContentType.Object,),
                            object_names=(f"<res {result_object_id.hex()[:6]}>".encode(),),
                        ),
                    )
                )

                await self._connector_external.send(
                    TaskResult.new_msg(task_id, result_type, metadata=b"", results=[bytes(result_object_id)])
                )

            elif task_id in self._canceled_task_ids:
                self._canceled_task_ids.remove(task_id)

            else:
                raise ValueError(f"task_id {task_id.hex()} not found in processing or canceled tasks")

            # only tasks that went through process_task hold a semaphore slot
            if task_id in self._acquiring_task_ids:
                self._acquiring_task_ids.remove(task_id)
                self._executor_semaphore.release()

            self._task_id_to_task.pop(task_id)

    async def process_task(self):
        """Acquire an executor slot, take the next task from the queue and submit it."""
        await self._executor_semaphore.acquire()

        _, task_id = await self._queued_task_id_queue.get()
        task = self._task_id_to_task[task_id]

        self._acquiring_task_ids.add(task_id)
        self._processing_task_ids.add(task_id)
        self._task_id_to_future[task.task_id] = await self.__execute_task(task)

    async def __execute_task(self, task: Task) -> asyncio.Future:
        """
        Fetch the task's serializer, function and arguments, submit them to the Symphony session and return an
        asyncio future for the outcome.

        This method is not very efficient because it does not let objects linger in the cache. Each time inputs
        are requested, all object data are requested.
        """
        serializer_id = ObjectID.generate_serializer_object_id(task.source)

        if serializer_id not in self._serializers:
            serializer_bytes = await self._connector_storage.get_object(serializer_id)
            # NOTE(review): cloudpickle.loads can execute arbitrary code; this relies on the object storage
            # content being trusted.
            serializer = cloudpickle.loads(serializer_bytes)
            self._serializers[serializer_id] = serializer
        else:
            serializer = self._serializers[serializer_id]

        # Fetches the function object and the argument objects concurrently

        get_tasks = [
            self._connector_storage.get_object(object_id)
            for object_id in [task.func_object_id, *(cast(ObjectID, arg) for arg in task.function_args)]
        ]

        function_bytes, *arg_bytes = await asyncio.gather(*get_tasks)

        function = serializer.deserialize(function_bytes)
        arg_objects = [serializer.deserialize(object_bytes) for object_bytes in arg_bytes]

        # SOAM specific code
        input_message = SoamMessage()
        input_message.set_payload(cloudpickle.dumps((function, *arg_objects)))

        task_attr = soamapi.TaskSubmissionAttributes()
        task_attr.set_task_input(input_message)

        with self._session_callback.get_callback_lock():
            symphony_task = self._ibm_soam_session.send_task_input(task_attr)

            future: Future = Future()
            future.set_running_or_notify_cancel()

            self._session_callback.submit_task(symphony_task.get_id(), future)

        return asyncio.wrap_future(future)

    @staticmethod
    def __get_task_priority(task: Task) -> int:
        """Return the priority stored in the task's flags.

        :raises ValueError: if the priority is negative.
        """
        priority = retrieve_task_flags_from_task(task).priority

        if priority < 0:
            raise ValueError(f"invalid task priority, must be positive or zero, got {priority}")

        return priority
@@ -0,0 +1,205 @@
1
+ import asyncio
2
+ import logging
3
+ import multiprocessing
4
+ import signal
5
+ from collections import deque
6
+ from typing import Dict, Optional
7
+
8
+ import zmq
9
+
10
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
11
+ from scaler.config.types.zmq import ZMQConfig
12
+ from scaler.io.async_connector import ZMQAsyncConnector
13
+ from scaler.io.async_object_storage_connector import PyAsyncObjectStorageConnector
14
+ from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
15
+ from scaler.protocol.python.message import (
16
+ ClientDisconnect,
17
+ DisconnectRequest,
18
+ ObjectInstruction,
19
+ Task,
20
+ TaskCancel,
21
+ WorkerHeartbeatEcho,
22
+ )
23
+ from scaler.protocol.python.mixins import Message
24
+ from scaler.utility.event_loop import create_async_loop_routine, register_event_loop
25
+ from scaler.utility.exceptions import ClientShutdownException
26
+ from scaler.utility.identifiers import WorkerID
27
+ from scaler.utility.logging.utility import setup_logger
28
+ from scaler.worker.agent.timeout_manager import VanillaTimeoutManager
29
+ from scaler.worker_adapter.symphony.heartbeat_manager import SymphonyHeartbeatManager
30
+ from scaler.worker_adapter.symphony.task_manager import SymphonyTaskManager
31
+
32
+
33
class SymphonyWorker(multiprocessing.get_context("spawn").Process):  # type: ignore
    """
    Worker process that can run several tasks concurrently.
    The task execution logic mostly lives in SymphonyTaskManager; this class wires the connectors and managers
    together and routes incoming messages to them.
    """

    def __init__(
        self,
        name: str,
        address: ZMQConfig,
        storage_address: Optional[ObjectStorageConfig],
        service_name: str,
        capabilities: Dict[str, int],
        base_concurrency: int,
        heartbeat_interval_seconds: int,
        death_timeout_seconds: int,
        task_queue_size: int,
        io_threads: int,
        event_loop: str,
    ):
        """Record the configuration; connectors and managers are only created once the process runs."""
        multiprocessing.Process.__init__(self, name="Agent")

        # NOTE(review): multiprocessing.Process keeps its name in _name, so this assignment replaces the
        # "Agent" name given above — confirm this is intended.
        self._name = name
        self._ident = WorkerID.generate_worker_id(name)  # _identity is internal to multiprocessing.Process

        # connection settings
        self._address = address
        self._storage_address = storage_address
        self._io_threads = io_threads
        self._event_loop = event_loop

        # task execution settings
        self._service_name = service_name
        self._base_concurrency = base_concurrency
        self._capabilities = capabilities
        self._task_queue_size = task_queue_size

        # liveness settings
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._death_timeout_seconds = death_timeout_seconds

        # runtime objects, populated by __initialize()
        self._context: Optional[zmq.asyncio.Context] = None
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._task_manager: Optional[SymphonyTaskManager] = None
        self._heartbeat_manager: Optional[SymphonyHeartbeatManager] = None

        # Sometimes the first message received is not a heartbeat echo, so we need to backoff processing other
        # tasks until we receive the first heartbeat echo.
        self._heartbeat_received: bool = False
        self._backoff_message_queue: deque = deque()

    @property
    def identity(self) -> WorkerID:
        """Worker identifier generated from the worker name."""
        return self._ident

    def run(self) -> None:
        """Process entry point: build everything, then run the event loop until the main task ends."""
        self.__initialize()
        self.__run_forever()

    def __initialize(self):
        """Create the connectors and managers, register them with each other and schedule the main task."""
        setup_logger()
        register_event_loop(self._event_loop)

        self._context = zmq.asyncio.Context()
        self._connector_external = ZMQAsyncConnector(
            context=self._context,
            name=self.name,
            socket_type=zmq.DEALER,
            address=self._address,
            bind_or_connect="connect",
            callback=self.__on_receive_external,
            identity=self._ident,
        )

        self._connector_storage = PyAsyncObjectStorageConnector()

        self._heartbeat_manager = SymphonyHeartbeatManager(
            storage_address=self._storage_address,
            capabilities=self._capabilities,
            task_queue_size=self._task_queue_size,
        )
        self._task_manager = SymphonyTaskManager(
            base_concurrency=self._base_concurrency, service_name=self._service_name
        )
        self._timeout_manager = VanillaTimeoutManager(death_timeout_seconds=self._death_timeout_seconds)

        # let the managers know about the connectors and about each other
        self._heartbeat_manager.register(
            connector_external=self._connector_external,
            connector_storage=self._connector_storage,
            worker_task_manager=self._task_manager,
            timeout_manager=self._timeout_manager,
        )
        self._task_manager.register(
            connector_external=self._connector_external,
            connector_storage=self._connector_storage,
            heartbeat_manager=self._heartbeat_manager,
        )

        self._loop = asyncio.get_event_loop()
        self.__register_signal()
        self._task = self._loop.create_task(self.__get_loops())

    async def __on_receive_external(self, message: Message):
        """Route a message from the external connector to the responsible manager.

        Until the first heartbeat echo arrives, every other message is parked in the backoff queue; the first
        echo replays the parked messages in arrival order.

        :raises ClientShutdownException: on a ClientDisconnect of type Shutdown.
        :raises TypeError: for message types this worker does not handle.
        """
        is_heartbeat_echo = isinstance(message, WorkerHeartbeatEcho)

        if not (self._heartbeat_received or is_heartbeat_echo):
            self._backoff_message_queue.append(message)
            return

        if is_heartbeat_echo:
            await self._heartbeat_manager.on_heartbeat_echo(message)
            self._heartbeat_received = True

            while self._backoff_message_queue:
                await self.__on_receive_external(self._backoff_message_queue.popleft())

            return

        if isinstance(message, Task):
            await self._task_manager.on_task_new(message)
        elif isinstance(message, TaskCancel):
            await self._task_manager.on_cancel_task(message)
        elif isinstance(message, ObjectInstruction):
            await self._task_manager.on_object_instruction(message)
        elif isinstance(message, ClientDisconnect):
            if message.disconnect_type == ClientDisconnect.DisconnectType.Shutdown:
                raise ClientShutdownException("received client shutdown, quitting")
            logging.error(f"Worker received invalid ClientDisconnect type, ignoring {message=}")
        else:
            raise TypeError(f"Unknown {message=}")

    async def __get_loops(self):
        """Run all periodic routines until one of them stops, then send a disconnect request and clean up."""
        storage_address = self._storage_address
        if storage_address is not None:
            # With a manually set storage address, immediately connect to the object storage server.
            await self._connector_storage.connect(storage_address.host, storage_address.port)

        try:
            # (routine, interval in seconds) pairs, in the order they are handed to gather()
            routine_specs = (
                (self._connector_external.routine, 0),
                (self._connector_storage.routine, 0),
                (self._heartbeat_manager.routine, self._heartbeat_interval_seconds),
                (self._timeout_manager.routine, 1),
                (self._task_manager.routine, 0),
                (self._task_manager.process_task, 0),
                (self._task_manager.resolve_tasks, 0),
            )
            await asyncio.gather(
                *[create_async_loop_routine(routine, seconds) for routine, seconds in routine_specs]
            )
        except asyncio.CancelledError:
            pass
        except (ClientShutdownException, TimeoutError) as e:
            logging.info(f"{self.identity!r}: {str(e)}")
        except Exception as e:
            logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")

        await self._connector_external.send(DisconnectRequest.new_msg(self.identity))

        self._connector_external.destroy()
        logging.info(f"{self.identity!r}: quit")

    def __run_forever(self):
        """Block on the event loop until the main task completes."""
        self._loop.run_until_complete(self._task)

    def __register_signal(self):
        """Cancel the main task when SIGINT is received."""
        self._loop.add_signal_handler(signal.SIGINT, self.__destroy)

    def __destroy(self):
        """Request shutdown by cancelling the main task."""
        self._task.cancel()
@@ -0,0 +1,142 @@
1
+ import os
2
+ import signal
3
+ import uuid
4
+ from typing import Dict, Optional, Tuple
5
+
6
+ from aiohttp import web
7
+ from aiohttp.web_request import Request
8
+
9
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
10
+ from scaler.config.types.zmq import ZMQConfig
11
+ from scaler.utility.identifiers import WorkerID
12
+ from scaler.worker_adapter.symphony.worker import SymphonyWorker
13
+
14
# Alias for worker group identifiers, which are plain bytes (the adapter in this
# module builds them by encoding a "symphony-<hex>" string).
WorkerGroupID = bytes
15
+
16
+
17
class CapacityExceededError(Exception):
    """Raised when another worker group is requested but the adapter already runs one."""
19
+
20
+
21
class WorkerGroupNotFoundError(Exception):
    """Raised when an operation names a worker group ID the adapter does not know."""
23
+
24
+
25
class SymphonyWorkerAdapter:
    """Webhook-controlled adapter that manages a single Symphony worker group.

    The constructor only records configuration. A worker is created by
    ``start_worker_group`` and torn down by ``shutdown_worker_group``; both are
    also reachable over HTTP through the application built by ``create_app``.
    """

    def __init__(
        self,
        address: ZMQConfig,
        storage_address: Optional[ObjectStorageConfig],
        service_name: str,
        base_concurrency: int,
        capabilities: Dict[str, int],
        io_threads: int,
        task_queue_size: int,
        heartbeat_interval_seconds: int,
        death_timeout_seconds: int,
        event_loop: str,
        logging_paths: Tuple[str, ...],
        logging_level: str,
        logging_config_file: Optional[str],
    ):
        """Store the settings used later to build the ``SymphonyWorker``.

        Every argument up to and including ``event_loop`` is passed unchanged to the
        ``SymphonyWorker`` constructor by ``start_worker_group``. The three logging
        arguments are stored on the adapter but are not read anywhere in this class.
        """
        self._address = address
        self._storage_address = storage_address
        self._service_name = service_name
        self._base_concurrency = base_concurrency
        self._capabilities = capabilities
        self._io_threads = io_threads
        self._task_queue_size = task_queue_size
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._death_timeout_seconds = death_timeout_seconds
        self._event_loop = event_loop
        self._logging_paths = logging_paths
        self._logging_level = logging_level
        self._logging_config_file = logging_config_file

        # Although a worker group can contain multiple workers, in this Symphony adapter implementation,
        # there will be only one worker group which contains one Symphony worker.
        self._worker_groups: Dict[WorkerGroupID, Dict[WorkerID, SymphonyWorker]] = {}

    async def start_worker_group(self) -> WorkerGroupID:
        """Create and start the Symphony worker and register it as a new group.

        Returns:
            The new group ID: the bytes of ``"symphony-<32 hex chars>"``.

        Raises:
            CapacityExceededError: a worker group is already registered.
        """
        if self._worker_groups:
            raise CapacityExceededError("Symphony worker already started")

        worker = SymphonyWorker(
            name=uuid.uuid4().hex,
            address=self._address,
            storage_address=self._storage_address,
            service_name=self._service_name,
            base_concurrency=self._base_concurrency,
            capabilities=self._capabilities,
            io_threads=self._io_threads,
            task_queue_size=self._task_queue_size,
            heartbeat_interval_seconds=self._heartbeat_interval_seconds,
            death_timeout_seconds=self._death_timeout_seconds,
            event_loop=self._event_loop,
        )

        worker.start()
        worker_group_id = f"symphony-{uuid.uuid4().hex}".encode()
        self._worker_groups[worker_group_id] = {worker.identity: worker}
        return worker_group_id

    async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """Send SIGINT to every worker of the group, join it, and forget the group.

        Raises:
            WorkerGroupNotFoundError: ``worker_group_id`` is not registered.
        """
        if worker_group_id not in self._worker_groups:
            raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")

        for worker in self._worker_groups[worker_group_id].values():
            try:
                os.kill(worker.pid, signal.SIGINT)
            except ProcessLookupError:
                # The process no longer exists, so there is nothing to signal. Keep
                # going so the group is still removed; otherwise it would stay
                # registered and block every later start_worker_group call.
                pass
            # NOTE(review): join() is called synchronously inside a coroutine; if it
            # waits for the process to exit, the event loop is stalled meanwhile -- confirm.
            worker.join()

        self._worker_groups.pop(worker_group_id)

    @staticmethod
    def _error_response(message: str, status: int):
        """Build the JSON error payload ``{"error": message}`` with the given HTTP status."""
        return web.json_response({"error": message}, status=status)

    async def _handle_start_worker_group(self):
        """Run ``start_worker_group`` and translate its outcome into an HTTP response."""
        try:
            worker_group_id = await self.start_worker_group()
        except CapacityExceededError as e:
            return self._error_response(str(e), web.HTTPTooManyRequests.status_code)
        except Exception as e:
            return self._error_response(str(e), web.HTTPInternalServerError.status_code)

        return web.json_response(
            {
                "status": "Worker group started",
                "worker_group_id": worker_group_id.decode(),
                "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id]],
            },
            status=web.HTTPOk.status_code,
        )

    async def _handle_shutdown_worker_group(self, request_json: dict):
        """Validate the requested group ID, run ``shutdown_worker_group``, and build the response."""
        bad_request = web.HTTPBadRequest.status_code

        if "worker_group_id" not in request_json:
            return self._error_response("No worker_group_id specified", bad_request)

        raw_group_id = request_json["worker_group_id"]
        if not isinstance(raw_group_id, str):
            # JSON numbers, lists, null, ... have no encode(); reject them as a client error.
            return self._error_response("worker_group_id must be a string", bad_request)

        try:
            worker_group_id = raw_group_id.encode()
        except UnicodeEncodeError:
            # JSON may carry lone surrogates, which cannot be encoded as UTF-8.
            return self._error_response("worker_group_id must be a string", bad_request)

        try:
            await self.shutdown_worker_group(worker_group_id)
        except WorkerGroupNotFoundError as e:
            return self._error_response(str(e), web.HTTPNotFound.status_code)
        except Exception as e:
            return self._error_response(str(e), web.HTTPInternalServerError.status_code)

        return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)

    async def webhook_handler(self, request: Request):
        """Dispatch a JSON webhook request to the matching worker group operation.

        The body must be a JSON object with an ``"action"`` key:

        * ``"start_worker_group"`` -- 200 with the new group and worker IDs,
          429 when a group already exists, 500 on any other failure.
        * ``"shutdown_worker_group"`` -- additionally needs a string
          ``"worker_group_id"``; 200 on success, 404 for an unknown group,
          500 on any other failure.

        A body that is not valid JSON or not a JSON object, a missing or unknown
        action, and a missing or non-string ``worker_group_id`` all yield 400.
        """
        bad_request = web.HTTPBadRequest.status_code

        try:
            request_json = await request.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            return self._error_response("Invalid JSON body", bad_request)

        if not isinstance(request_json, dict):
            return self._error_response("Request body must be a JSON object", bad_request)

        if "action" not in request_json:
            return self._error_response("No action specified", bad_request)

        action = request_json["action"]

        if action == "start_worker_group":
            return await self._handle_start_worker_group()

        if action == "shutdown_worker_group":
            return await self._handle_shutdown_worker_group(request_json)

        return self._error_response("Unknown action", bad_request)

    def create_app(self):
        """Return an aiohttp application that routes ``POST /`` to ``webhook_handler``."""
        app = web.Application()
        app.router.add_post("/", self.webhook_handler)
        return app