opengris-scaler 1.12.7__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic; see the package's advisory details for more information.

Files changed (232) hide show
  1. opengris_scaler-1.12.7.dist-info/METADATA +729 -0
  2. opengris_scaler-1.12.7.dist-info/RECORD +232 -0
  3. opengris_scaler-1.12.7.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.7.dist-info/entry_points.txt +9 -0
  5. opengris_scaler-1.12.7.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.7.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.7.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-b787335c.1.0.so +0 -0
  9. opengris_scaler.libs/libkj-1-094aa318.1.0.so +0 -0
  10. scaler/CMakeLists.txt +11 -0
  11. scaler/__init__.py +14 -0
  12. scaler/about.py +5 -0
  13. scaler/client/__init__.py +0 -0
  14. scaler/client/agent/__init__.py +0 -0
  15. scaler/client/agent/client_agent.py +210 -0
  16. scaler/client/agent/disconnect_manager.py +27 -0
  17. scaler/client/agent/future_manager.py +112 -0
  18. scaler/client/agent/heartbeat_manager.py +74 -0
  19. scaler/client/agent/mixins.py +89 -0
  20. scaler/client/agent/object_manager.py +98 -0
  21. scaler/client/agent/task_manager.py +64 -0
  22. scaler/client/client.py +635 -0
  23. scaler/client/future.py +252 -0
  24. scaler/client/object_buffer.py +129 -0
  25. scaler/client/object_reference.py +25 -0
  26. scaler/client/serializer/__init__.py +0 -0
  27. scaler/client/serializer/default.py +16 -0
  28. scaler/client/serializer/mixins.py +38 -0
  29. scaler/cluster/__init__.py +0 -0
  30. scaler/cluster/cluster.py +115 -0
  31. scaler/cluster/combo.py +148 -0
  32. scaler/cluster/object_storage_server.py +45 -0
  33. scaler/cluster/scheduler.py +83 -0
  34. scaler/config/__init__.py +0 -0
  35. scaler/config/defaults.py +87 -0
  36. scaler/config/loader.py +95 -0
  37. scaler/config/mixins.py +15 -0
  38. scaler/config/section/__init__.py +0 -0
  39. scaler/config/section/cluster.py +56 -0
  40. scaler/config/section/native_worker_adapter.py +44 -0
  41. scaler/config/section/object_storage_server.py +7 -0
  42. scaler/config/section/scheduler.py +53 -0
  43. scaler/config/section/symphony_worker_adapter.py +47 -0
  44. scaler/config/section/top.py +13 -0
  45. scaler/config/section/webui.py +16 -0
  46. scaler/config/types/__init__.py +0 -0
  47. scaler/config/types/object_storage_server.py +45 -0
  48. scaler/config/types/worker.py +57 -0
  49. scaler/config/types/zmq.py +79 -0
  50. scaler/entry_points/__init__.py +0 -0
  51. scaler/entry_points/cluster.py +133 -0
  52. scaler/entry_points/object_storage_server.py +41 -0
  53. scaler/entry_points/scheduler.py +135 -0
  54. scaler/entry_points/top.py +286 -0
  55. scaler/entry_points/webui.py +26 -0
  56. scaler/entry_points/worker_adapter_native.py +137 -0
  57. scaler/entry_points/worker_adapter_symphony.py +102 -0
  58. scaler/io/__init__.py +0 -0
  59. scaler/io/async_binder.py +85 -0
  60. scaler/io/async_connector.py +95 -0
  61. scaler/io/async_object_storage_connector.py +185 -0
  62. scaler/io/mixins.py +154 -0
  63. scaler/io/sync_connector.py +68 -0
  64. scaler/io/sync_object_storage_connector.py +185 -0
  65. scaler/io/sync_subscriber.py +83 -0
  66. scaler/io/utility.py +31 -0
  67. scaler/io/ymq/CMakeLists.txt +98 -0
  68. scaler/io/ymq/__init__.py +0 -0
  69. scaler/io/ymq/_ymq.pyi +96 -0
  70. scaler/io/ymq/_ymq.so +0 -0
  71. scaler/io/ymq/bytes.h +114 -0
  72. scaler/io/ymq/common.h +29 -0
  73. scaler/io/ymq/configuration.h +60 -0
  74. scaler/io/ymq/epoll_context.cpp +185 -0
  75. scaler/io/ymq/epoll_context.h +85 -0
  76. scaler/io/ymq/error.h +132 -0
  77. scaler/io/ymq/event_loop.h +55 -0
  78. scaler/io/ymq/event_loop_thread.cpp +64 -0
  79. scaler/io/ymq/event_loop_thread.h +46 -0
  80. scaler/io/ymq/event_manager.h +81 -0
  81. scaler/io/ymq/file_descriptor.h +203 -0
  82. scaler/io/ymq/interruptive_concurrent_queue.h +169 -0
  83. scaler/io/ymq/io_context.cpp +98 -0
  84. scaler/io/ymq/io_context.h +44 -0
  85. scaler/io/ymq/io_socket.cpp +299 -0
  86. scaler/io/ymq/io_socket.h +121 -0
  87. scaler/io/ymq/iocp_context.cpp +102 -0
  88. scaler/io/ymq/iocp_context.h +83 -0
  89. scaler/io/ymq/logging.h +163 -0
  90. scaler/io/ymq/message.h +15 -0
  91. scaler/io/ymq/message_connection.h +16 -0
  92. scaler/io/ymq/message_connection_tcp.cpp +672 -0
  93. scaler/io/ymq/message_connection_tcp.h +96 -0
  94. scaler/io/ymq/network_utils.h +179 -0
  95. scaler/io/ymq/pymod_ymq/bytes.h +113 -0
  96. scaler/io/ymq/pymod_ymq/exception.h +124 -0
  97. scaler/io/ymq/pymod_ymq/gil.h +15 -0
  98. scaler/io/ymq/pymod_ymq/io_context.h +166 -0
  99. scaler/io/ymq/pymod_ymq/io_socket.h +285 -0
  100. scaler/io/ymq/pymod_ymq/message.h +99 -0
  101. scaler/io/ymq/pymod_ymq/python.h +153 -0
  102. scaler/io/ymq/pymod_ymq/ymq.cpp +23 -0
  103. scaler/io/ymq/pymod_ymq/ymq.h +357 -0
  104. scaler/io/ymq/readme.md +114 -0
  105. scaler/io/ymq/simple_interface.cpp +80 -0
  106. scaler/io/ymq/simple_interface.h +24 -0
  107. scaler/io/ymq/tcp_client.cpp +367 -0
  108. scaler/io/ymq/tcp_client.h +75 -0
  109. scaler/io/ymq/tcp_operations.h +41 -0
  110. scaler/io/ymq/tcp_server.cpp +410 -0
  111. scaler/io/ymq/tcp_server.h +79 -0
  112. scaler/io/ymq/third_party/concurrentqueue.h +3747 -0
  113. scaler/io/ymq/timed_queue.h +272 -0
  114. scaler/io/ymq/timestamp.h +102 -0
  115. scaler/io/ymq/typedefs.h +20 -0
  116. scaler/io/ymq/utils.h +34 -0
  117. scaler/io/ymq/ymq.py +130 -0
  118. scaler/object_storage/CMakeLists.txt +50 -0
  119. scaler/object_storage/__init__.py +0 -0
  120. scaler/object_storage/constants.h +11 -0
  121. scaler/object_storage/defs.h +14 -0
  122. scaler/object_storage/io_helper.cpp +44 -0
  123. scaler/object_storage/io_helper.h +9 -0
  124. scaler/object_storage/message.cpp +56 -0
  125. scaler/object_storage/message.h +130 -0
  126. scaler/object_storage/object_manager.cpp +126 -0
  127. scaler/object_storage/object_manager.h +52 -0
  128. scaler/object_storage/object_storage_server.cpp +359 -0
  129. scaler/object_storage/object_storage_server.h +126 -0
  130. scaler/object_storage/object_storage_server.so +0 -0
  131. scaler/object_storage/pymod_object_storage_server.cpp +104 -0
  132. scaler/protocol/__init__.py +0 -0
  133. scaler/protocol/capnp/__init__.py +0 -0
  134. scaler/protocol/capnp/_python.py +6 -0
  135. scaler/protocol/capnp/common.capnp +63 -0
  136. scaler/protocol/capnp/message.capnp +216 -0
  137. scaler/protocol/capnp/object_storage.capnp +52 -0
  138. scaler/protocol/capnp/status.capnp +73 -0
  139. scaler/protocol/introduction.md +105 -0
  140. scaler/protocol/python/__init__.py +0 -0
  141. scaler/protocol/python/common.py +135 -0
  142. scaler/protocol/python/message.py +726 -0
  143. scaler/protocol/python/mixins.py +13 -0
  144. scaler/protocol/python/object_storage.py +118 -0
  145. scaler/protocol/python/status.py +279 -0
  146. scaler/protocol/worker.md +228 -0
  147. scaler/scheduler/__init__.py +0 -0
  148. scaler/scheduler/allocate_policy/__init__.py +0 -0
  149. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  150. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  151. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  152. scaler/scheduler/allocate_policy/mixins.py +55 -0
  153. scaler/scheduler/controllers/__init__.py +0 -0
  154. scaler/scheduler/controllers/balance_controller.py +65 -0
  155. scaler/scheduler/controllers/client_controller.py +131 -0
  156. scaler/scheduler/controllers/config_controller.py +31 -0
  157. scaler/scheduler/controllers/graph_controller.py +424 -0
  158. scaler/scheduler/controllers/information_controller.py +81 -0
  159. scaler/scheduler/controllers/mixins.py +201 -0
  160. scaler/scheduler/controllers/object_controller.py +147 -0
  161. scaler/scheduler/controllers/scaling_controller.py +86 -0
  162. scaler/scheduler/controllers/task_controller.py +373 -0
  163. scaler/scheduler/controllers/worker_controller.py +168 -0
  164. scaler/scheduler/object_usage/__init__.py +0 -0
  165. scaler/scheduler/object_usage/object_tracker.py +131 -0
  166. scaler/scheduler/scheduler.py +253 -0
  167. scaler/scheduler/task/__init__.py +0 -0
  168. scaler/scheduler/task/task_state_machine.py +92 -0
  169. scaler/scheduler/task/task_state_manager.py +61 -0
  170. scaler/ui/__init__.py +0 -0
  171. scaler/ui/constants.py +9 -0
  172. scaler/ui/live_display.py +118 -0
  173. scaler/ui/memory_window.py +146 -0
  174. scaler/ui/setting_page.py +47 -0
  175. scaler/ui/task_graph.py +370 -0
  176. scaler/ui/task_log.py +83 -0
  177. scaler/ui/utility.py +35 -0
  178. scaler/ui/webui.py +125 -0
  179. scaler/ui/worker_processors.py +85 -0
  180. scaler/utility/__init__.py +0 -0
  181. scaler/utility/debug.py +19 -0
  182. scaler/utility/event_list.py +63 -0
  183. scaler/utility/event_loop.py +58 -0
  184. scaler/utility/exceptions.py +42 -0
  185. scaler/utility/formatter.py +44 -0
  186. scaler/utility/graph/__init__.py +0 -0
  187. scaler/utility/graph/optimization.py +27 -0
  188. scaler/utility/graph/topological_sorter.py +11 -0
  189. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  190. scaler/utility/identifiers.py +105 -0
  191. scaler/utility/logging/__init__.py +0 -0
  192. scaler/utility/logging/decorators.py +25 -0
  193. scaler/utility/logging/scoped_logger.py +33 -0
  194. scaler/utility/logging/utility.py +183 -0
  195. scaler/utility/many_to_many_dict.py +123 -0
  196. scaler/utility/metadata/__init__.py +0 -0
  197. scaler/utility/metadata/profile_result.py +31 -0
  198. scaler/utility/metadata/task_flags.py +30 -0
  199. scaler/utility/mixins.py +13 -0
  200. scaler/utility/network_util.py +7 -0
  201. scaler/utility/one_to_many_dict.py +72 -0
  202. scaler/utility/queues/__init__.py +0 -0
  203. scaler/utility/queues/async_indexed_queue.py +37 -0
  204. scaler/utility/queues/async_priority_queue.py +70 -0
  205. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  206. scaler/utility/queues/indexed_queue.py +114 -0
  207. scaler/utility/serialization.py +9 -0
  208. scaler/version.txt +1 -0
  209. scaler/worker/__init__.py +0 -0
  210. scaler/worker/agent/__init__.py +0 -0
  211. scaler/worker/agent/heartbeat_manager.py +107 -0
  212. scaler/worker/agent/mixins.py +137 -0
  213. scaler/worker/agent/processor/__init__.py +0 -0
  214. scaler/worker/agent/processor/object_cache.py +107 -0
  215. scaler/worker/agent/processor/processor.py +279 -0
  216. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  217. scaler/worker/agent/processor_holder.py +145 -0
  218. scaler/worker/agent/processor_manager.py +365 -0
  219. scaler/worker/agent/profiling_manager.py +109 -0
  220. scaler/worker/agent/task_manager.py +150 -0
  221. scaler/worker/agent/timeout_manager.py +19 -0
  222. scaler/worker/preload.py +84 -0
  223. scaler/worker/worker.py +264 -0
  224. scaler/worker_adapter/__init__.py +0 -0
  225. scaler/worker_adapter/native.py +154 -0
  226. scaler/worker_adapter/symphony/__init__.py +0 -0
  227. scaler/worker_adapter/symphony/callback.py +45 -0
  228. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  229. scaler/worker_adapter/symphony/message.py +24 -0
  230. scaler/worker_adapter/symphony/task_manager.py +288 -0
  231. scaler/worker_adapter/symphony/worker.py +205 -0
  232. scaler/worker_adapter/symphony/worker_adapter.py +142 -0
@@ -0,0 +1,365 @@
1
+ import asyncio
2
+ import logging
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import tblib.pickling_support
6
+
7
+ # from scaler.utility.logging.utility import setup_logger
8
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
9
+ from scaler.protocol.python.common import ObjectMetadata, TaskResultType
10
+ from scaler.protocol.python.message import ObjectInstruction, ProcessorInitialized, Task, TaskResult
11
+ from scaler.utility.exceptions import ProcessorDiedError
12
+ from scaler.utility.identifiers import ObjectID, ProcessorID, TaskID, WorkerID
13
+ from scaler.utility.metadata.profile_result import ProfileResult
14
+ from scaler.utility.serialization import serialize_failure
15
+ from scaler.config.types.zmq import ZMQConfig
16
+ from scaler.worker.agent.mixins import HeartbeatManager, ProcessorManager, ProfilingManager, TaskManager
17
+ from scaler.worker.agent.processor_holder import ProcessorHolder
18
+
19
+
20
class VanillaProcessorManager(ProcessorManager):
    """Manages the worker's processor processes.

    Responsible for spawning, suspending, resuming and killing processor processes, and for
    routing tasks, task results and object instructions between the worker agent and the
    processor that currently holds the task lock.
    """

    def __init__(
        self,
        identity: WorkerID,
        event_loop: str,
        address_internal: ZMQConfig,
        preload: Optional[str],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_processor_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        # allow exception tracebacks raised inside processors to be pickled back to clients
        tblib.pickling_support.install()

        self._identity = identity
        self._event_loop = event_loop
        self._preload = preload

        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._hard_processor_suspend = hard_processor_suspend
        self._logging_paths = logging_paths
        self._logging_level = logging_level

        # collaborators are injected later through register()
        self._heartbeat_manager: Optional[HeartbeatManager] = None
        self._task_manager: Optional[TaskManager] = None
        self._profiling_manager: Optional[ProfilingManager] = None
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None

        self._address_internal: ZMQConfig = address_internal

        self._current_holder: Optional[ProcessorHolder] = None
        # fix: the original annotation said Dict[bytes, ...] but every access keys by TaskID
        self._suspended_holders_by_task_id: Dict[TaskID, ProcessorHolder] = {}
        self._holders_by_processor_id: Dict[ProcessorID, ProcessorHolder] = {}

        # held whenever no processor is ready to accept a task; released in
        # on_processor_initialized() and on_task_result()
        self._can_accept_task_lock: asyncio.Lock = asyncio.Lock()

        self._binder_internal: Optional[AsyncBinder] = None

    def register(
        self,
        heartbeat_manager: HeartbeatManager,
        task_manager: TaskManager,
        profiling_manager: ProfilingManager,
        connector_external: AsyncConnector,
        binder_internal: AsyncBinder,
        connector_storage: AsyncObjectStorageConnector,
    ):
        """Injects the collaborating managers and connectors; must be called before initialize()."""
        self._heartbeat_manager = heartbeat_manager
        self._task_manager = task_manager
        self._profiling_manager = profiling_manager
        self._connector_external = connector_external
        self._binder_internal = binder_internal
        self._connector_storage = connector_storage

    async def initialize(self):
        """Waits for the object storage connection, then spawns the first processor."""
        await self._can_accept_task_lock.acquire()  # prevents any processor to accept task until initialized

        await self._connector_storage.wait_until_connected()

        self.__start_new_processor()  # we can start the processor now that we know the storage address.

    def can_accept_task(self) -> bool:
        """Returns True when a processor is ready to take a new or resumed task."""
        return not self._can_accept_task_lock.locked()

    async def wait_until_can_accept_task(self):
        """
        Makes sure a processor is ready to start processing a new or suspended task.

        Must be called before any call to `on_task()` or `on_task_resume()`.
        """

        await self._can_accept_task_lock.acquire()

    async def on_processor_initialized(self, processor_id: ProcessorID, processor_initialized: ProcessorInitialized):
        """Marks the current processor as initialized and releases the task lock."""
        assert self._current_holder is not None

        # a duplicated initialization message is ignored
        if self._current_holder.initialized():
            return

        self._holders_by_processor_id[processor_id] = self._current_holder
        self._current_holder.initialize(processor_id)

        self._can_accept_task_lock.release()

    async def on_task(self, task: Task) -> bool:
        """Assigns a new task to the current (idle, initialized) processor."""
        assert self._can_accept_task_lock.locked()
        assert self.current_processor_is_initialized()

        holder = self._current_holder

        assert holder.task() is None
        holder.set_task(task)

        self._profiling_manager.on_task_start(holder.pid(), task.task_id)

        await self._binder_internal.send(holder.processor_id(), task)

        return True

    async def on_cancel_task(self, task_id: TaskID) -> Optional[Task]:
        """Cancels a running or suspended task.

        Returns the canceled Task, or None when the task is unknown. Canceling the running
        task restarts the current processor; canceling a suspended one kills its processor.
        """
        assert self._current_holder is not None

        if self.current_task_id() == task_id:
            current_task = self.current_task()
            self.__restart_current_processor(f"cancel task_id={task_id.hex()}")
            return current_task

        if task_id in self._suspended_holders_by_task_id:
            suspended_holder = self._suspended_holders_by_task_id.pop(task_id)
            task = suspended_holder.task()
            self.__kill_processor(f"cancel suspended task_id={task_id.hex()}", suspended_holder)
            return task

        return None

    async def on_failing_processor(self, processor_id: ProcessorID, process_status: str):
        """Handles a processor's death: replaces it and, if it held a task, reports the task as failed."""
        assert self._current_holder is not None

        holder = self._holders_by_processor_id.get(processor_id)

        if holder is None:
            return

        task = holder.task()
        if task is not None:
            profile_result = self.__end_task(holder)  # profiling the task should happen before killing the processor
        else:
            profile_result = None

        reason = f"process died {process_status=}"
        if holder == self._current_holder:
            self.__restart_current_processor(reason)
        else:
            self.__kill_processor(reason, holder)

        if task is not None:
            source = task.source
            task_id = task.task_id

            # store a serialized ProcessorDiedError as the task's failure result object
            result_object_id = ObjectID.generate_object_id(source)
            result_object_bytes = serialize_failure(ProcessorDiedError(f"{process_status=}"))

            await self._connector_storage.set_object(result_object_id, result_object_bytes)
            await self._connector_external.send(
                ObjectInstruction.new_msg(
                    ObjectInstruction.ObjectInstructionType.Create,
                    source,
                    ObjectMetadata.new_msg((result_object_id,), (ObjectMetadata.ObjectContentType.Object,), (b"",)),
                )
            )

            await self._task_manager.on_task_result(
                TaskResult.new_msg(
                    task_id, TaskResultType.Failed, profile_result.serialize(), [bytes(result_object_id)]
                )
            )

    async def on_suspend_task(self, task_id: TaskID) -> bool:
        """Suspends the processor running `task_id` and spawns a fresh processor in its place."""
        assert self._current_holder is not None
        holder = self._current_holder

        current_task = holder.task()

        if current_task is None or current_task.task_id != task_id:
            return False

        holder.suspend()
        self._suspended_holders_by_task_id[task_id] = holder

        logging.info(f"{self._identity!r}: suspend Processor[{holder.pid()}]")

        self.__start_new_processor()

        return True

    def on_resume_task(self, task_id: TaskID) -> bool:
        """Resumes a suspended task by swapping its processor back in as the current one."""
        assert self._can_accept_task_lock.locked()
        assert self.current_processor_is_initialized()

        if self.current_task() is not None:
            return False

        suspended_holder = self._suspended_holders_by_task_id.pop(task_id, None)

        if suspended_holder is None:
            return False

        # the idle current processor is discarded in favor of the suspended one
        self.__kill_processor("replaced by suspended processor", self._current_holder)

        self._current_holder = suspended_holder
        suspended_holder.resume()

        logging.info(f"{self._identity!r}: resume Processor[{self._current_holder.pid()}]")

        return True

    async def on_task_result(self, processor_id: ProcessorID, task_result: TaskResult):
        """Forwards a task result (with profiling metadata) to the task manager."""
        assert self._current_holder is not None
        task_id = task_result.task_id

        if task_id == self.current_task_id():
            assert self._current_holder.processor_id() == processor_id

            profile_result = self.__end_task(self._current_holder)

            release_task_lock = True
        elif task_id in self._suspended_holders_by_task_id:
            # Receiving a task result from a suspended processor is possible as the message might have been queued while
            # we were suspending the process.

            holder = self._suspended_holders_by_task_id.pop(task_id)
            assert holder.processor_id() == processor_id

            profile_result = self.__end_task(holder)

            self.__kill_processor("task finished in suspended processor", holder)

            release_task_lock = False
        else:
            return

        await self._task_manager.on_task_result(
            TaskResult.new_msg(
                task_id=task_id,
                result_type=task_result.result_type,
                metadata=profile_result.serialize(),
                results=task_result.results,
            )
        )

        # task lock must be released after calling `TaskManager.on_task_result()`
        if release_task_lock:
            self._can_accept_task_lock.release()

    async def on_external_object_instruction(self, instruction: ObjectInstruction):
        """Broadcasts an object instruction from the scheduler to every initialized processor."""
        for processor_id in self._holders_by_processor_id.keys():
            await self._binder_internal.send(processor_id, instruction)

    async def on_internal_object_instruction(self, processor_id: ProcessorID, instruction: ObjectInstruction):
        """Relays an object instruction from a busy processor to the scheduler."""
        if not self.__processor_ready_to_process_object(processor_id):
            return

        await self._connector_external.send(instruction)

    def destroy(self, reason: str):
        """Destroys the external connector and kills every managed processor."""
        # fix: the original guarded on `self._connector_storage` while destroying
        # `self._connector_external`; guard the connector actually being destroyed.
        if self._connector_external is not None:
            self._connector_external.destroy()

        self.__kill_all_processors(reason)

    def current_processor_is_initialized(self) -> bool:
        """Returns True when a current processor exists and has finished initializing."""
        return self._current_holder is not None and self._current_holder.initialized()

    def current_task(self) -> Optional[Task]:
        """Returns the task running on the current processor, or None if idle/uninitialized."""
        if self._current_holder is None:  # worker is not yet initialized
            return None

        return self._current_holder.task()

    def current_task_id(self) -> Optional[TaskID]:
        """Returns the id of the task running on the current processor, or None if idle."""
        task = self.current_task()

        if task is None:
            return None
        else:
            return task.task_id

    def processors(self) -> List[ProcessorHolder]:
        """Returns the holders for all initialized processors (current and suspended)."""
        return list(self._holders_by_processor_id.values())

    def num_suspended_processors(self) -> int:
        """Returns the number of currently suspended processors."""
        return len(self._suspended_holders_by_task_id)

    def __start_new_processor(self):
        # spawns a fresh processor process and makes it the current holder
        storage_address = self._heartbeat_manager.get_storage_address()

        self._current_holder = ProcessorHolder(
            self._event_loop,
            self._address_internal,
            storage_address,
            self._preload,
            self._garbage_collect_interval_seconds,
            self._trim_memory_threshold_bytes,
            self._hard_processor_suspend,
            self._logging_paths,
            self._logging_level,
        )

        processor_pid = self._current_holder.pid()

        self._profiling_manager.on_process_start(processor_pid)

        logging.info(f"{self._identity!r}: start Processor[{processor_pid}]")

    def __kill_processor(self, reason: str, holder: ProcessorHolder):
        # kills one processor process and drops its bookkeeping
        processor_pid = holder.pid()

        self._profiling_manager.on_process_end(processor_pid)

        if holder.initialized():
            self._holders_by_processor_id.pop(holder.processor_id(), None)

        holder.kill()

        logging.info(f"{self._identity!r}: stop Processor[{processor_pid}], reason: {reason}")

    def __restart_current_processor(self, reason: str):
        # replaces the current processor with a brand new one
        assert self._current_holder is not None

        self.__kill_processor(reason, self._current_holder)
        self.__start_new_processor()

    def __kill_all_processors(self, reason: str):
        # kills the current and every suspended processor, then clears all bookkeeping
        if self._current_holder is not None:
            self.__kill_processor(reason, self._current_holder)
            self._current_holder = None

        for processor_holder in self._suspended_holders_by_task_id.values():
            self.__kill_processor(reason, processor_holder)

        self._suspended_holders_by_task_id = {}
        self._holders_by_processor_id = {}

    def __end_task(self, processor_holder: ProcessorHolder) -> ProfileResult:
        # finishes profiling for the holder's task and detaches the task from the holder
        profile_result = self._profiling_manager.on_task_end(processor_holder.pid(), processor_holder.task().task_id)
        processor_holder.set_task(None)

        return profile_result

    def __processor_ready_to_process_object(self, processor_id: ProcessorID) -> bool:
        # a processor may only emit object instructions while it is known and running a task
        holder = self._holders_by_processor_id.get(processor_id)

        if holder is None:
            return False

        assert holder.initialized()

        if holder.task() is None:
            return False

        # TODO: check if the objects belong to the task

        return True
@@ -0,0 +1,109 @@
1
+ import dataclasses
2
+ import logging
3
+ import time
4
+ from typing import Dict, Optional
5
+
6
+ import psutil
7
+
8
+ from scaler.utility.identifiers import TaskID
9
+ from scaler.utility.metadata.profile_result import ProfileResult
10
+ from scaler.utility.mixins import Looper
11
+ from scaler.worker.agent.mixins import ProfilingManager
12
+
13
+
14
@dataclasses.dataclass
class _ProcessProfiler:
    # psutil handle on the processor's OS process, used to sample CPU time and RSS.
    process: psutil.Process

    # task currently being profiled on this process, or None when the process is idle.
    current_task_id: Optional[TaskID] = None

    # per-task measurement baselines: populated by on_task_start(), read back by
    # on_task_end() to compute wall-time, CPU-time and peak-memory deltas.
    start_time: Optional[float] = None
    start_cpu_time: Optional[float] = None
    init_memory_rss: Optional[int] = None
    peak_memory_rss: Optional[int] = None
24
+
25
+
26
class VanillaProfilingManager(ProfilingManager, Looper):
    """Tracks per-processor resource usage for running tasks.

    For each registered processor (by OS pid), records wall time, CPU time and peak RSS
    of the task it is executing, sampling the process through psutil.
    """

    def __init__(self):
        # one profiler entry per registered processor, keyed by OS pid
        self._process_profiler_by_pid: Dict[int, _ProcessProfiler] = {}

    def on_process_start(self, pid: int):
        """Registers a newly started processor process.

        Raises:
            ValueError: if the pid is already registered.
        """
        if pid in self._process_profiler_by_pid:
            raise ValueError(f"process {pid=} is already registered.")

        self._process_profiler_by_pid[pid] = _ProcessProfiler(psutil.Process(pid))

    def on_process_end(self, pid: int):
        """Unregisters a processor process.

        Raises:
            ValueError: if the pid is not registered.
        """
        if pid not in self._process_profiler_by_pid:
            raise ValueError(f"process {pid=} is not registered.")

        self._process_profiler_by_pid.pop(pid)

    def on_task_start(self, pid: int, task_id: TaskID):
        """Snapshots the process' clocks and memory so `on_task_end()` can compute deltas.

        Raises:
            ValueError: if the pid is not registered.
        """
        process_profiler = self._process_profiler_by_pid.get(pid)

        if process_profiler is None:
            raise ValueError(f"process {pid=} is not registered.")

        process_profiler.current_task_id = task_id

        process = process_profiler.process

        process_profiler.start_time = self.__process_time()
        process_profiler.start_cpu_time = self.__process_cpu_time(process)
        process_profiler.init_memory_rss = self.__process_memory_rss(process)
        process_profiler.peak_memory_rss = process_profiler.init_memory_rss

    def on_task_end(self, pid: int, task_id: TaskID) -> ProfileResult:
        """Finishes profiling `task_id` on `pid` and returns the measured deltas.

        Raises:
            ValueError: if the pid is not registered, or task_id is not its current task.
        """
        process_profiler = self._process_profiler_by_pid.get(pid)

        if process_profiler is None:
            raise ValueError(f"process {pid=} is not registered.")

        if task_id != process_profiler.current_task_id:
            raise ValueError(f"task {task_id=!r} is not the current task task_id={process_profiler.current_task_id!r}.")

        assert process_profiler.start_time is not None
        # fix: this assert was missing although start_cpu_time is used in arithmetic below
        assert process_profiler.start_cpu_time is not None
        assert process_profiler.init_memory_rss is not None
        assert process_profiler.peak_memory_rss is not None

        process = process_profiler.process

        time_delta = self.__process_time() - process_profiler.start_time

        try:
            cpu_time_delta = self.__process_cpu_time(process) - process_profiler.start_cpu_time
        except psutil.ZombieProcess:
            # process exited but was not reaped yet; its CPU clock can no longer be read
            logging.warning(f"profiling zombie process: {pid=}")
            cpu_time_delta = 0

        memory_delta = process_profiler.peak_memory_rss - process_profiler.init_memory_rss

        # fix: also reset start_time/start_cpu_time (previously leaked into the next task)
        process_profiler.current_task_id = None
        process_profiler.start_time = None
        process_profiler.start_cpu_time = None
        process_profiler.init_memory_rss = None
        process_profiler.peak_memory_rss = None

        return ProfileResult(time_delta, memory_delta, cpu_time_delta)

    async def routine(self):
        """Periodically samples RSS of every busy processor to track its peak memory usage."""
        for process_profiler in self._process_profiler_by_pid.values():
            if process_profiler.current_task_id is not None:
                try:
                    process_profiler.peak_memory_rss = max(
                        process_profiler.peak_memory_rss, self.__process_memory_rss(process_profiler.process)
                    )
                except psutil.ZombieProcess:
                    logging.warning(f"profiling zombie process: pid={process_profiler.process.pid}")

    @staticmethod
    def __process_time() -> float:
        # monotonic clock: immune to wall-clock adjustments
        return time.monotonic()

    @staticmethod
    def __process_cpu_time(process: psutil.Process) -> float:
        # user + system CPU time consumed by the process
        cpu_times = process.cpu_times()
        return cpu_times.user + cpu_times.system

    @staticmethod
    def __process_memory_rss(process: psutil.Process) -> int:
        return process.memory_info().rss
@@ -0,0 +1,150 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from scaler.io.mixins import AsyncConnector
4
+ from scaler.protocol.python.common import TaskCancelConfirmType
5
+ from scaler.protocol.python.message import Task, TaskCancel, TaskCancelConfirm, TaskResult
6
+ from scaler.utility.identifiers import TaskID
7
+ from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
8
+ from scaler.utility.mixins import Looper
9
+ from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
10
+ from scaler.worker.agent.mixins import ProcessorManager, TaskManager
11
+
12
+ _SUSPENDED_TASKS_PRIORITY = 1
13
+ _QUEUED_TASKS_PRIORITY = 2
14
+
15
+
16
class VanillaTaskManager(Looper, TaskManager):
    """Receives tasks from the scheduler, queues them by priority, and feeds them one at a
    time to the processor manager; also handles cancellation, suspension and task results.
    """

    def __init__(self, task_timeout_seconds: int):
        self._task_timeout_seconds = task_timeout_seconds

        # tasks waiting for (or suspended from) a processor, keyed by task id
        self._queued_task_id_to_task: Dict[TaskID, Task] = dict()

        # Queued tasks are sorted first by task's priorities, then suspended tasks are prioritized over non yet started
        # tasks, finally the sorted queue ensure we execute the oldest tasks first.
        #
        # For example, if we receive these tasks in this order:
        # 1. Task(priority=0) [suspended]
        # 2. Task(priority=3) [suspended]
        # 3. Task(priority=3)
        # 4. Task(priority=0)
        #
        # We want to execute the tasks in this order: 2-3-1-4.
        self._queued_task_ids = AsyncSortedPriorityQueue()

        self._processing_task_ids: Set[TaskID] = set()  # Tasks associated with a processor, including suspended tasks

        # collaborators injected via register()
        self._connector_external: Optional[AsyncConnector] = None
        self._processor_manager: Optional[ProcessorManager] = None

    def register(self, connector: AsyncConnector, processor_manager: ProcessorManager):
        """Injects the scheduler connector and processor manager; must be called before use."""
        self._connector_external = connector
        self._processor_manager = processor_manager

    async def on_task_new(self, task: Task):
        """Queues a newly received task; may suspend the running task if the new one outranks it."""
        self.__enqueue_task(task, is_suspended=False)

        await self.__suspend_if_priority_is_higher(task)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        """Attempts to cancel a queued or processing task and confirms the outcome to the scheduler.

        Replies CancelNotFound for unknown tasks, CancelFailed for in-processing tasks without
        the force flag, and Canceled otherwise.
        """
        task_not_found = (
            task_cancel.task_id not in self._processing_task_ids
            and task_cancel.task_id not in self._queued_task_id_to_task
        )
        if task_not_found:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
                )
            )
            return

        if task_cancel.task_id in self._processing_task_ids and not task_cancel.flags.force:
            # ignore cancel task while in processing if is not force cancel
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
                )
            )
            return

        # A suspended task will be both processing AND queued

        if task_cancel.task_id in self._processing_task_ids:
            # if task is in processing
            self._processing_task_ids.remove(task_cancel.task_id)
            _ = await self._processor_manager.on_cancel_task(task_cancel.task_id)
        else:
            # if task is queued
            assert task_cancel.task_id in self._queued_task_id_to_task
            self._queued_task_ids.remove(task_cancel.task_id)
            _ = self._queued_task_id_to_task.pop(task_cancel.task_id)

        await self._connector_external.send(
            TaskCancelConfirm.new_msg(task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled)
        )

    async def on_task_result(self, result: TaskResult):
        """Drops local bookkeeping for a finished task and forwards its result to the scheduler."""
        if result.task_id in self._queued_task_id_to_task:
            # Finishing a queued task might happen if a task ended during the suspension process.
            self._queued_task_id_to_task.pop(result.task_id)
            self._queued_task_ids.remove(result.task_id)

        self._processing_task_ids.remove(result.task_id)

        await self._connector_external.send(result)

    async def routine(self):
        """Loop body: dispatches the next queued task once a processor can accept one."""
        await self.__processing_task()

    def get_queued_size(self):
        """Returns the number of queued (including suspended) task ids."""
        return self._queued_task_ids.qsize()

    async def __processing_task(self):
        # blocks until a processor is free, then hands it the highest-priority queued task;
        # a task already in _processing_task_ids is a suspended one being resumed
        await self._processor_manager.wait_until_can_accept_task()

        _, task_id = await self._queued_task_ids.get()
        task = self._queued_task_id_to_task.pop(task_id)

        if task_id not in self._processing_task_ids:
            self._processing_task_ids.add(task_id)
            await self._processor_manager.on_task(task)
        else:
            self._processor_manager.on_resume_task(task_id)

    async def __suspend_if_priority_is_higher(self, new_task: Task):
        # suspends the currently running task when the newly queued one strictly outranks it
        current_task = self._processor_manager.current_task()

        if current_task is None:
            return

        new_task_priority = self.__get_task_priority(new_task)
        current_task_priority = self.__get_task_priority(current_task)

        if new_task_priority <= current_task_priority:
            return

        self.__enqueue_task(current_task, is_suspended=True)

        await self._processor_manager.on_suspend_task(current_task.task_id)

    def __enqueue_task(self, task: Task, is_suspended: bool):
        task_priority = self.__get_task_priority(task)

        # Higher-priority tasks have a higher priority value. But as the queue is sorted by increasing order, we negate
        # the inserted value that it will be at the head of the queue.
        if is_suspended:
            queue_priority = (-task_priority, _SUSPENDED_TASKS_PRIORITY)
        else:
            queue_priority = (-task_priority, _QUEUED_TASKS_PRIORITY)

        self._queued_task_ids.put_nowait((queue_priority, task.task_id))
        self._queued_task_id_to_task[task.task_id] = task

    @staticmethod
    def __get_task_priority(task: Task) -> int:
        # priorities come from the task's flags and must be non-negative
        priority = retrieve_task_flags_from_task(task).priority

        if priority < 0:
            raise ValueError(f"invalid task priority, must be positive or zero, got {priority}")

        return priority
@@ -0,0 +1,19 @@
1
+ import time
2
+
3
+ from scaler.utility.mixins import Looper
4
+ from scaler.worker.agent.mixins import TimeoutManager
5
+
6
+
7
class VanillaTimeoutManager(Looper, TimeoutManager):
    """Aborts the worker when the scheduler has not been heard from for too long.

    `update_last_seen_time()` records scheduler contact; the periodic `routine()` raises
    once the configured death timeout has elapsed since the last contact.
    """

    def __init__(self, death_timeout_seconds: int):
        self._death_timeout_seconds = death_timeout_seconds
        self._last_seen_time = time.time()

    def update_last_seen_time(self):
        # stamp the most recent contact with the scheduler
        self._last_seen_time = time.time()

    async def routine(self):
        elapsed = time.time() - self._last_seen_time
        if elapsed >= self._death_timeout_seconds:
            raise TimeoutError("timeout when connect to scheduler, quitting")