opengris_scaler-1.12.28-cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of opengris-scaler might be problematic.

Files changed (187)
  1. opengris_scaler-1.12.28.dist-info/METADATA +728 -0
  2. opengris_scaler-1.12.28.dist-info/RECORD +187 -0
  3. opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +210 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +658 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +115 -0
  32. scaler/cluster/combo.py +150 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/defaults.py +94 -0
  37. scaler/config/loader.py +96 -0
  38. scaler/config/mixins.py +20 -0
  39. scaler/config/section/__init__.py +0 -0
  40. scaler/config/section/cluster.py +55 -0
  41. scaler/config/section/ecs_worker_adapter.py +85 -0
  42. scaler/config/section/native_worker_adapter.py +43 -0
  43. scaler/config/section/object_storage_server.py +8 -0
  44. scaler/config/section/scheduler.py +54 -0
  45. scaler/config/section/symphony_worker_adapter.py +47 -0
  46. scaler/config/section/top.py +13 -0
  47. scaler/config/section/webui.py +21 -0
  48. scaler/config/types/__init__.py +0 -0
  49. scaler/config/types/network_backend.py +12 -0
  50. scaler/config/types/object_storage_server.py +45 -0
  51. scaler/config/types/worker.py +62 -0
  52. scaler/config/types/zmq.py +83 -0
  53. scaler/entry_points/__init__.py +0 -0
  54. scaler/entry_points/cluster.py +133 -0
  55. scaler/entry_points/object_storage_server.py +45 -0
  56. scaler/entry_points/scheduler.py +144 -0
  57. scaler/entry_points/top.py +286 -0
  58. scaler/entry_points/webui.py +48 -0
  59. scaler/entry_points/worker_adapter_ecs.py +191 -0
  60. scaler/entry_points/worker_adapter_native.py +137 -0
  61. scaler/entry_points/worker_adapter_symphony.py +98 -0
  62. scaler/io/__init__.py +0 -0
  63. scaler/io/async_binder.py +89 -0
  64. scaler/io/async_connector.py +95 -0
  65. scaler/io/async_object_storage_connector.py +225 -0
  66. scaler/io/mixins.py +154 -0
  67. scaler/io/sync_connector.py +68 -0
  68. scaler/io/sync_object_storage_connector.py +247 -0
  69. scaler/io/sync_subscriber.py +83 -0
  70. scaler/io/utility.py +80 -0
  71. scaler/io/ymq/__init__.py +0 -0
  72. scaler/io/ymq/_ymq.pyi +95 -0
  73. scaler/io/ymq/ymq.py +138 -0
  74. scaler/io/ymq_async_object_storage_connector.py +184 -0
  75. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  76. scaler/object_storage/__init__.py +0 -0
  77. scaler/protocol/__init__.py +0 -0
  78. scaler/protocol/capnp/__init__.py +0 -0
  79. scaler/protocol/capnp/_python.py +6 -0
  80. scaler/protocol/capnp/common.capnp +68 -0
  81. scaler/protocol/capnp/message.capnp +218 -0
  82. scaler/protocol/capnp/object_storage.capnp +57 -0
  83. scaler/protocol/capnp/status.capnp +73 -0
  84. scaler/protocol/introduction.md +105 -0
  85. scaler/protocol/python/__init__.py +0 -0
  86. scaler/protocol/python/common.py +140 -0
  87. scaler/protocol/python/message.py +751 -0
  88. scaler/protocol/python/mixins.py +13 -0
  89. scaler/protocol/python/object_storage.py +118 -0
  90. scaler/protocol/python/status.py +279 -0
  91. scaler/protocol/worker.md +228 -0
  92. scaler/scheduler/__init__.py +0 -0
  93. scaler/scheduler/allocate_policy/__init__.py +0 -0
  94. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  95. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  96. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  97. scaler/scheduler/allocate_policy/mixins.py +55 -0
  98. scaler/scheduler/controllers/__init__.py +0 -0
  99. scaler/scheduler/controllers/balance_controller.py +65 -0
  100. scaler/scheduler/controllers/client_controller.py +131 -0
  101. scaler/scheduler/controllers/config_controller.py +31 -0
  102. scaler/scheduler/controllers/graph_controller.py +424 -0
  103. scaler/scheduler/controllers/information_controller.py +81 -0
  104. scaler/scheduler/controllers/mixins.py +194 -0
  105. scaler/scheduler/controllers/object_controller.py +147 -0
  106. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  107. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  108. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  109. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  110. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  111. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  112. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  113. scaler/scheduler/controllers/task_controller.py +376 -0
  114. scaler/scheduler/controllers/worker_controller.py +169 -0
  115. scaler/scheduler/object_usage/__init__.py +0 -0
  116. scaler/scheduler/object_usage/object_tracker.py +131 -0
  117. scaler/scheduler/scheduler.py +251 -0
  118. scaler/scheduler/task/__init__.py +0 -0
  119. scaler/scheduler/task/task_state_machine.py +92 -0
  120. scaler/scheduler/task/task_state_manager.py +61 -0
  121. scaler/ui/__init__.py +0 -0
  122. scaler/ui/constants.py +9 -0
  123. scaler/ui/live_display.py +147 -0
  124. scaler/ui/memory_window.py +146 -0
  125. scaler/ui/setting_page.py +40 -0
  126. scaler/ui/task_graph.py +832 -0
  127. scaler/ui/task_log.py +107 -0
  128. scaler/ui/utility.py +66 -0
  129. scaler/ui/webui.py +147 -0
  130. scaler/ui/worker_processors.py +104 -0
  131. scaler/utility/__init__.py +0 -0
  132. scaler/utility/debug.py +19 -0
  133. scaler/utility/event_list.py +63 -0
  134. scaler/utility/event_loop.py +58 -0
  135. scaler/utility/exceptions.py +42 -0
  136. scaler/utility/formatter.py +44 -0
  137. scaler/utility/graph/__init__.py +0 -0
  138. scaler/utility/graph/optimization.py +27 -0
  139. scaler/utility/graph/topological_sorter.py +11 -0
  140. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  141. scaler/utility/identifiers.py +107 -0
  142. scaler/utility/logging/__init__.py +0 -0
  143. scaler/utility/logging/decorators.py +25 -0
  144. scaler/utility/logging/scoped_logger.py +33 -0
  145. scaler/utility/logging/utility.py +183 -0
  146. scaler/utility/many_to_many_dict.py +123 -0
  147. scaler/utility/metadata/__init__.py +0 -0
  148. scaler/utility/metadata/profile_result.py +31 -0
  149. scaler/utility/metadata/task_flags.py +30 -0
  150. scaler/utility/mixins.py +13 -0
  151. scaler/utility/network_util.py +7 -0
  152. scaler/utility/one_to_many_dict.py +72 -0
  153. scaler/utility/queues/__init__.py +0 -0
  154. scaler/utility/queues/async_indexed_queue.py +37 -0
  155. scaler/utility/queues/async_priority_queue.py +70 -0
  156. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  157. scaler/utility/queues/indexed_queue.py +114 -0
  158. scaler/utility/serialization.py +9 -0
  159. scaler/version.txt +1 -0
  160. scaler/worker/__init__.py +0 -0
  161. scaler/worker/agent/__init__.py +0 -0
  162. scaler/worker/agent/heartbeat_manager.py +107 -0
  163. scaler/worker/agent/mixins.py +137 -0
  164. scaler/worker/agent/processor/__init__.py +0 -0
  165. scaler/worker/agent/processor/object_cache.py +107 -0
  166. scaler/worker/agent/processor/processor.py +285 -0
  167. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  168. scaler/worker/agent/processor_holder.py +147 -0
  169. scaler/worker/agent/processor_manager.py +369 -0
  170. scaler/worker/agent/profiling_manager.py +109 -0
  171. scaler/worker/agent/task_manager.py +150 -0
  172. scaler/worker/agent/timeout_manager.py +19 -0
  173. scaler/worker/preload.py +84 -0
  174. scaler/worker/worker.py +265 -0
  175. scaler/worker_adapter/__init__.py +0 -0
  176. scaler/worker_adapter/common.py +26 -0
  177. scaler/worker_adapter/ecs.py +269 -0
  178. scaler/worker_adapter/native.py +155 -0
  179. scaler/worker_adapter/symphony/__init__.py +0 -0
  180. scaler/worker_adapter/symphony/callback.py +45 -0
  181. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  182. scaler/worker_adapter/symphony/message.py +24 -0
  183. scaler/worker_adapter/symphony/task_manager.py +289 -0
  184. scaler/worker_adapter/symphony/worker.py +204 -0
  185. scaler/worker_adapter/symphony/worker_adapter.py +139 -0
  186. src/scaler/io/ymq/_ymq.so +0 -0
  187. src/scaler/object_storage/object_storage_server.so +0 -0
scaler/worker/agent/processor_manager.py
@@ -0,0 +1,369 @@
+import asyncio
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import tblib.pickling_support
+
+from scaler.config.types.zmq import ZMQConfig
+
+# from scaler.utility.logging.utility import setup_logger
+from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
+from scaler.protocol.python.common import ObjectMetadata, TaskResultType
+from scaler.protocol.python.message import ObjectInstruction, ProcessorInitialized, Task, TaskResult
+from scaler.utility.exceptions import ProcessorDiedError
+from scaler.utility.identifiers import ObjectID, ProcessorID, TaskID, WorkerID
+from scaler.utility.metadata.profile_result import ProfileResult
+from scaler.utility.serialization import serialize_failure
+from scaler.worker.agent.mixins import HeartbeatManager, ProcessorManager, ProfilingManager, TaskManager
+from scaler.worker.agent.processor_holder import ProcessorHolder
+
+
+class VanillaProcessorManager(ProcessorManager):
+    def __init__(
+        self,
+        identity: WorkerID,
+        event_loop: str,
+        address_internal: ZMQConfig,
+        scheduler_address: ZMQConfig,
+        preload: Optional[str],
+        garbage_collect_interval_seconds: int,
+        trim_memory_threshold_bytes: int,
+        hard_processor_suspend: bool,
+        logging_paths: Tuple[str, ...],
+        logging_level: str,
+    ):
+        tblib.pickling_support.install()
+
+        self._identity = identity
+        self._event_loop = event_loop
+        self._scheduler_address = scheduler_address
+        self._preload = preload
+
+        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
+        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
+        self._hard_processor_suspend = hard_processor_suspend
+        self._logging_paths = logging_paths
+        self._logging_level = logging_level
+
+        self._heartbeat_manager: Optional[HeartbeatManager] = None
+        self._task_manager: Optional[TaskManager] = None
+        self._profiling_manager: Optional[ProfilingManager] = None
+        self._connector_external: Optional[AsyncConnector] = None
+        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
+
+        self._address_internal: ZMQConfig = address_internal
+
+        self._current_holder: Optional[ProcessorHolder] = None
+        self._suspended_holders_by_task_id: Dict[TaskID, ProcessorHolder] = {}
+        self._holders_by_processor_id: Dict[ProcessorID, ProcessorHolder] = {}
+
+        self._can_accept_task_lock: asyncio.Lock = asyncio.Lock()
+
+        self._binder_internal: Optional[AsyncBinder] = None
+
+    def register(
+        self,
+        heartbeat_manager: HeartbeatManager,
+        task_manager: TaskManager,
+        profiling_manager: ProfilingManager,
+        connector_external: AsyncConnector,
+        binder_internal: AsyncBinder,
+        connector_storage: AsyncObjectStorageConnector,
+    ):
+        self._heartbeat_manager = heartbeat_manager
+        self._task_manager = task_manager
+        self._profiling_manager = profiling_manager
+        self._connector_external = connector_external
+        self._binder_internal = binder_internal
+        self._connector_storage = connector_storage
+
+    async def initialize(self):
+        await self._can_accept_task_lock.acquire()  # prevents any processor from accepting tasks until initialized
+
+        await self._connector_storage.wait_until_connected()
+
+        self.__start_new_processor()  # we can start the processor now that we know the storage address
+
+    def can_accept_task(self) -> bool:
+        return not self._can_accept_task_lock.locked()
+
+    async def wait_until_can_accept_task(self):
+        """
+        Makes sure a processor is ready to start processing a new or suspended task.
+
+        Must be called before any call to `on_task()` or `on_resume_task()`.
+        """
+
+        await self._can_accept_task_lock.acquire()
+
+    async def on_processor_initialized(self, processor_id: ProcessorID, processor_initialized: ProcessorInitialized):
+        assert self._current_holder is not None
+
+        if self._current_holder.initialized():
+            return
+
+        self._holders_by_processor_id[processor_id] = self._current_holder
+        self._current_holder.initialize(processor_id)
+
+        self._can_accept_task_lock.release()
+
+    async def on_task(self, task: Task) -> bool:
+        assert self._can_accept_task_lock.locked()
+        assert self.current_processor_is_initialized()
+
+        holder = self._current_holder
+
+        assert holder.task() is None
+        holder.set_task(task)
+
+        self._profiling_manager.on_task_start(holder.pid(), task.task_id)
+
+        await self._binder_internal.send(holder.processor_id(), task)
+
+        return True
+
+    async def on_cancel_task(self, task_id: TaskID) -> Optional[Task]:
+        assert self._current_holder is not None
+
+        if self.current_task_id() == task_id:
+            current_task = self.current_task()
+            self.__restart_current_processor(f"cancel task_id={task_id.hex()}")
+            return current_task
+
+        if task_id in self._suspended_holders_by_task_id:
+            suspended_holder = self._suspended_holders_by_task_id.pop(task_id)
+            task = suspended_holder.task()
+            self.__kill_processor(f"cancel suspended task_id={task_id.hex()}", suspended_holder)
+            return task
+
+        return None
+
+    async def on_failing_processor(self, processor_id: ProcessorID, process_status: str):
+        assert self._current_holder is not None
+
+        holder = self._holders_by_processor_id.get(processor_id)
+
+        if holder is None:
+            return
+
+        task = holder.task()
+        if task is not None:
+            profile_result = self.__end_task(holder)  # profiling the task must happen before killing the processor
+        else:
+            profile_result = None
+
+        reason = f"process died {process_status=}"
+        if holder == self._current_holder:
+            self.__restart_current_processor(reason)
+        else:
+            self.__kill_processor(reason, holder)
+
+        if task is not None:
+            source = task.source
+            task_id = task.task_id
+
+            result_object_id = ObjectID.generate_object_id(source)
+            result_object_bytes = serialize_failure(ProcessorDiedError(f"{process_status=}"))
+
+            await self._connector_storage.set_object(result_object_id, result_object_bytes)
+            await self._connector_external.send(
+                ObjectInstruction.new_msg(
+                    ObjectInstruction.ObjectInstructionType.Create,
+                    source,
+                    ObjectMetadata.new_msg((result_object_id,), (ObjectMetadata.ObjectContentType.Object,), (b"",)),
+                )
+            )
+
+            await self._task_manager.on_task_result(
+                TaskResult.new_msg(
+                    task_id, TaskResultType.Failed, profile_result.serialize(), [bytes(result_object_id)]
+                )
+            )
+
+    async def on_suspend_task(self, task_id: TaskID) -> bool:
+        assert self._current_holder is not None
+        holder = self._current_holder
+
+        current_task = holder.task()
+
+        if current_task is None or current_task.task_id != task_id:
+            return False
+
+        holder.suspend()
+        self._suspended_holders_by_task_id[task_id] = holder
+
+        logging.info(f"{self._identity!r}: suspend Processor[{holder.pid()}]")
+
+        self.__start_new_processor()
+
+        return True
+
+    def on_resume_task(self, task_id: TaskID) -> bool:
+        assert self._can_accept_task_lock.locked()
+        assert self.current_processor_is_initialized()
+
+        if self.current_task() is not None:
+            return False
+
+        suspended_holder = self._suspended_holders_by_task_id.pop(task_id, None)
+
+        if suspended_holder is None:
+            return False
+
+        self.__kill_processor("replaced by suspended processor", self._current_holder)
+
+        self._current_holder = suspended_holder
+        suspended_holder.resume()
+
+        logging.info(f"{self._identity!r}: resume Processor[{self._current_holder.pid()}]")
+
+        return True
+
+    async def on_task_result(self, processor_id: ProcessorID, task_result: TaskResult):
+        assert self._current_holder is not None
+        task_id = task_result.task_id
+
+        if task_id == self.current_task_id():
+            assert self._current_holder.processor_id() == processor_id
+
+            profile_result = self.__end_task(self._current_holder)
+
+            release_task_lock = True
+        elif task_id in self._suspended_holders_by_task_id:
+            # Receiving a task result from a suspended processor is possible, as the message might have been queued
+            # while we were suspending the process.
+
+            holder = self._suspended_holders_by_task_id.pop(task_id)
+            assert holder.processor_id() == processor_id
+
+            profile_result = self.__end_task(holder)
+
+            self.__kill_processor("task finished in suspended processor", holder)
+
+            release_task_lock = False
+        else:
+            return
+
+        await self._task_manager.on_task_result(
+            TaskResult.new_msg(
+                task_id=task_id,
+                result_type=task_result.result_type,
+                metadata=profile_result.serialize(),
+                results=task_result.results,
+            )
+        )
+
+        # the task lock must be released after calling `TaskManager.on_task_result()`
+        if release_task_lock:
+            self._can_accept_task_lock.release()
+
+    async def on_external_object_instruction(self, instruction: ObjectInstruction):
+        for processor_id in self._holders_by_processor_id.keys():
+            await self._binder_internal.send(processor_id, instruction)
+
+    async def on_internal_object_instruction(self, processor_id: ProcessorID, instruction: ObjectInstruction):
+        if not self.__processor_ready_to_process_object(processor_id):
+            return
+
+        await self._connector_external.send(instruction)
+
+    def destroy(self, reason: str):
+        if self._connector_external is not None:
+            self._connector_external.destroy()
+
+        self.__kill_all_processors(reason)
+
+    def current_processor_is_initialized(self) -> bool:
+        return self._current_holder is not None and self._current_holder.initialized()
+
+    def current_task(self) -> Optional[Task]:
+        if self._current_holder is None:  # worker is not yet initialized
+            return None
+
+        return self._current_holder.task()
+
+    def current_task_id(self) -> Optional[TaskID]:
+        task = self.current_task()
+
+        if task is None:
+            return None
+        else:
+            return task.task_id
+
+    def processors(self) -> List[ProcessorHolder]:
+        return list(self._holders_by_processor_id.values())
+
+    def num_suspended_processors(self) -> int:
+        return len(self._suspended_holders_by_task_id)
+
+    def __start_new_processor(self):
+        object_storage_address = self._heartbeat_manager.get_object_storage_address()
+
+        self._current_holder = ProcessorHolder(
+            self._event_loop,
+            self._address_internal,
+            self._scheduler_address,
+            object_storage_address,
+            self._preload,
+            self._garbage_collect_interval_seconds,
+            self._trim_memory_threshold_bytes,
+            self._hard_processor_suspend,
+            self._logging_paths,
+            self._logging_level,
+        )
+
+        processor_pid = self._current_holder.pid()
+
+        self._profiling_manager.on_process_start(processor_pid)
+
+        logging.info(f"{self._identity!r}: start Processor[{processor_pid}]")
+
+    def __kill_processor(self, reason: str, holder: ProcessorHolder):
+        processor_pid = holder.pid()
+
+        self._profiling_manager.on_process_end(processor_pid)
+
+        if holder.initialized():
+            self._holders_by_processor_id.pop(holder.processor_id(), None)
+
+        holder.kill()
+
+        logging.info(f"{self._identity!r}: stop Processor[{processor_pid}], reason: {reason}")
+
+    def __restart_current_processor(self, reason: str):
+        assert self._current_holder is not None
+
+        self.__kill_processor(reason, self._current_holder)
+        self.__start_new_processor()
+
+    def __kill_all_processors(self, reason: str):
+        if self._current_holder is not None:
+            self.__kill_processor(reason, self._current_holder)
+            self._current_holder = None
+
+        for processor_holder in self._suspended_holders_by_task_id.values():
+            self.__kill_processor(reason, processor_holder)
+
+        self._suspended_holders_by_task_id = {}
+        self._holders_by_processor_id = {}
+
+    def __end_task(self, processor_holder: ProcessorHolder) -> ProfileResult:
+        profile_result = self._profiling_manager.on_task_end(processor_holder.pid(), processor_holder.task().task_id)
+        processor_holder.set_task(None)
+
+        return profile_result
+
+    def __processor_ready_to_process_object(self, processor_id: ProcessorID) -> bool:
+        holder = self._holders_by_processor_id.get(processor_id)
+
+        if holder is None:
+            return False
+
+        assert holder.initialized()
+
+        if holder.task() is None:
+            return False
+
+        # TODO: check if the objects belong to the task
+
+        return True
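A detail worth calling out in this file: _can_accept_task_lock is not used for mutual exclusion but as a readiness token handed between coroutines. wait_until_can_accept_task() acquires it to block dispatch, and a different coroutine (on_processor_initialized() or on_task_result()) releases it once a processor can take work. A minimal sketch of that gating pattern, independent of the package (the names processor_ready and dispatcher are illustrative):

import asyncio


async def main():
    can_accept = asyncio.Lock()
    await can_accept.acquire()  # start in the "not ready" state: dispatch blocks

    async def processor_ready():
        await asyncio.sleep(0.1)  # stand-in for the ProcessorInitialized message arriving
        can_accept.release()  # asyncio.Lock permits release from a different coroutine

    async def dispatcher():
        await can_accept.acquire()  # plays the role of wait_until_can_accept_task()
        print("processor ready, dispatching task")
        can_accept.release()  # in the manager, on_task_result() releases it later

    await asyncio.gather(processor_ready(), dispatcher())


asyncio.run(main())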
scaler/worker/agent/profiling_manager.py
@@ -0,0 +1,109 @@
+import dataclasses
+import logging
+import time
+from typing import Dict, Optional
+
+import psutil
+
+from scaler.utility.identifiers import TaskID
+from scaler.utility.metadata.profile_result import ProfileResult
+from scaler.utility.mixins import Looper
+from scaler.worker.agent.mixins import ProfilingManager
+
+
+@dataclasses.dataclass
+class _ProcessProfiler:
+    process: psutil.Process
+
+    current_task_id: Optional[TaskID] = None
+
+    start_time: Optional[float] = None
+    start_cpu_time: Optional[float] = None
+    init_memory_rss: Optional[int] = None
+    peak_memory_rss: Optional[int] = None
+
+
+class VanillaProfilingManager(ProfilingManager, Looper):
+    def __init__(self):
+        self._process_profiler_by_pid: Dict[int, _ProcessProfiler] = {}
+
+    def on_process_start(self, pid: int):
+        if pid in self._process_profiler_by_pid:
+            raise ValueError(f"process {pid=} is already registered.")
+
+        self._process_profiler_by_pid[pid] = _ProcessProfiler(psutil.Process(pid))
+
+    def on_process_end(self, pid: int):
+        if pid not in self._process_profiler_by_pid:
+            raise ValueError(f"process {pid=} is not registered.")
+
+        self._process_profiler_by_pid.pop(pid)
+
+    def on_task_start(self, pid: int, task_id: TaskID):
+        process_profiler = self._process_profiler_by_pid.get(pid)
+
+        if process_profiler is None:
+            raise ValueError(f"process {pid=} is not registered.")
+
+        process_profiler.current_task_id = task_id
+
+        process = process_profiler.process
+
+        process_profiler.start_time = self.__process_time()
+        process_profiler.start_cpu_time = self.__process_cpu_time(process)
+        process_profiler.init_memory_rss = self.__process_memory_rss(process)
+        process_profiler.peak_memory_rss = process_profiler.init_memory_rss
+
+    def on_task_end(self, pid: int, task_id: TaskID) -> ProfileResult:
+        process_profiler = self._process_profiler_by_pid.get(pid)
+
+        if process_profiler is None:
+            raise ValueError(f"process {pid=} is not registered.")
+
+        if task_id != process_profiler.current_task_id:
+            raise ValueError(f"task {task_id=!r} is not the current task task_id={process_profiler.current_task_id!r}.")
+
+        assert process_profiler.start_time is not None
+        assert process_profiler.init_memory_rss is not None
+        assert process_profiler.peak_memory_rss is not None
+
+        process = process_profiler.process
+
+        time_delta = self.__process_time() - process_profiler.start_time
+
+        try:
+            cpu_time_delta = self.__process_cpu_time(process) - process_profiler.start_cpu_time
+        except psutil.ZombieProcess:
+            logging.warning(f"profiling zombie process: {pid=}")
+            cpu_time_delta = 0
+
+        memory_delta = process_profiler.peak_memory_rss - process_profiler.init_memory_rss
+
+        process_profiler.current_task_id = None
+        process_profiler.init_memory_rss = None
+        process_profiler.peak_memory_rss = None
+
+        return ProfileResult(time_delta, memory_delta, cpu_time_delta)
+
+    async def routine(self):
+        for process_profiler in self._process_profiler_by_pid.values():
+            if process_profiler.current_task_id is not None:
+                try:
+                    process_profiler.peak_memory_rss = max(
+                        process_profiler.peak_memory_rss, self.__process_memory_rss(process_profiler.process)
+                    )
+                except psutil.ZombieProcess:
+                    logging.warning(f"profiling zombie process: pid={process_profiler.process.pid}")
+
+    @staticmethod
+    def __process_time():
+        return time.monotonic()
+
+    @staticmethod
+    def __process_cpu_time(process: psutil.Process) -> float:
+        cpu_times = process.cpu_times()
+        return cpu_times.user + cpu_times.system
+
+    @staticmethod
+    def __process_memory_rss(process: psutil.Process) -> int:
+        return process.memory_info().rss
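The profiler above reduces each task to three deltas: monotonic wall time, user+system CPU seconds, and peak RSS over the starting baseline. A self-contained sketch of the same measurement approach, not taken from the package (profile_pid and the busy-work lambda are illustrative; the real manager samples RSS periodically from routine() rather than once at the end):

import time

import psutil


def profile_pid(process: psutil.Process, work) -> tuple:
    start_time = time.monotonic()  # monotonic wall clock, immune to system clock changes
    cpu = process.cpu_times()
    start_cpu = cpu.user + cpu.system  # CPU seconds consumed so far
    init_rss = process.memory_info().rss  # resident memory baseline
    peak_rss = init_rss

    work()
    peak_rss = max(peak_rss, process.memory_info().rss)  # single end-of-task sample here

    cpu = process.cpu_times()
    return (
        time.monotonic() - start_time,  # time_delta
        peak_rss - init_rss,  # memory_delta: peak over the baseline
        cpu.user + cpu.system - start_cpu,  # cpu_time_delta
    )


print(profile_pid(psutil.Process(), lambda: sum(i * i for i in range(10**6))))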
scaler/worker/agent/task_manager.py
@@ -0,0 +1,150 @@
+from typing import Dict, Optional, Set
+
+from scaler.io.mixins import AsyncConnector
+from scaler.protocol.python.common import TaskCancelConfirmType
+from scaler.protocol.python.message import Task, TaskCancel, TaskCancelConfirm, TaskResult
+from scaler.utility.identifiers import TaskID
+from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
+from scaler.utility.mixins import Looper
+from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
+from scaler.worker.agent.mixins import ProcessorManager, TaskManager
+
+_SUSPENDED_TASKS_PRIORITY = 1
+_QUEUED_TASKS_PRIORITY = 2
+
+
+class VanillaTaskManager(Looper, TaskManager):
+    def __init__(self, task_timeout_seconds: int):
+        self._task_timeout_seconds = task_timeout_seconds
+
+        self._queued_task_id_to_task: Dict[TaskID, Task] = dict()
+
+        # Queued tasks are sorted first by the task's priority, then suspended tasks are prioritized over
+        # not-yet-started tasks; finally, the sorted queue ensures we execute the oldest tasks first.
+        #
+        # For example, if we receive these tasks in this order:
+        #   1. Task(priority=0) [suspended]
+        #   2. Task(priority=3) [suspended]
+        #   3. Task(priority=3)
+        #   4. Task(priority=0)
+        #
+        # We want to execute the tasks in this order: 2-3-1-4.
+        self._queued_task_ids = AsyncSortedPriorityQueue()
+
+        self._processing_task_ids: Set[TaskID] = set()  # tasks associated with a processor, including suspended tasks
+
+        self._connector_external: Optional[AsyncConnector] = None
+        self._processor_manager: Optional[ProcessorManager] = None
+
+    def register(self, connector: AsyncConnector, processor_manager: ProcessorManager):
+        self._connector_external = connector
+        self._processor_manager = processor_manager
+
+    async def on_task_new(self, task: Task):
+        self.__enqueue_task(task, is_suspended=False)
+
+        await self.__suspend_if_priority_is_higher(task)
+
+    async def on_cancel_task(self, task_cancel: TaskCancel):
+        task_not_found = (
+            task_cancel.task_id not in self._processing_task_ids
+            and task_cancel.task_id not in self._queued_task_id_to_task
+        )
+        if task_not_found:
+            await self._connector_external.send(
+                TaskCancelConfirm.new_msg(
+                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
+                )
+            )
+            return
+
+        if task_cancel.task_id in self._processing_task_ids and not task_cancel.flags.force:
+            # ignore a cancel request for a task that is processing, unless it is a forced cancel
+            await self._connector_external.send(
+                TaskCancelConfirm.new_msg(
+                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
+                )
+            )
+            return
+
+        # A suspended task will be both processing AND queued
+
+        if task_cancel.task_id in self._processing_task_ids:
+            # the task is processing
+            self._processing_task_ids.remove(task_cancel.task_id)
+            _ = await self._processor_manager.on_cancel_task(task_cancel.task_id)
+        else:
+            # the task is queued
+            assert task_cancel.task_id in self._queued_task_id_to_task
+            self._queued_task_ids.remove(task_cancel.task_id)
+            _ = self._queued_task_id_to_task.pop(task_cancel.task_id)
+
+        await self._connector_external.send(
+            TaskCancelConfirm.new_msg(task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled)
+        )
+
+    async def on_task_result(self, result: TaskResult):
+        if result.task_id in self._queued_task_id_to_task:
+            # Finishing a queued task might happen if a task ended during the suspension process.
+            self._queued_task_id_to_task.pop(result.task_id)
+            self._queued_task_ids.remove(result.task_id)
+
+        self._processing_task_ids.remove(result.task_id)
+
+        await self._connector_external.send(result)
+
+    async def routine(self):
+        await self.__processing_task()
+
+    def get_queued_size(self):
+        return self._queued_task_ids.qsize()
+
+    async def __processing_task(self):
+        await self._processor_manager.wait_until_can_accept_task()
+
+        _, task_id = await self._queued_task_ids.get()
+        task = self._queued_task_id_to_task.pop(task_id)
+
+        if task_id not in self._processing_task_ids:
+            self._processing_task_ids.add(task_id)
+            await self._processor_manager.on_task(task)
+        else:
+            self._processor_manager.on_resume_task(task_id)
+
+    async def __suspend_if_priority_is_higher(self, new_task: Task):
+        current_task = self._processor_manager.current_task()
+
+        if current_task is None:
+            return
+
+        new_task_priority = self.__get_task_priority(new_task)
+        current_task_priority = self.__get_task_priority(current_task)
+
+        if new_task_priority <= current_task_priority:
+            return
+
+        self.__enqueue_task(current_task, is_suspended=True)
+
+        await self._processor_manager.on_suspend_task(current_task.task_id)
+
+    def __enqueue_task(self, task: Task, is_suspended: bool):
+        task_priority = self.__get_task_priority(task)
+
+        # Higher-priority tasks have a higher priority value, but the queue pops in increasing order, so we negate
+        # the inserted value so that higher-priority tasks end up at the head of the queue.
+        if is_suspended:
+            queue_priority = (-task_priority, _SUSPENDED_TASKS_PRIORITY)
+        else:
+            queue_priority = (-task_priority, _QUEUED_TASKS_PRIORITY)
+
+        self._queued_task_ids.put_nowait((queue_priority, task.task_id))
+        self._queued_task_id_to_task[task.task_id] = task
+
+    @staticmethod
+    def __get_task_priority(task: Task) -> int:
+        priority = retrieve_task_flags_from_task(task).priority
+
+        if priority < 0:
+            raise ValueError(f"invalid task priority, must be zero or positive, got {priority}")
+
+        return priority
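The 2-3-1-4 ordering in the comment above falls out of plain tuple comparison. A sketch using the standard library's heapq in place of the package's AsyncSortedPriorityQueue (the arrival counter stands in for the sorted queue's oldest-first tie-breaking; all names here are illustrative):

import heapq

entries = [
    ("task1", 0, True),   # priority=0, suspended
    ("task2", 3, True),   # priority=3, suspended
    ("task3", 3, False),  # priority=3, queued
    ("task4", 0, False),  # priority=0, queued
]

heap = []
for arrival_order, (name, priority, is_suspended) in enumerate(entries):
    # negate the priority so higher-priority tasks sort first; suspended tasks (1)
    # sort before not-yet-started tasks (2); the arrival index breaks remaining
    # ties so older tasks run first
    key = (-priority, 1 if is_suspended else 2, arrival_order)
    heapq.heappush(heap, (key, name))

print([heapq.heappop(heap)[1] for _ in range(len(heap))])
# ['task2', 'task3', 'task1', 'task4'], i.e. the 2-3-1-4 order from the comment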
scaler/worker/agent/timeout_manager.py
@@ -0,0 +1,19 @@
+import time
+
+from scaler.utility.mixins import Looper
+from scaler.worker.agent.mixins import TimeoutManager
+
+
+class VanillaTimeoutManager(Looper, TimeoutManager):
+    def __init__(self, death_timeout_seconds: int):
+        self._death_timeout_seconds = death_timeout_seconds
+        self._last_seen_time = time.time()
+
+    def update_last_seen_time(self):
+        self._last_seen_time = time.time()
+
+    async def routine(self):
+        if (time.time() - self._last_seen_time) < self._death_timeout_seconds:
+            return
+
+        raise TimeoutError("timed out connecting to scheduler, quitting")
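The drivers of these Looper routines live elsewhere in the package; a minimal sketch of the assumed contract (routine() awaited on a fixed interval, with a raised TimeoutError escaping and stopping the worker) where run_looper and the interval are hypothetical:

import asyncio


async def run_looper(routine, interval_seconds: float = 1.0):
    while True:
        await routine()  # a TimeoutError raised here propagates and stops the loop
        await asyncio.sleep(interval_seconds)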