opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,285 @@
1
+ import contextlib
2
+ import logging
3
+ import multiprocessing
4
+ import os
5
+ import signal
6
+ from contextlib import redirect_stderr, redirect_stdout
7
+ from contextvars import ContextVar, Token
8
+ from multiprocessing.synchronize import Event as EventType
9
+ from typing import IO, Callable, List, Optional, Tuple, cast
10
+
11
+ import tblib.pickling_support
12
+ import zmq
13
+
14
+ from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
15
+ from scaler.config.types.zmq import ZMQConfig
16
+ from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
17
+ from scaler.io.sync_connector import ZMQSyncConnector
18
+ from scaler.io.utility import create_sync_object_storage_connector
19
+ from scaler.protocol.python.common import ObjectMetadata, TaskResultType
20
+ from scaler.protocol.python.message import ObjectInstruction, ProcessorInitialized, Task, TaskLog, TaskResult
21
+ from scaler.protocol.python.mixins import Message
22
+ from scaler.utility.identifiers import ClientID, ObjectID, TaskID
23
+ from scaler.utility.logging.utility import setup_logger
24
+ from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
25
+ from scaler.utility.serialization import serialize_failure
26
+ from scaler.worker.agent.processor.object_cache import ObjectCache
27
+ from scaler.worker.agent.processor.streaming_buffer import StreamingBuffer
28
+ from scaler.worker.preload import execute_preload
29
+
30
SUSPEND_SIGNAL = "SIGUSR1"  # use str instead of a signal.Signal to not trigger an import error on unsupported systems.

# Tracks the Processor instance controlling the current process (if any); set while
# a task runs so user code can reach it via Processor.get_current_processor().
_current_processor: ContextVar[Optional["Processor"]] = ContextVar("_current_processor", default=None)
33
+
34
+
35
class Processor(multiprocessing.get_context("spawn").Process):  # type: ignore
    """Task-executing child process spawned by the worker agent.

    On start, the processor connects back to its agent and to the object storage
    server, registers signal handlers, optionally runs a preload hook, then loops
    forever: receiving tasks, fetching required objects into a local cache,
    executing the task function, and sending the serialized result back.
    """

    def __init__(
        self,
        event_loop: str,
        agent_address: ZMQConfig,
        scheduler_address: ZMQConfig,
        object_storage_address: ObjectStorageAddressConfig,
        preload: Optional[str],
        resume_event: Optional[EventType],
        resumed_event: Optional[EventType],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        """
        Args:
            event_loop: event loop implementation name (stored, not used in this file).
            agent_address: ZMQ address of the worker agent this processor reports to.
            scheduler_address: ZMQ address of the scheduler, exposed to user code
                via ``scheduler_address()``.
            object_storage_address: host/port of the object storage server.
            preload: optional preload spec executed during initialization; a failure
                aborts startup with a RuntimeError.
            resume_event: event the process waits on after a soft suspend;
                None when hard (SIGSTOP-based) suspension is used.
            resumed_event: event set once the process returns from the suspend wait.
            garbage_collect_interval_seconds: interval of the object cache GC thread.
            trim_memory_threshold_bytes: cache threshold above which memory is trimmed.
            logging_paths: log destinations; a per-process pid suffix is appended
                in ``__initialize`` (except for /dev/stdout).
            logging_level: logging level name for this process.
        """
        multiprocessing.Process.__init__(self, name="Processor")

        self._event_loop = event_loop
        self._agent_address = agent_address
        self._scheduler_address = scheduler_address
        self._object_storage_address = object_storage_address
        self._preload = preload

        self._resume_event = resume_event
        self._resumed_event = resumed_event

        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._logging_paths = logging_paths
        self._logging_level = logging_level

        # Created in __initialize (inside the child process), not here.
        self._object_cache: Optional[ObjectCache] = None

        # Task currently being executed; None while idle.
        self._current_task: Optional[Task] = None

    def run(self) -> None:
        """Child-process entry point: set everything up, then serve tasks forever."""
        self.__initialize()
        self.__run_forever()

    @staticmethod
    def get_current_processor() -> Optional["Processor"]:
        """Returns the current Processor instance controlling the current process, if any."""
        return _current_processor.get()

    def scheduler_address(self) -> ZMQConfig:
        """Returns the scheduler address this processor's worker is connected to."""
        return self._scheduler_address

    def current_task(self) -> Optional[Task]:
        """Returns the task currently being processed, or None when idle."""
        return self._current_task

    def __initialize(self):
        """Set up logging, agent/storage connections, the object cache, signal
        handlers, and run the optional preload hook."""
        # modify the logging path and add process id to the path
        logging_paths = [f"{path}-{os.getpid()}" for path in self._logging_paths if path != "/dev/stdout"]
        if "/dev/stdout" in self._logging_paths:
            logging_paths.append("/dev/stdout")

        setup_logger(log_paths=tuple(logging_paths), logging_level=self._logging_level)
        # make exception tracebacks picklable so failures can be serialized back
        tblib.pickling_support.install()

        self._connector_agent: SyncConnector = ZMQSyncConnector(
            context=zmq.Context(), socket_type=zmq.DEALER, address=self._agent_address, identity=None
        )
        self._connector_storage: SyncObjectStorageConnector = create_sync_object_storage_connector(
            self._object_storage_address.host, self._object_storage_address.port
        )

        self._object_cache = ObjectCache(
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
        )
        self._object_cache.start()

        self.__register_signals()

        # Execute optional preload hook if provided
        if self._preload is not None:
            try:
                execute_preload(self._preload)
            except Exception as e:
                raise RuntimeError(
                    f"Processor[{self.pid}] initialization failed due to preload error: {self._preload}"
                ) from e

    def __register_signals(self):
        """Install the termination handler and, in soft-suspend mode, the suspend handler."""
        self.__register_signal("SIGTERM", self.__interrupt)

        # A resume event is only provided in soft-suspend mode (see ProcessorHolder).
        if self._resume_event is not None:
            self.__register_signal(SUSPEND_SIGNAL, self.__suspend)

    def __interrupt(self, *args):
        """SIGTERM handler: tear down the agent connector."""
        self._connector_agent.destroy()  # interrupts any blocking socket.

    def __suspend(self, *args):
        """SUSPEND_SIGNAL handler: block the main thread until resumed."""
        assert self._resume_event is not None
        assert self._resumed_event is not None

        self._resume_event.wait()  # stops any computation in the main thread until the event is triggered

        # Ensures the processor agent knows we stopped waiting on `_resume_event`, as to avoid re-entrant wait on the
        # event.
        self._resumed_event.set()

    def __run_forever(self):
        """Main loop: announce readiness, then dispatch agent messages until shutdown."""
        try:
            self._connector_agent.send(ProcessorInitialized.new_msg())
            while True:
                message = self._connector_agent.receive()
                if message is None:
                    continue

                self.__on_connector_receive(message)

        except zmq.error.ZMQError as e:
            if e.errno != zmq.ENOTSOCK:  # ignore if socket got closed
                raise

        except (KeyboardInterrupt, InterruptedError):
            pass

        except Exception as e:
            logging.exception(f"Processor[{self.pid}]: failed with unhandled exception:\n{e}")

        finally:
            # Stop the cache GC thread and close the agent socket, then wait for
            # the cache thread to finish.
            self._object_cache.destroy()
            self._connector_agent.destroy()

            self._object_cache.join()

    def __on_connector_receive(self, message: Message):
        """Dispatch a message from the agent: object instructions or tasks."""
        if isinstance(message, ObjectInstruction):
            self.__on_receive_object_instruction(message)
            return

        if isinstance(message, Task):
            self.__on_received_task(message)
            return

        logging.error(f"unknown {message=}")

    def __on_receive_object_instruction(self, instruction: ObjectInstruction):
        """Handle an object instruction; only Delete is supported on the processor side."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            for object_id in instruction.object_metadata.object_ids:
                self._object_cache.del_object(object_id)
            return

        logging.error(f"worker received unknown object instruction type {instruction=}")

    def __on_received_task(self, task: Task):
        """Record the task as current, pre-fetch its objects, and execute it."""
        self._current_task = task

        self.__cache_required_object_ids(task)

        self.__process_task(task)

    def __cache_required_object_ids(self, task: Task) -> None:
        """Fetch from object storage every object the task needs that is not cached yet."""
        required_object_ids = self.__get_required_object_ids_for_task(task)

        for object_id in required_object_ids:
            if self._object_cache.has_object(object_id):
                continue

            object_content = self._connector_storage.get_object(object_id)
            self._object_cache.add_object(task.source, object_id, object_content)

    @staticmethod
    def __get_required_object_ids_for_task(task: Task) -> List[ObjectID]:
        """Returns the ids of the client's serializer, the function, and every argument."""
        serializer_id = ObjectID.generate_serializer_object_id(task.source)
        object_ids = [
            serializer_id,
            task.func_object_id,
            *(cast(ObjectID, argument) for argument in task.function_args),
        ]
        return object_ids

    def __process_task(self, task: Task):
        """Run the task function with its cached arguments and report the outcome.

        Any exception raised by the task is captured and serialized as a Failed
        result rather than propagated, so the processor keeps serving tasks.
        """
        task_flags = retrieve_task_flags_from_task(task)

        try:
            function = self._object_cache.get_object(task.func_object_id)

            args = [self._object_cache.get_object(cast(ObjectID, arg)) for arg in task.function_args]

            if task_flags.stream_output:
                # Redirect stdout/stderr to buffers that forward output to the
                # agent live, inside the processor context.
                with StreamingBuffer(
                    task.task_id, TaskLog.LogType.Stdout, self._connector_agent
                ) as stdout_buf, StreamingBuffer(
                    task.task_id, TaskLog.LogType.Stderr, self._connector_agent
                ) as stderr_buf, self.__processor_context(), redirect_stdout(
                    cast(IO[str], stdout_buf)
                ), redirect_stderr(
                    cast(IO[str], stderr_buf)
                ):
                    result = function(*args)
            else:
                with self.__processor_context():
                    result = function(*args)

            result_bytes = self._object_cache.serialize(task.source, result)
            task_result_type = TaskResultType.Success

        except Exception as e:
            logging.exception(f"exception when processing task_id={task.task_id.hex()}:")
            task_result_type = TaskResultType.Failed
            result_bytes = serialize_failure(e)

        self.__send_result(task.source, task.task_id, task_result_type, result_bytes)

    def __send_result(self, source: ClientID, task_id: TaskID, task_result_type: TaskResultType, result_bytes: bytes):
        """Store the result bytes in object storage, then notify the agent.

        Sends an ObjectInstruction announcing the new result object, followed by
        the TaskResult referencing it. Clears the current-task marker first.
        """
        self._current_task = None

        result_object_id = ObjectID.generate_object_id(source)

        self._connector_storage.set_object(result_object_id, result_bytes)
        self._connector_agent.send(
            ObjectInstruction.new_msg(
                ObjectInstruction.ObjectInstructionType.Create,
                source,
                ObjectMetadata.new_msg(
                    (result_object_id,),
                    (ObjectMetadata.ObjectContentType.Object,),
                    (f"<res {repr(result_object_id)}>".encode(),),
                ),
            )
        )
        self._connector_agent.send(
            TaskResult.new_msg(task_id, task_result_type, metadata=b"", results=[bytes(result_object_id)])
        )

    @staticmethod
    def __set_current_processor(context: Optional["Processor"]) -> Token:
        """Set (or clear, with None) the process-wide current-processor context var."""
        # Refuse to nest processor contexts: only one task may own the context.
        if context is not None and _current_processor.get() is not None:
            raise ValueError("cannot override a previously set processor context.")

        return _current_processor.set(context)

    @contextlib.contextmanager
    def __processor_context(self):
        """Context manager exposing this processor via get_current_processor() for
        the duration of a task, clearing it afterwards."""
        self.__set_current_processor(self)
        try:
            yield
        finally:
            self.__set_current_processor(None)

    @staticmethod
    def __register_signal(signal_name: str, handler: Callable) -> None:
        """Install `handler` for the named signal; raise if this platform lacks it."""
        signal_instance = getattr(signal, signal_name, None)
        if signal_instance is None:
            raise RuntimeError(f"unsupported platform, signal not available: {signal_name}.")

        signal.signal(signal_instance, handler)
@@ -0,0 +1,28 @@
1
+ import io
2
+ import logging
3
+
4
+ from scaler.io.mixins import SyncConnector
5
+ from scaler.protocol.python.message import TaskLog
6
+ from scaler.utility.identifiers import TaskID
7
+
8
+
9
class StreamingBuffer(io.TextIOBase):
    """A custom IO buffer that sends content as it's written.

    Used as a stdout/stderr replacement (via ``redirect_stdout``/``redirect_stderr``)
    while a task with streamed output runs: every write is forwarded to the worker
    agent as a ``TaskLog`` message. Streaming is best-effort — send failures are
    logged and swallowed so they never break task execution.
    """

    def __init__(self, task_id: TaskID, log_type: TaskLog.LogType, connector_agent: SyncConnector):
        """
        Args:
            task_id: identifier of the task whose output is being streamed.
            log_type: whether this buffer carries stdout or stderr content.
            connector_agent: synchronous connector used to send the log messages.
        """
        super().__init__()
        self._task_id = task_id
        self._log_type = log_type
        self._connector_agent = connector_agent

    def write(self, content: str) -> int:
        """Forward `content` to the agent and return the number of characters consumed.

        Returns 0 once the buffer is closed (content is silently dropped).
        """
        if self.closed:
            return 0

        if content:
            try:
                self._connector_agent.send(TaskLog.new_msg(self._task_id, self._log_type, content))
            except Exception as e:
                logging.warning(f"Failed to send stream content: {e}")

        # io.TextIOBase.write must return the number of characters written;
        # returning 0 here would tell callers that nothing was consumed.
        return len(content)
@@ -0,0 +1,147 @@
1
+ import logging
2
+ import multiprocessing
3
+ import os
4
+ import signal
5
+ from typing import Optional, Tuple
6
+
7
+ import psutil
8
+
9
+ from scaler.config.defaults import DEFAULT_PROCESSOR_KILL_DELAY_SECONDS
10
+ from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
11
+ from scaler.config.types.zmq import ZMQConfig
12
+ from scaler.protocol.python.message import Task
13
+ from scaler.utility.identifiers import ProcessorID
14
+ from scaler.worker.agent.processor.processor import SUSPEND_SIGNAL, Processor
15
+
16
+
17
class ProcessorHolder:
    """Agent-side handle around a single Processor child process.

    Spawns the processor on construction, then tracks its lifecycle
    (suspend/resume/kill), the task it is currently assigned, and exposes the
    underlying psutil.Process for resource monitoring.
    """

    def __init__(
        self,
        event_loop: str,
        agent_address: ZMQConfig,
        scheduler_address: ZMQConfig,
        object_storage_address: ObjectStorageAddressConfig,
        preload: Optional[str],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        """
        Args:
            hard_suspend: when True, suspension freezes the whole process with
                SIGSTOP; when False, a pair of spawn-context events lets the main
                thread block while helper threads keep running.

        The remaining arguments are forwarded verbatim to ``Processor``.
        """
        # Set once the processor announces itself (see initialize()).
        self._processor_id: Optional[ProcessorID] = None
        self._task: Optional[Task] = None
        self._suspended = False

        self._hard_suspend = hard_suspend
        if hard_suspend:
            self._resume_event = None
            self._resumed_event = None
        else:
            # Events must come from the same "spawn" context as the Processor so
            # they can be shared with the child process.
            context = multiprocessing.get_context("spawn")
            self._resume_event = context.Event()
            self._resumed_event = context.Event()

        self._processor = Processor(
            event_loop=event_loop,
            agent_address=agent_address,
            scheduler_address=scheduler_address,
            object_storage_address=object_storage_address,
            preload=preload,
            resume_event=self._resume_event,
            resumed_event=self._resumed_event,
            garbage_collect_interval_seconds=garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=trim_memory_threshold_bytes,
            logging_paths=logging_paths,
            logging_level=logging_level,
        )
        self._processor.start()
        self._process = psutil.Process(self._processor.pid)

    def pid(self) -> int:
        """Returns the OS pid of the processor process (it has been started)."""
        assert self._processor.pid is not None
        return self._processor.pid

    def process(self) -> psutil.Process:
        """Returns the psutil handle for the processor process."""
        return self._process

    def processor_id(self) -> ProcessorID:
        """Returns the processor's id; only valid after initialize() was called."""
        assert self._processor_id is not None
        return self._processor_id

    def initialized(self) -> bool:
        """True once the processor has announced itself and got an id."""
        return self._processor_id is not None

    def initialize(self, processor_id: ProcessorID):
        """Record the id the processor registered with."""
        self._processor_id = processor_id

    def task(self) -> Optional[Task]:
        """Returns the task currently assigned to this processor, if any."""
        return self._task

    def set_task(self, task: Optional[Task]):
        """Assign (or clear, with None) the processor's current task."""
        self._task = task

    def suspended(self) -> bool:
        """True while the processor is suspended (between suspend() and resume())."""
        return self._suspended

    def suspend(self):
        """Suspend the processor; requires an assigned task and an initialized,
        not-already-suspended processor."""
        assert self._processor is not None
        assert self._task is not None
        assert self._suspended is False
        assert self.initialized()

        if self._hard_suspend:
            self.__send_signal("SIGSTOP")
        else:
            # If we do not want to hardly suspend the processor's process (e.g. to keep network links alive), we request
            # the process to wait on a synchronization event. That will stop the main thread while allowing the helper
            # threads to continue running.
            #
            # See https://github.com/finos/opengris-scaler/issues/14

            assert self._resume_event is not None
            assert self._resumed_event is not None
            # Clear both events BEFORE signaling, so the processor blocks on
            # _resume_event and a later resume() waits on a fresh _resumed_event.
            self._resume_event.clear()
            self._resumed_event.clear()

            self.__send_signal(SUSPEND_SIGNAL)

        self._suspended = True

    def resume(self):
        """Resume a previously suspended processor."""
        assert self._task is not None
        assert self._suspended is True

        if self._hard_suspend:
            self.__send_signal("SIGCONT")
        else:
            assert self._resume_event is not None
            assert self._resumed_event is not None

            self._resume_event.set()

            # Waits until the processor resumes processing. This avoids any future call to `suspend()` while the
            # processor hasn't returned from the `_resumed_event.wait()` call yet (causes a re-entrant error on Linux).
            self._resumed_event.wait()

        self._suspended = False

    def kill(self):
        """Terminate the processor: SIGTERM first, escalate to SIGKILL on timeout."""
        self.__send_signal("SIGTERM")
        self._processor.join(DEFAULT_PROCESSOR_KILL_DELAY_SECONDS)

        if self._processor.exitcode is None:
            # TODO: some processors fail to interrupt because of a blocking 0mq call. Ideally we should interrupt
            # these blocking calls instead of sending a SIGKILL signal.

            logging.warning(f"Processor[{self.pid()}] does not terminate in time, send SIGKILL.")
            self.__send_signal("SIGKILL")
            self._processor.join()

        self.set_task(None)

    def __send_signal(self, signal_name: str):
        """Send the named signal to the processor process; raise if unsupported here."""
        signal_instance = getattr(signal, signal_name, None)
        if signal_instance is None:
            raise RuntimeError(f"unsupported platform, signal not available: {signal_name}.")

        os.kill(self.pid(), signal_instance)