opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import logging
|
|
3
|
+
import multiprocessing
|
|
4
|
+
import os
|
|
5
|
+
import signal
|
|
6
|
+
from contextlib import redirect_stderr, redirect_stdout
|
|
7
|
+
from contextvars import ContextVar, Token
|
|
8
|
+
from multiprocessing.synchronize import Event as EventType
|
|
9
|
+
from typing import IO, Callable, List, Optional, Tuple, cast
|
|
10
|
+
|
|
11
|
+
import tblib.pickling_support
|
|
12
|
+
import zmq
|
|
13
|
+
|
|
14
|
+
from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
|
|
15
|
+
from scaler.config.types.zmq import ZMQConfig
|
|
16
|
+
from scaler.io.mixins import SyncConnector, SyncObjectStorageConnector
|
|
17
|
+
from scaler.io.sync_connector import ZMQSyncConnector
|
|
18
|
+
from scaler.io.utility import create_sync_object_storage_connector
|
|
19
|
+
from scaler.protocol.python.common import ObjectMetadata, TaskResultType
|
|
20
|
+
from scaler.protocol.python.message import ObjectInstruction, ProcessorInitialized, Task, TaskLog, TaskResult
|
|
21
|
+
from scaler.protocol.python.mixins import Message
|
|
22
|
+
from scaler.utility.identifiers import ClientID, ObjectID, TaskID
|
|
23
|
+
from scaler.utility.logging.utility import setup_logger
|
|
24
|
+
from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
|
|
25
|
+
from scaler.utility.serialization import serialize_failure
|
|
26
|
+
from scaler.worker.agent.processor.object_cache import ObjectCache
|
|
27
|
+
from scaler.worker.agent.processor.streaming_buffer import StreamingBuffer
|
|
28
|
+
from scaler.worker.preload import execute_preload
|
|
29
|
+
|
|
30
|
+
SUSPEND_SIGNAL = "SIGUSR1"  # use str instead of a signal.Signal to not trigger an import error on unsupported systems.

# Process-wide context variable holding the Processor instance currently executing in
# this process, if any; set inside Processor.__processor_context() while a task's
# function runs and read back via Processor.get_current_processor().
_current_processor: ContextVar[Optional["Processor"]] = ContextVar("_current_processor", default=None)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Processor(multiprocessing.get_context("spawn").Process):  # type: ignore
    """A spawned child process that executes tasks on behalf of a worker agent.

    The processor connects back to its agent (ZMQ DEALER) and to the object storage
    server, caches the objects a task needs, runs the task's function, and sends the
    serialized result back. Suspension is cooperative: when `resume_event` is
    provided, a SUSPEND_SIGNAL handler blocks the main thread on that event until
    the agent sets it (see __suspend / ProcessorHolder).
    """

    def __init__(
        self,
        event_loop: str,
        agent_address: ZMQConfig,
        scheduler_address: ZMQConfig,
        object_storage_address: ObjectStorageAddressConfig,
        preload: Optional[str],
        resume_event: Optional[EventType],
        resumed_event: Optional[EventType],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        """Stores configuration only; connectors and cache are created in the child
        process by __initialize() (this object is pickled across the spawn boundary).

        Args:
            event_loop: name of the event loop implementation to use.
            agent_address: ZMQ address of the worker agent this processor serves.
            scheduler_address: ZMQ address of the scheduler (exposed to task code
                via scheduler_address()).
            object_storage_address: host/port of the object storage server.
            preload: optional preload spec executed once at startup, before any task.
            resume_event: when not None, enables soft suspension via SUSPEND_SIGNAL.
            resumed_event: set by __suspend() after waking, so the agent knows the
                wait on resume_event has finished.
            garbage_collect_interval_seconds: GC interval for the object cache.
            trim_memory_threshold_bytes: memory threshold for the object cache.
            logging_paths: log destinations; each path (except /dev/stdout) gets the
                child pid appended.
            logging_level: logging level name for the child process logger.
        """
        multiprocessing.Process.__init__(self, name="Processor")

        self._event_loop = event_loop
        self._agent_address = agent_address
        self._scheduler_address = scheduler_address
        self._object_storage_address = object_storage_address
        self._preload = preload

        self._resume_event = resume_event
        self._resumed_event = resumed_event

        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._logging_paths = logging_paths
        self._logging_level = logging_level

        # created in the child process (see __initialize)
        self._object_cache: Optional[ObjectCache] = None

        # task currently being processed; cleared in __send_result
        self._current_task: Optional[Task] = None

    def run(self) -> None:
        """Child-process entry point: set up connectors/cache/signals, then loop."""
        self.__initialize()
        self.__run_forever()

    @staticmethod
    def get_current_processor() -> Optional["Processor"]:
        """Returns the current Processor instance controlling the current process, if any."""
        return _current_processor.get()

    def scheduler_address(self) -> ZMQConfig:
        """Returns the scheduler address this processor's worker is connected to."""
        return self._scheduler_address

    def current_task(self) -> Optional[Task]:
        """Returns the task currently being processed, or None between tasks."""
        return self._current_task

    def __initialize(self):
        """Sets up logging, agent/storage connectors, object cache, signal handlers
        and the optional preload hook; runs once in the child process."""
        # modify the logging path and add process id to the path
        logging_paths = [f"{path}-{os.getpid()}" for path in self._logging_paths if path != "/dev/stdout"]
        if "/dev/stdout" in self._logging_paths:
            logging_paths.append("/dev/stdout")

        setup_logger(log_paths=tuple(logging_paths), logging_level=self._logging_level)
        # make tracebacks picklable so serialize_failure() can ship them to the client
        tblib.pickling_support.install()

        self._connector_agent: SyncConnector = ZMQSyncConnector(
            context=zmq.Context(), socket_type=zmq.DEALER, address=self._agent_address, identity=None
        )
        self._connector_storage: SyncObjectStorageConnector = create_sync_object_storage_connector(
            self._object_storage_address.host, self._object_storage_address.port
        )

        self._object_cache = ObjectCache(
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
        )
        self._object_cache.start()

        self.__register_signals()

        # Execute optional preload hook if provided
        if self._preload is not None:
            try:
                execute_preload(self._preload)
            except Exception as e:
                raise RuntimeError(
                    f"Processor[{self.pid}] initialization failed due to preload error: {self._preload}"
                ) from e

    def __register_signals(self):
        """Installs the SIGTERM handler, and the suspend handler when soft
        suspension is enabled (resume_event provided)."""
        self.__register_signal("SIGTERM", self.__interrupt)

        if self._resume_event is not None:
            self.__register_signal(SUSPEND_SIGNAL, self.__suspend)

    def __interrupt(self, *args):
        """SIGTERM handler: tears down the agent connector so the receive loop
        unblocks and __run_forever can exit."""
        self._connector_agent.destroy()  # interrupts any blocking socket.

    def __suspend(self, *args):
        """SUSPEND_SIGNAL handler: blocks the main thread until the agent sets
        resume_event, then acknowledges via resumed_event."""
        assert self._resume_event is not None
        assert self._resumed_event is not None

        self._resume_event.wait()  # stops any computation in the main thread until the event is triggered

        # Ensures the processor agent knows we stopped waiting on `_resume_event`, as to avoid re-entrant wait on the
        # event.
        self._resumed_event.set()

    def __run_forever(self):
        """Announces readiness to the agent, then dispatches incoming messages until
        interrupted; always tears down cache and connector on exit."""
        try:
            self._connector_agent.send(ProcessorInitialized.new_msg())
            while True:
                message = self._connector_agent.receive()
                if message is None:
                    continue

                self.__on_connector_receive(message)

        except zmq.error.ZMQError as e:
            if e.errno != zmq.ENOTSOCK:  # ignore if socket got closed
                raise

        except (KeyboardInterrupt, InterruptedError):
            pass

        except Exception as e:
            logging.exception(f"Processor[{self.pid}]: failed with unhandled exception:\n{e}")

        finally:
            self._object_cache.destroy()
            self._connector_agent.destroy()

            # wait for the cache's background thread/process after its destroy
            self._object_cache.join()

    def __on_connector_receive(self, message: Message):
        """Routes a message from the agent: object instructions and tasks are
        handled; anything else is logged as unknown."""
        if isinstance(message, ObjectInstruction):
            self.__on_receive_object_instruction(message)
            return

        if isinstance(message, Task):
            self.__on_received_task(message)
            return

        logging.error(f"unknown {message=}")

    def __on_receive_object_instruction(self, instruction: ObjectInstruction):
        """Handles Delete instructions by evicting objects from the local cache;
        other instruction types are not expected here and are logged."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            for object_id in instruction.object_metadata.object_ids:
                self._object_cache.del_object(object_id)
            return

        logging.error(f"worker received unknown object instruction type {instruction=}")

    def __on_received_task(self, task: Task):
        """Records the task as current, fetches its required objects, then runs it."""
        self._current_task = task

        self.__cache_required_object_ids(task)

        self.__process_task(task)

    def __cache_required_object_ids(self, task: Task) -> None:
        """Fetches from object storage every object the task needs (serializer,
        function, arguments) that is not already in the local cache."""
        required_object_ids = self.__get_required_object_ids_for_task(task)

        for object_id in required_object_ids:
            if self._object_cache.has_object(object_id):
                continue

            object_content = self._connector_storage.get_object(object_id)
            self._object_cache.add_object(task.source, object_id, object_content)

    @staticmethod
    def __get_required_object_ids_for_task(task: Task) -> List[ObjectID]:
        """Returns the object ids a task depends on: the client's serializer, the
        function object, and every function argument (arguments are object ids)."""
        serializer_id = ObjectID.generate_serializer_object_id(task.source)
        object_ids = [
            serializer_id,
            task.func_object_id,
            *(cast(ObjectID, argument) for argument in task.function_args),
        ]
        return object_ids

    def __process_task(self, task: Task):
        """Executes the task's function with its cached arguments and reports either
        a Success result or a serialized failure to the agent."""
        task_flags = retrieve_task_flags_from_task(task)

        try:
            function = self._object_cache.get_object(task.func_object_id)

            args = [self._object_cache.get_object(cast(ObjectID, arg)) for arg in task.function_args]

            if task_flags.stream_output:
                # stream stdout/stderr back to the agent live via TaskLog messages
                with StreamingBuffer(
                    task.task_id, TaskLog.LogType.Stdout, self._connector_agent
                ) as stdout_buf, StreamingBuffer(
                    task.task_id, TaskLog.LogType.Stderr, self._connector_agent
                ) as stderr_buf, self.__processor_context(), redirect_stdout(
                    cast(IO[str], stdout_buf)
                ), redirect_stderr(
                    cast(IO[str], stderr_buf)
                ):
                    result = function(*args)
            else:
                with self.__processor_context():
                    result = function(*args)

            result_bytes = self._object_cache.serialize(task.source, result)
            task_result_type = TaskResultType.Success

        except Exception as e:
            # any exception from the task (or from object lookup/serialization) is
            # shipped back as a Failed result rather than crashing the processor
            logging.exception(f"exception when processing task_id={task.task_id.hex()}:")
            task_result_type = TaskResultType.Failed
            result_bytes = serialize_failure(e)

        self.__send_result(task.source, task.task_id, task_result_type, result_bytes)

    def __send_result(self, source: ClientID, task_id: TaskID, task_result_type: TaskResultType, result_bytes: bytes):
        """Uploads the result payload to object storage, announces the new object to
        the agent, then sends the TaskResult referencing it; clears the current task."""
        self._current_task = None

        result_object_id = ObjectID.generate_object_id(source)

        # store the payload first so the result object exists before anyone is told about it
        self._connector_storage.set_object(result_object_id, result_bytes)
        self._connector_agent.send(
            ObjectInstruction.new_msg(
                ObjectInstruction.ObjectInstructionType.Create,
                source,
                ObjectMetadata.new_msg(
                    (result_object_id,),
                    (ObjectMetadata.ObjectContentType.Object,),
                    (f"<res {repr(result_object_id)}>".encode(),),
                ),
            )
        )
        self._connector_agent.send(
            TaskResult.new_msg(task_id, task_result_type, metadata=b"", results=[bytes(result_object_id)])
        )

    @staticmethod
    def __set_current_processor(context: Optional["Processor"]) -> Token:
        """Sets (or clears, with None) the process-wide current-processor context
        variable; refuses to overwrite an already-set processor."""
        if context is not None and _current_processor.get() is not None:
            raise ValueError("cannot override a previously set processor context.")

        return _current_processor.set(context)

    @contextlib.contextmanager
    def __processor_context(self):
        """Context manager making this instance visible to task code through
        get_current_processor() for the duration of the task."""
        self.__set_current_processor(self)
        try:
            yield
        finally:
            self.__set_current_processor(None)

    @staticmethod
    def __register_signal(signal_name: str, handler: Callable) -> None:
        """Installs *handler* for the signal named *signal_name*; raises
        RuntimeError if the platform lacks that signal (looked up by name so merely
        importing this module never fails on such platforms)."""
        signal_instance = getattr(signal, signal_name, None)
        if signal_instance is None:
            raise RuntimeError(f"unsupported platform, signal not available: {signal_name}.")

        signal.signal(signal_instance, handler)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from scaler.io.mixins import SyncConnector
|
|
5
|
+
from scaler.protocol.python.message import TaskLog
|
|
6
|
+
from scaler.utility.identifiers import TaskID
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class StreamingBuffer(io.TextIOBase):
    """A custom IO buffer that sends content as it's written.

    Each non-empty write() is forwarded to the worker agent as a TaskLog message
    for the given task, so stdout/stderr of a streaming task reach the client
    live. Sending is best-effort: a failed send is logged, never raised.
    """

    def __init__(self, task_id: "TaskID", log_type: "TaskLog.LogType", connector_agent: "SyncConnector"):
        """Args:
        task_id: the task whose output is being streamed.
        log_type: Stdout or Stderr, tagged on every TaskLog message.
        connector_agent: connector used to send the TaskLog messages.
        """
        super().__init__()
        self._task_id = task_id
        self._log_type = log_type
        self._connector_agent = connector_agent

    def writable(self) -> bool:
        # io.TextIOBase defaults to False; this stream exists solely to be written
        # to (it is handed to redirect_stdout/redirect_stderr), so declare it
        # writable for callers that honor the IO contract.
        return True

    def write(self, content: str) -> int:
        """Forward *content* to the agent and return the number of characters
        consumed, per the io.TextIOBase.write contract.

        Bug fix: the original always returned 0, which tells contract-abiding
        callers that nothing was written. Writes to a closed buffer are still
        silently dropped (returning 0) to preserve the original tolerance for
        late writes.
        """
        if self.closed:
            return 0

        if content:
            try:
                self._connector_agent.send(TaskLog.new_msg(self._task_id, self._log_type, content))
            except Exception as e:
                # best-effort streaming: never let log transport break the task
                logging.warning(f"Failed to send stream content: {e}")

        return len(content)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import multiprocessing
|
|
3
|
+
import os
|
|
4
|
+
import signal
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import psutil
|
|
8
|
+
|
|
9
|
+
from scaler.config.defaults import DEFAULT_PROCESSOR_KILL_DELAY_SECONDS
|
|
10
|
+
from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
|
|
11
|
+
from scaler.config.types.zmq import ZMQConfig
|
|
12
|
+
from scaler.protocol.python.message import Task
|
|
13
|
+
from scaler.utility.identifiers import ProcessorID
|
|
14
|
+
from scaler.worker.agent.processor.processor import SUSPEND_SIGNAL, Processor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProcessorHolder:
    """Agent-side handle for one Processor child process.

    Tracks the child's identity, the task it is working on, and its suspension
    state, and drives its lifecycle (suspend/resume/kill) via OS signals and,
    for soft suspension, a pair of multiprocessing events.
    """

    def __init__(
        self,
        event_loop: str,
        agent_address: ZMQConfig,
        scheduler_address: ZMQConfig,
        object_storage_address: ObjectStorageAddressConfig,
        preload: Optional[str],
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        """Spawns the Processor child process immediately.

        When *hard_suspend* is True, suspension uses SIGSTOP/SIGCONT; otherwise a
        pair of spawn-context events implements a cooperative suspend handshake.
        """
        self._processor_id: Optional[ProcessorID] = None
        self._task: Optional[Task] = None
        self._suspended = False
        self._hard_suspend = hard_suspend

        if hard_suspend:
            # SIGSTOP freezes the whole process; no handshake events required.
            self._resume_event = None
            self._resumed_event = None
        else:
            mp_context = multiprocessing.get_context("spawn")
            self._resume_event = mp_context.Event()
            self._resumed_event = mp_context.Event()

        self._processor = Processor(
            event_loop=event_loop,
            agent_address=agent_address,
            scheduler_address=scheduler_address,
            object_storage_address=object_storage_address,
            preload=preload,
            resume_event=self._resume_event,
            resumed_event=self._resumed_event,
            garbage_collect_interval_seconds=garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=trim_memory_threshold_bytes,
            logging_paths=logging_paths,
            logging_level=logging_level,
        )
        self._processor.start()
        self._process = psutil.Process(self._processor.pid)

    def pid(self) -> int:
        """OS pid of the child process (available because start() has run)."""
        child_pid = self._processor.pid
        assert child_pid is not None
        return child_pid

    def process(self) -> psutil.Process:
        """psutil handle on the child process (for resource monitoring)."""
        return self._process

    def processor_id(self) -> ProcessorID:
        """Identifier announced by the child; only valid once initialized."""
        assert self._processor_id is not None
        return self._processor_id

    def initialized(self) -> bool:
        """True once initialize() has recorded the child's processor id."""
        return self._processor_id is not None

    def initialize(self, processor_id: ProcessorID):
        """Records the processor id the child announced after startup."""
        self._processor_id = processor_id

    def task(self) -> Optional[Task]:
        """Task currently assigned to this processor, or None when idle."""
        return self._task

    def set_task(self, task: Optional[Task]):
        """Assigns the processor's current task; pass None to clear it."""
        self._task = task

    def suspended(self) -> bool:
        """True while the processor is suspended."""
        return self._suspended

    def suspend(self):
        """Pauses a running, initialized processor that holds a task."""
        assert self._processor is not None
        assert self._task is not None
        assert self._suspended is False
        assert self.initialized()

        if self._hard_suspend:
            self.__send_signal("SIGSTOP")
        else:
            # Soft suspension (e.g. to keep network links alive): instead of freezing
            # the whole process, ask it to wait on a synchronization event. The main
            # thread stops while helper threads keep running.
            #
            # See https://github.com/finos/opengris-scaler/issues/14
            assert self._resume_event is not None
            assert self._resumed_event is not None
            self._resume_event.clear()
            self._resumed_event.clear()

            self.__send_signal(SUSPEND_SIGNAL)

        self._suspended = True

    def resume(self):
        """Resumes a previously suspended processor."""
        assert self._task is not None
        assert self._suspended is True

        if self._hard_suspend:
            self.__send_signal("SIGCONT")
        else:
            assert self._resume_event is not None
            assert self._resumed_event is not None

            self._resume_event.set()

            # Block until the child has actually woken up: calling suspend() again
            # while it is still inside `_resumed_event.wait()` would cause a
            # re-entrant error on Linux.
            self._resumed_event.wait()

        self._suspended = False

    def kill(self):
        """Terminates the child, escalating to SIGKILL if it ignores SIGTERM."""
        self.__send_signal("SIGTERM")
        self._processor.join(DEFAULT_PROCESSOR_KILL_DELAY_SECONDS)

        if self._processor.exitcode is None:
            # TODO: some processors fail to interrupt because of a blocking 0mq call. Ideally we should interrupt
            # these blocking calls instead of sending a SIGKILL signal.
            logging.warning(f"Processor[{self.pid()}] does not terminate in time, send SIGKILL.")
            self.__send_signal("SIGKILL")
            self._processor.join()

        self.set_task(None)

    def __send_signal(self, signal_name: str):
        """Sends the named signal to the child, resolving the name lazily so that
        platforms lacking the signal fail here rather than at import time."""
        resolved = getattr(signal, signal_name, None)
        if resolved is None:
            raise RuntimeError(f"unsupported platform, signal not available: {signal_name}.")

        os.kill(self.pid(), resolved)
|