opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/worker/preload.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import importlib
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import traceback
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PreloadSpecError(Exception):
    """Raised when a preload spec is malformed or its target cannot be resolved or called."""


def execute_preload(spec: str) -> None:
    """
    Import and execute the given preload spec in current interpreter.

    Example: 'foo.bar:preload_function("a", 2)'

    :param spec: ``'module.sub:func'`` (called with no arguments) or
        ``'module.sub:func(arg, kw=value)'`` where every argument is a Python literal.
    :raises PreloadSpecError: if the spec cannot be parsed, the attribute is missing or
        not callable, or the call raises ``TypeError``.
    :raises ImportError: if the module cannot be imported (re-raised unchanged unless the
        module part looks like an existing ``.py`` file path, which gets a hint instead).
    """
    module_path, func_name, args, kwargs = _parse_preload_spec(spec)
    logging.info("preloading: %s:%s with args=%s kwargs=%s", module_path, func_name, args, kwargs)

    try:
        module = importlib.import_module(module_path)
    except ImportError:
        # A common mistake is passing a file path ('pkg/mod.py:func') instead of a module path.
        if module_path.endswith(".py") and os.path.exists(module_path):
            raise PreloadSpecError(
                f"Failed to find module. Did you mean '{module_path.rsplit('.', 1)[0]}:{func_name}'?"
            )
        raise

    try:
        target = getattr(module, func_name)
    except AttributeError as e:
        message = f"Failed to find attribute {func_name!r} in {module_path!r}."
        logging.exception(message)
        raise PreloadSpecError(message) from e

    if not callable(target):
        raise PreloadSpecError("Preload target must be callable.")

    try:
        if args is None:
            # Simple name: call with no args
            target()
        else:
            target(*args, **(kwargs or {}))
    except TypeError as e:
        # Typically a signature mismatch between the spec and the target; report it as a spec error.
        raise PreloadSpecError("".join(traceback.format_exception_only(TypeError, e)).strip()) from e


def _parse_preload_spec(spec: str) -> Tuple[str, str, Optional[List[Any]], Optional[Dict[str, Any]]]:
    """
    Parse 'pkg.mod:func(arg1, kw=val)' using AST.
    Returns (module_path, func_name, args_or_None, kwargs_or_None).
    If expression is a simple name (no args), returns args=None, kwargs=None.

    Surrounding whitespace around the module part and the expression is ignored.

    :raises PreloadSpecError: for any malformed spec (missing ':', empty or relative module
        part, non-name function reference, argument unpacking, or non-literal arguments).
    """
    if ":" not in spec:
        raise PreloadSpecError("preload must be in 'module.sub:func(...)' format")

    module_part, obj_expr = (part.strip() for part in spec.split(":", 1))

    # importlib raises ValueError / TypeError (not ImportError) for empty or relative names,
    # so reject them here with the error type callers expect.
    if not module_part or module_part.startswith("."):
        raise PreloadSpecError(f"preload module must be a non-empty absolute module path: {spec!r}")

    # Parse the right-hand side as a single expression
    try:
        expression = ast.parse(obj_expr, mode="eval").body
    except (SyntaxError, ValueError) as e:
        # ValueError covers sources containing null bytes on older interpreters.
        raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.") from e

    if isinstance(expression, ast.Name):
        return module_part, expression.id, None, None

    if not isinstance(expression, ast.Call):
        raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.")

    # Ensure the function name is an attribute name only (no dotted path)
    if not isinstance(expression.func, ast.Name):
        raise PreloadSpecError(f"Function reference must be a simple name: {obj_expr!r}")

    # '*seq' shows up as ast.Starred and '**mapping' as a keyword whose name is None;
    # neither can be turned into a plain positional/keyword literal.
    has_star_args = any(isinstance(arg, ast.Starred) for arg in expression.args)
    has_star_kwargs = any(kw.arg is None for kw in expression.keywords)
    if has_star_args or has_star_kwargs:
        raise PreloadSpecError(f"Argument unpacking ('*' / '**') is not supported: {obj_expr!r}")

    try:
        args = [ast.literal_eval(arg) for arg in expression.args]
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in expression.keywords}
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        # literal_eval documents all of these for malformed or overly complex input.
        raise PreloadSpecError(f"Failed to parse arguments as literal values: {obj_expr!r}") from e

    return module_part, expression.func.id, args, kwargs
|
scaler/worker/worker.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import multiprocessing
|
|
4
|
+
import os
|
|
5
|
+
import signal
|
|
6
|
+
import tempfile
|
|
7
|
+
import uuid
|
|
8
|
+
from typing import Dict, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
import zmq.asyncio
|
|
11
|
+
|
|
12
|
+
from scaler.config.defaults import PROFILING_INTERVAL_SECONDS
|
|
13
|
+
from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
|
|
14
|
+
from scaler.config.types.zmq import ZMQConfig, ZMQType
|
|
15
|
+
from scaler.io.async_binder import ZMQAsyncBinder
|
|
16
|
+
from scaler.io.async_connector import ZMQAsyncConnector
|
|
17
|
+
from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
|
|
18
|
+
from scaler.io.utility import create_async_object_storage_connector
|
|
19
|
+
from scaler.io.ymq import ymq
|
|
20
|
+
from scaler.protocol.python.message import (
|
|
21
|
+
ClientDisconnect,
|
|
22
|
+
DisconnectRequest,
|
|
23
|
+
ObjectInstruction,
|
|
24
|
+
ProcessorInitialized,
|
|
25
|
+
Task,
|
|
26
|
+
TaskCancel,
|
|
27
|
+
TaskLog,
|
|
28
|
+
TaskResult,
|
|
29
|
+
WorkerHeartbeatEcho,
|
|
30
|
+
)
|
|
31
|
+
from scaler.protocol.python.mixins import Message
|
|
32
|
+
from scaler.utility.event_loop import create_async_loop_routine, register_event_loop
|
|
33
|
+
from scaler.utility.exceptions import ClientShutdownException
|
|
34
|
+
from scaler.utility.identifiers import ProcessorID, WorkerID
|
|
35
|
+
from scaler.utility.logging.utility import setup_logger
|
|
36
|
+
from scaler.worker.agent.heartbeat_manager import VanillaHeartbeatManager
|
|
37
|
+
from scaler.worker.agent.processor_manager import VanillaProcessorManager
|
|
38
|
+
from scaler.worker.agent.profiling_manager import VanillaProfilingManager
|
|
39
|
+
from scaler.worker.agent.task_manager import VanillaTaskManager
|
|
40
|
+
from scaler.worker.agent.timeout_manager import VanillaTimeoutManager
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Worker(multiprocessing.get_context("spawn").Process):  # type: ignore
    """
    Worker agent, run as a process created with the "spawn" start method.

    ``run()`` connects to the scheduler address, binds an internal IPC address on which
    processor messages are received, wires the heartbeat, task, timeout, profiling and
    processor managers together, and drives all of their routines on one asyncio loop
    until the loop task is cancelled (SIGINT) or one of the routines raises.
    """

    def __init__(
        self,
        event_loop: str,
        name: str,
        address: ZMQConfig,
        object_storage_address: Optional[ObjectStorageAddressConfig],
        preload: Optional[str],
        capabilities: Dict[str, int],
        io_threads: int,
        task_queue_size: int,
        heartbeat_interval_seconds: int,
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        task_timeout_seconds: int,
        death_timeout_seconds: int,
        hard_processor_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        super().__init__(name="Agent")

        self._event_loop = event_loop
        # NOTE(review): the multiprocessing base class keeps the process name in `_name`, so this
        # replaces the "Agent" name passed above and `self.name` yields `name` - confirm intended.
        self._name = name
        self._address = address
        self._object_storage_address = object_storage_address
        self._preload = preload
        self._capabilities = capabilities
        self._io_threads = io_threads
        self._task_queue_size = task_queue_size

        self._ident = WorkerID.generate_worker_id(name)  # _identity is internal to multiprocessing.Process

        # Unique IPC endpoint used by the internal binder; the file is removed again on shutdown.
        self._address_path_internal = os.path.join(tempfile.gettempdir(), f"scaler_worker_{uuid.uuid4().hex}")
        self._address_internal = ZMQConfig(ZMQType.ipc, host=self._address_path_internal)

        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._task_timeout_seconds = task_timeout_seconds
        self._death_timeout_seconds = death_timeout_seconds
        self._hard_processor_suspend = hard_processor_suspend

        self._logging_paths = logging_paths
        self._logging_level = logging_level

        # Everything below is created in __initialize(), i.e. inside the child process.
        self._context: Optional[zmq.asyncio.Context] = None
        self._connector_external: Optional[AsyncConnector] = None
        self._binder_internal: Optional[AsyncBinder] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._task_manager: Optional[VanillaTaskManager] = None
        self._heartbeat_manager: Optional[VanillaHeartbeatManager] = None
        self._profiling_manager: Optional[VanillaProfilingManager] = None
        self._timeout_manager: Optional[VanillaTimeoutManager] = None
        self._processor_manager: Optional[VanillaProcessorManager] = None

    @property
    def identity(self) -> WorkerID:
        """Worker identifier generated from the ``name`` given to the constructor."""
        return self._ident

    def run(self) -> None:
        """Process entry point: build all components, then block until the loop task finishes."""
        self.__initialize()
        self.__run_forever()

    def __initialize(self):
        """Create connectors and managers, register them with each other and schedule the main task."""
        setup_logger()
        register_event_loop(self._event_loop)

        self._context = zmq.asyncio.Context()
        self._connector_external = ZMQAsyncConnector(
            context=self._context,
            name=self.name,
            socket_type=zmq.DEALER,
            address=self._address,
            bind_or_connect="connect",
            callback=self.__on_receive_external,
            identity=self._ident,
        )

        self._binder_internal = ZMQAsyncBinder(
            context=self._context, name=self.name, address=self._address_internal, identity=self._ident
        )
        self._binder_internal.register(self.__on_receive_internal)

        self._connector_storage = create_async_object_storage_connector()

        self._heartbeat_manager = VanillaHeartbeatManager(
            object_storage_address=self._object_storage_address,
            capabilities=self._capabilities,
            task_queue_size=self._task_queue_size,
        )

        self._profiling_manager = VanillaProfilingManager()
        self._task_manager = VanillaTaskManager(task_timeout_seconds=self._task_timeout_seconds)
        self._timeout_manager = VanillaTimeoutManager(death_timeout_seconds=self._death_timeout_seconds)
        self._processor_manager = VanillaProcessorManager(
            identity=self._ident,
            event_loop=self._event_loop,
            address_internal=self._address_internal,
            scheduler_address=self._address,
            preload=self._preload,
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
            hard_processor_suspend=self._hard_processor_suspend,
            logging_paths=self._logging_paths,
            logging_level=self._logging_level,
        )

        # register
        self._task_manager.register(connector=self._connector_external, processor_manager=self._processor_manager)
        self._heartbeat_manager.register(
            connector_external=self._connector_external,
            connector_storage=self._connector_storage,
            worker_task_manager=self._task_manager,
            timeout_manager=self._timeout_manager,
            processor_manager=self._processor_manager,
        )
        self._processor_manager.register(
            heartbeat_manager=self._heartbeat_manager,
            task_manager=self._task_manager,
            profiling_manager=self._profiling_manager,
            connector_external=self._connector_external,
            binder_internal=self._binder_internal,
            connector_storage=self._connector_storage,
        )

        self._loop = asyncio.get_event_loop()
        self.__register_signal()
        self._task = self._loop.create_task(self.__get_loops())

    async def __on_receive_external(self, message: Message):
        """Dispatch a message received on the external connector to the matching manager."""
        if isinstance(message, WorkerHeartbeatEcho):
            await self._heartbeat_manager.on_heartbeat_echo(message)
            return

        if isinstance(message, Task):
            await self._task_manager.on_task_new(message)
            return

        if isinstance(message, TaskCancel):
            await self._task_manager.on_cancel_task(message)
            return

        if isinstance(message, ObjectInstruction):
            await self._processor_manager.on_external_object_instruction(message)
            return

        if isinstance(message, ClientDisconnect):
            # Only a shutdown request is acted upon; it unwinds the main task via the exception.
            if message.disconnect_type == ClientDisconnect.DisconnectType.Shutdown:
                raise ClientShutdownException("received client shutdown, quitting")
            logging.error(f"Worker received invalid ClientDisconnect type, ignoring {message=}")
            return

        raise TypeError(f"Unknown {message=}")

    async def __on_receive_internal(self, processor_id_bytes: bytes, message: Message):
        """Dispatch a message received on the internal binder, tagged with the sender's processor id."""
        processor_id = ProcessorID(processor_id_bytes)

        if isinstance(message, ProcessorInitialized):
            await self._processor_manager.on_processor_initialized(processor_id, message)
            return

        if isinstance(message, ObjectInstruction):
            await self._processor_manager.on_internal_object_instruction(processor_id, message)
            return

        if isinstance(message, TaskLog):
            # Task logs are forwarded unchanged over the external connector.
            await self._connector_external.send(message)
            return

        if isinstance(message, TaskResult):
            await self._processor_manager.on_task_result(processor_id, message)
            return

        raise TypeError(f"Unknown message from {processor_id!r}: {message}")

    async def __get_loops(self):
        """Run every routine concurrently until one stops, then send a disconnect request and clean up."""
        if self._object_storage_address is not None:
            # With a manually set storage address, immediately connect to the object storage server.
            await self._connector_storage.connect(self._object_storage_address.host, self._object_storage_address.port)

        try:
            await asyncio.gather(
                self._processor_manager.initialize(),
                create_async_loop_routine(self._connector_external.routine, 0),
                create_async_loop_routine(self._connector_storage.routine, 0),
                create_async_loop_routine(self._binder_internal.routine, 0),
                create_async_loop_routine(self._heartbeat_manager.routine, self._heartbeat_interval_seconds),
                create_async_loop_routine(self._timeout_manager.routine, 1),
                create_async_loop_routine(self._task_manager.routine, 0),
                create_async_loop_routine(self._profiling_manager.routine, PROFILING_INTERVAL_SECONDS),
            )
        except asyncio.CancelledError:
            # Raised when __destroy() cancels the main task; fall through to cleanup.
            pass

        # TODO: Should the object storage connector catch this error?
        except ymq.YMQException as e:
            if e.code == ymq.ErrorCode.ConnectorSocketClosedByRemoteEnd:
                pass
            else:
                logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")
        except (ClientShutdownException, TimeoutError) as e:
            logging.info(f"{self.identity!r}: {str(e)}")
        except Exception as e:
            logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")

        await self._connector_external.send(DisconnectRequest.new_msg(self.identity))

        self._connector_external.destroy()
        self._processor_manager.destroy("quit")
        self._binder_internal.destroy()
        try:
            os.remove(self._address_path_internal)
        except FileNotFoundError:
            # The IPC file is already gone; nothing left to clean up, keep shutting down.
            pass

        logging.info(f"{self.identity!r}: quit")

    def __run_forever(self):
        """Block on the event loop until the main task completes."""
        self._loop.run_until_complete(self._task)

    def __register_signal(self):
        """Make SIGINT cancel the main task instead of interrupting the loop."""
        self._loop.add_signal_handler(signal.SIGINT, self.__destroy)

    def __destroy(self):
        """Cancel the main task; cleanup then runs at the end of ``__get_loops``."""
        self._task.cancel()
|
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
WorkerGroupID = bytes
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CapacityExceededError(Exception):
    """Raised when a worker adapter refuses to start another worker group because its limit is reached."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WorkerGroupNotFoundError(Exception):
    """Error type for a worker group that cannot be found (presumably an unknown group id; confirm at call sites)."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def format_capabilities(capabilities: Dict[str, int]) -> str:
    """
    Render a capabilities mapping as a comma-separated string, the inverse of
    `parse_capabilities` (e.g. ``{"linux": -1, "cpu": 4}`` -> ``"linux,cpu=4"``).

    A value of -1 marks a flag-style capability and is written as the bare name;
    any other value is written as ``name=value``. An empty mapping gives ``""``.
    """
    return ",".join(
        label if amount == -1 else f"{label}={amount}" for label, amount in capabilities.items()
    )
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import uuid
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Set
|
|
5
|
+
|
|
6
|
+
import boto3
|
|
7
|
+
from aiohttp import web
|
|
8
|
+
from aiohttp.web_request import Request
|
|
9
|
+
|
|
10
|
+
from scaler.config.section.ecs_worker_adapter import ECSWorkerAdapterConfig
|
|
11
|
+
from scaler.utility.identifiers import WorkerID
|
|
12
|
+
from scaler.worker_adapter.common import (
|
|
13
|
+
CapacityExceededError,
|
|
14
|
+
WorkerGroupID,
|
|
15
|
+
WorkerGroupNotFoundError,
|
|
16
|
+
format_capabilities,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class WorkerGroupInfo:
    """Record kept per worker group: the ids of its workers and the task ARN string it is tied to."""

    # Identifiers of the workers that belong to this group.
    worker_ids: Set[WorkerID]
    # presumably the ARN of the ECS task running the group - confirm where instances are created
    task_arn: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ECSWorkerAdapter:
|
|
27
|
+
def __init__(self, config: ECSWorkerAdapterConfig):
    """
    Copy the adapter settings out of ``config``, create a boto3 ECS client, and make sure
    the configured ECS cluster and task definition exist.

    The cluster is created when it is missing or not ``ACTIVE``. When describing the task
    definition fails with ``ClientException``, a Fargate task definition is registered
    (creating the ``ecsTaskExecutionRole`` IAM role first if needed) and its ARN replaces
    the configured task definition name.
    """
    adapter_settings = config.worker_adapter_config
    worker_settings = config.worker_config

    # Scheduler / object storage endpoints and the limit on concurrently running groups.
    self._address = adapter_settings.scheduler_address
    self._object_storage_address = adapter_settings.object_storage_address
    self._max_instances = adapter_settings.max_workers

    # Settings that are forwarded to every worker.
    self._capabilities = worker_settings.per_worker_capabilities.capabilities
    self._per_worker_task_queue_size = worker_settings.per_worker_task_queue_size
    self._heartbeat_interval_seconds = worker_settings.heartbeat_interval_seconds
    self._task_timeout_seconds = worker_settings.task_timeout_seconds
    self._death_timeout_seconds = worker_settings.death_timeout_seconds
    self._garbage_collect_interval_seconds = worker_settings.garbage_collect_interval_seconds
    self._trim_memory_threshold_bytes = worker_settings.trim_memory_threshold_bytes
    self._hard_processor_suspend = worker_settings.hard_processor_suspend
    self._io_threads = config.worker_io_threads
    self._event_loop = config.event_loop

    # AWS credentials and region.
    self._aws_access_key_id = config.aws_access_key_id
    self._aws_secret_access_key = config.aws_secret_access_key
    self._aws_region = config.aws_region

    # ECS task parameters.
    self._ecs_cluster = config.ecs_cluster
    self._ecs_task_image = config.ecs_task_image
    self._ecs_python_requirements = config.ecs_python_requirements
    self._ecs_python_version = config.ecs_python_version
    self._ecs_task_definition = config.ecs_task_definition
    self._ecs_task_cpu = config.ecs_task_cpu
    self._ecs_task_memory = config.ecs_task_memory
    self._ecs_subnets = config.ecs_subnets

    session = boto3.Session(
        aws_access_key_id=self._aws_access_key_id,
        aws_secret_access_key=self._aws_secret_access_key,
        region_name=self._aws_region,
    )
    self._ecs_client = session.client("ecs")

    # Create the cluster unless the first description returned reports it as ACTIVE.
    cluster_reply = self._ecs_client.describe_clusters(clusters=[self._ecs_cluster])
    described = cluster_reply.get("clusters") or []
    if not (described and described[0]["status"] == "ACTIVE"):
        logging.info(f"ECS cluster '{self._ecs_cluster}' missing, creating it.")
        self._ecs_client.create_cluster(clusterName=self._ecs_cluster)

    self._worker_groups: Dict[WorkerGroupID, WorkerGroupInfo] = {}

    try:
        self._ecs_client.describe_task_definition(taskDefinition=self._ecs_task_definition)
    except self._ecs_client.exceptions.ClientException:
        logging.info(f"ECS task definition '{self._ecs_task_definition}' missing, creating it.")
        iam_client = session.client("iam")
        try:
            role_reply = iam_client.get_role(RoleName="ecsTaskExecutionRole")
            execution_role_arn = role_reply["Role"]["Arn"]
        except iam_client.exceptions.NoSuchEntityException:
            # The role does not exist yet: create it and attach the AWS-managed execution policy.
            role_reply = iam_client.create_role(
                RoleName="ecsTaskExecutionRole",
                AssumeRolePolicyDocument=(
                    '{"Version": "2012-10-17", '
                    '"Statement": [{"Effect": "Allow", '
                    '"Principal": {"Service": "ecs-tasks.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
                ),
            )
            execution_role_arn = role_reply["Role"]["Arn"]
            iam_client.attach_role_policy(
                RoleName="ecsTaskExecutionRole",
                PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy",
            )

        # cpu and memory are sent as strings of the configured values multiplied by 1024.
        registration = self._ecs_client.register_task_definition(
            family=self._ecs_task_definition,
            cpu=str(self._ecs_task_cpu * 1024),
            memory=str(self._ecs_task_memory * 1024),
            runtimePlatform={"cpuArchitecture": "X86_64", "operatingSystemFamily": "LINUX"},
            networkMode="awsvpc",
            containerDefinitions=[{"name": "scaler-container", "image": self._ecs_task_image, "essential": True}],
            requiresCompatibilities=["FARGATE"],
            executionRoleArn=execution_role_arn,
        )
        # From here on, refer to the task definition by the ARN that was just registered.
        self._ecs_task_definition = registration["taskDefinition"]["taskDefinitionArn"]
|
|
104
|
+
|
|
105
|
+
async def start_worker_group(self) -> WorkerGroupID:
    """Launch one Fargate task hosting a new worker group and register it.

    A ``scaler_cluster`` command line is assembled from the adapter settings
    and handed to the ``scaler-container`` container through the ``COMMAND``
    environment variable override. One worker name is generated per
    configured task CPU.

    Returns:
        The identifier under which the new group is stored in
        ``self._worker_groups``.

    Raises:
        CapacityExceededError: the number of tracked groups has reached
            ``self._max_instances`` (``-1`` disables the limit).
        RuntimeError: ECS reported failures, or did not return exactly one task.
    """
    instance_limit = self._max_instances
    if instance_limit != -1 and len(self._worker_groups) >= instance_limit:
        raise CapacityExceededError(f"Maximum number of instances ({self._max_instances}) reached.")

    worker_count = self._ecs_task_cpu
    worker_names = ["ECS|" + uuid.uuid4().hex for _ in range(worker_count)]
    joined_names = ",".join(worker_names)

    # Each entry is one "flag value" pair; they are joined with single spaces below.
    arguments = [
        f"scaler_cluster {self._address.to_address()}",
        f"--num-of-workers {worker_count}",
        f'--worker-names "{joined_names}"',
        f"--per-worker-task-queue-size {self._per_worker_task_queue_size}",
        f"--heartbeat-interval-seconds {self._heartbeat_interval_seconds}",
        f"--task-timeout-seconds {self._task_timeout_seconds}",
        f"--garbage-collect-interval-seconds {self._garbage_collect_interval_seconds}",
        f"--death-timeout-seconds {self._death_timeout_seconds}",
        f"--trim-memory-threshold-bytes {self._trim_memory_threshold_bytes}",
        f"--event-loop {self._event_loop}",
        f"--worker-io-threads {self._io_threads}",
    ]

    if self._hard_processor_suspend:
        arguments.append("--hard-processor-suspend")

    if self._object_storage_address:
        arguments.append(f"--object-storage-address {self._object_storage_address.to_string()}")

    # Skip the flag entirely when the formatted capabilities are blank.
    if format_capabilities(self._capabilities).strip():
        arguments.append(f"--per-worker-capabilities {format_capabilities(self._capabilities)}")

    command = " ".join(arguments)

    container_environment = [
        {"name": "COMMAND", "value": command},
        {"name": "PYTHON_REQUIREMENTS", "value": self._ecs_python_requirements},
        {"name": "PYTHON_VERSION", "value": self._ecs_python_version},
    ]
    container_override = {"name": "scaler-container", "environment": container_environment}
    vpc_configuration = {"subnets": self._ecs_subnets, "assignPublicIp": "ENABLED"}

    run_response = self._ecs_client.run_task(
        cluster=self._ecs_cluster,
        taskDefinition=self._ecs_task_definition,
        launchType="FARGATE",
        overrides={"containerOverrides": [container_override]},
        networkConfiguration={"awsvpcConfiguration": vpc_configuration},
    )

    reported_failures = run_response.get("failures")
    if reported_failures:
        raise RuntimeError(f"ECS run task failed: {reported_failures}")

    launched_tasks = run_response.get("tasks") or []
    if not launched_tasks:
        raise RuntimeError("ECS run task returned no tasks")
    if len(launched_tasks) > 1:
        raise RuntimeError("ECS run task returned multiple tasks, expected only one")

    (launched_task,) = launched_tasks
    group_info = WorkerGroupInfo(
        worker_ids={WorkerID.generate_worker_id(name) for name in worker_names},
        task_arn=launched_task["taskArn"],
    )

    worker_group_id = ("ecs-" + uuid.uuid4().hex).encode()
    self._worker_groups[worker_group_id] = group_info
    return worker_group_id
|
|
168
|
+
|
|
169
|
+
async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
    """Stop the ECS task backing a worker group and forget the group.

    Args:
        worker_group_id: Key of the group in ``self._worker_groups``.

    Raises:
        WorkerGroupNotFoundError: no group is registered under ``worker_group_id``.
        RuntimeError: the ``stop_task`` response carried a non-empty ``failures`` entry.
    """
    group_info = self._worker_groups.get(worker_group_id)
    if group_info is None:
        raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")

    stop_response = self._ecs_client.stop_task(
        cluster=self._ecs_cluster, task=group_info.task_arn, reason="Shutdown requested by ecs adapter"
    )

    reported_failures = stop_response.get("failures")
    if reported_failures:
        raise RuntimeError(f"ECS stop task failed: {reported_failures}")

    # The group is only dropped once the stop request went through without failures.
    del self._worker_groups[worker_group_id]
|
|
183
|
+
|
|
184
|
+
async def webhook_handler(self, request: Request):
    """Dispatch a JSON webhook request to the matching worker-group action.

    The body must be a JSON object with an ``"action"`` key set to one of
    ``"get_worker_adapter_info"``, ``"start_worker_group"`` or
    ``"shutdown_worker_group"``; the last one also needs a string
    ``"worker_group_id"``.

    Args:
        request: The incoming aiohttp request.

    Returns:
        A JSON response. Malformed requests and unknown actions get the
        ``HTTPBadRequest`` status, an unknown worker group ``HTTPNotFound``,
        a reached capacity limit ``HTTPTooManyRequests``, and any other
        failure ``HTTPInternalServerError`` with the error text in ``"error"``.
    """
    bad_request = web.HTTPBadRequest.status_code

    try:
        request_json = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError;
        # a client sending garbage is a client error, not a server crash.
        return web.json_response({"error": "Request body is not valid JSON"}, status=bad_request)

    # Valid JSON is not necessarily an object (e.g. ``null`` or ``42``).
    if not isinstance(request_json, dict):
        return web.json_response({"error": "Request body must be a JSON object"}, status=bad_request)

    if "action" not in request_json:
        return web.json_response({"error": "No action specified"}, status=bad_request)

    action = request_json["action"]

    if action == "get_worker_adapter_info":
        return web.json_response(
            {
                "max_worker_groups": self._max_instances,
                "workers_per_group": self._ecs_task_cpu,
                "base_capabilities": self._capabilities,
            },
            status=web.HTTPOk.status_code,
        )

    elif action == "start_worker_group":
        try:
            worker_group_id = await self.start_worker_group()
        except CapacityExceededError as e:
            return web.json_response({"error": str(e)}, status=web.HTTPTooManyRequests.status_code)
        except Exception as e:
            # Keep the traceback in the adapter log; the client only gets the message.
            logging.exception("Failed to start ECS worker group")
            return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

        return web.json_response(
            {
                "status": "Worker group started",
                "worker_group_id": worker_group_id.decode(),
                "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id].worker_ids],
            },
            status=web.HTTPOk.status_code,
        )

    elif action == "shutdown_worker_group":
        if "worker_group_id" not in request_json:
            return web.json_response({"error": "No worker_group_id specified"}, status=bad_request)

        raw_worker_group_id = request_json["worker_group_id"]
        if not isinstance(raw_worker_group_id, str):
            return web.json_response({"error": "worker_group_id must be a string"}, status=bad_request)

        try:
            worker_group_id = raw_worker_group_id.encode()
        except UnicodeEncodeError:
            # JSON may carry lone surrogates that cannot be encoded as UTF-8.
            return web.json_response({"error": "worker_group_id is not valid text"}, status=bad_request)

        try:
            await self.shutdown_worker_group(worker_group_id)
        except WorkerGroupNotFoundError as e:
            return web.json_response({"error": str(e)}, status=web.HTTPNotFound.status_code)
        except Exception as e:
            logging.exception("Failed to shut down ECS worker group")
            return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

        return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)

    else:
        return web.json_response({"error": "Unknown action"}, status=bad_request)
|
|
237
|
+
|
|
238
|
+
def create_app(self):
    """Build the aiohttp application that serves this adapter's webhook.

    Returns:
        A ``web.Application`` whose ``POST /`` route is bound to
        ``self.webhook_handler``.
    """
    application = web.Application()
    application.router.add_post("/", self.webhook_handler)
    return application
|