opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
scaler/worker/preload.py
@@ -0,0 +1,84 @@
+ import ast
+ import importlib
+ import logging
+ import os
+ import traceback
+ from typing import Any, Dict, List, Optional, Tuple
+
+
+ class PreloadSpecError(Exception):
+     pass
+
+
+ def execute_preload(spec: str) -> None:
+     """
+     Import and execute the given preload spec in current interpreter.
+
+     Example: 'foo.bar:preload_function("a", 2)'
+     """
+     module_path, func_name, args, kwargs = _parse_preload_spec(spec)
+     logging.info("preloading: %s:%s with args=%s kwargs=%s", module_path, func_name, args, kwargs)
+
+     try:
+         module = importlib.import_module(module_path)
+     except ImportError:
+         if module_path.endswith(".py") and os.path.exists(module_path):
+             raise PreloadSpecError(
+                 f"Failed to find module. Did you mean '{module_path.rsplit('.', 1)[0]}:{func_name}'?"
+             )
+         raise
+
+     try:
+         target = getattr(module, func_name)
+     except AttributeError:
+         logging.exception(f"Failed to find attribute {func_name!r} in {module_path!r}.")
+         raise PreloadSpecError(f"Failed to find attribute {func_name!r} in {module_path!r}.")
+
+     if not callable(target):
+         raise PreloadSpecError("Preload target must be callable.")
+
+     try:
+         if args is None:
+             # Simple name: call with no args
+             target()
+         else:
+             target(*args, **(kwargs or {}))
+     except TypeError as e:
+         raise PreloadSpecError("".join(traceback.format_exception_only(TypeError, e)).strip())
+
+
+ def _parse_preload_spec(spec: str) -> Tuple[str, str, Optional[List[Any]], Optional[Dict[str, Any]]]:
+     """
+     Parse 'pkg.mod:func(arg1, kw=val)' using AST.
+     Returns (module_path, func_name, args_or_None, kwargs_or_None).
+     If expression is a simple name (no args), returns args=None, kwargs=None.
+     """
+     if ":" not in spec:
+         raise PreloadSpecError("preload must be in 'module.sub:func(...)' format")
+
+     module_part, obj_expr = spec.split(":", 1)
+
+     # Parse the right-hand side as a single expression
+     try:
+         expression = ast.parse(obj_expr, mode="eval").body
+     except SyntaxError:
+         raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.")
+
+     if isinstance(expression, ast.Name):
+         func_name = expression.id
+         args = None
+         kwargs = None
+     elif isinstance(expression, ast.Call):
+         # Ensure the function name is an attribute name only (no dotted path)
+         if not isinstance(expression.func, ast.Name):
+             raise PreloadSpecError(f"Function reference must be a simple name: {obj_expr!r}")
+         func_name = expression.func.id
+         try:
+             args = [ast.literal_eval(arg) for arg in expression.args]
+             kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in expression.keywords}
+         except ValueError:
+             raise PreloadSpecError(f"Failed to parse arguments as literal values: {obj_expr!r}")
+     else:
+         raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.")
+
+     return module_part, func_name, args, kwargs
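
Usage note for scaler/worker/preload.py above. A minimal sketch of what the spec parser accepts, based only on the code shown; the module and function names in the spec strings are made up for illustration:

from scaler.worker.preload import PreloadSpecError, _parse_preload_spec

# A call spec splits into module path, function name, and literal args/kwargs.
assert _parse_preload_spec("foo.bar:init('a', retries=2)") == ("foo.bar", "init", ["a"], {"retries": 2})

# A bare attribute name returns args=None, kwargs=None; execute_preload() then calls it with no arguments.
assert _parse_preload_spec("foo.bar:init") == ("foo.bar", "init", None, None)

# Anything not in 'module.sub:func(...)' form raises PreloadSpecError.
try:
    _parse_preload_spec("not a spec")
except PreloadSpecError as exc:
    print(exc)  # preload must be in 'module.sub:func(...)' format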
scaler/worker/worker.py
@@ -0,0 +1,265 @@
+ import asyncio
+ import logging
+ import multiprocessing
+ import os
+ import signal
+ import tempfile
+ import uuid
+ from typing import Dict, Optional, Tuple
+
+ import zmq.asyncio
+
+ from scaler.config.defaults import PROFILING_INTERVAL_SECONDS
+ from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
+ from scaler.config.types.zmq import ZMQConfig, ZMQType
+ from scaler.io.async_binder import ZMQAsyncBinder
+ from scaler.io.async_connector import ZMQAsyncConnector
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
+ from scaler.io.utility import create_async_object_storage_connector
+ from scaler.io.ymq import ymq
+ from scaler.protocol.python.message import (
+     ClientDisconnect,
+     DisconnectRequest,
+     ObjectInstruction,
+     ProcessorInitialized,
+     Task,
+     TaskCancel,
+     TaskLog,
+     TaskResult,
+     WorkerHeartbeatEcho,
+ )
+ from scaler.protocol.python.mixins import Message
+ from scaler.utility.event_loop import create_async_loop_routine, register_event_loop
+ from scaler.utility.exceptions import ClientShutdownException
+ from scaler.utility.identifiers import ProcessorID, WorkerID
+ from scaler.utility.logging.utility import setup_logger
+ from scaler.worker.agent.heartbeat_manager import VanillaHeartbeatManager
+ from scaler.worker.agent.processor_manager import VanillaProcessorManager
+ from scaler.worker.agent.profiling_manager import VanillaProfilingManager
+ from scaler.worker.agent.task_manager import VanillaTaskManager
+ from scaler.worker.agent.timeout_manager import VanillaTimeoutManager
+
+
+ class Worker(multiprocessing.get_context("spawn").Process):  # type: ignore
+     def __init__(
+         self,
+         event_loop: str,
+         name: str,
+         address: ZMQConfig,
+         object_storage_address: Optional[ObjectStorageAddressConfig],
+         preload: Optional[str],
+         capabilities: Dict[str, int],
+         io_threads: int,
+         task_queue_size: int,
+         heartbeat_interval_seconds: int,
+         garbage_collect_interval_seconds: int,
+         trim_memory_threshold_bytes: int,
+         task_timeout_seconds: int,
+         death_timeout_seconds: int,
+         hard_processor_suspend: bool,
+         logging_paths: Tuple[str, ...],
+         logging_level: str,
+     ):
+         multiprocessing.Process.__init__(self, name="Agent")
+
+         self._event_loop = event_loop
+         self._name = name
+         self._address = address
+         self._object_storage_address = object_storage_address
+         self._preload = preload
+         self._capabilities = capabilities
+         self._io_threads = io_threads
+         self._task_queue_size = task_queue_size
+
+         self._ident = WorkerID.generate_worker_id(name)  # _identity is internal to multiprocessing.Process
+
+         self._address_path_internal = os.path.join(tempfile.gettempdir(), f"scaler_worker_{uuid.uuid4().hex}")
+         self._address_internal = ZMQConfig(ZMQType.ipc, host=self._address_path_internal)
+
+         self._task_queue_size = task_queue_size
+         self._heartbeat_interval_seconds = heartbeat_interval_seconds
+         self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
+         self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
+         self._task_timeout_seconds = task_timeout_seconds
+         self._death_timeout_seconds = death_timeout_seconds
+         self._hard_processor_suspend = hard_processor_suspend
+
+         self._logging_paths = logging_paths
+         self._logging_level = logging_level
+
+         self._context: Optional[zmq.asyncio.Context] = None
+         self._connector_external: Optional[AsyncConnector] = None
+         self._binder_internal: Optional[AsyncBinder] = None
+         self._connector_storage: Optional[AsyncObjectStorageConnector] = None
+         self._task_manager: Optional[VanillaTaskManager] = None
+         self._heartbeat_manager: Optional[VanillaHeartbeatManager] = None
+         self._profiling_manager: Optional[VanillaProfilingManager] = None
+         self._processor_manager: Optional[VanillaProcessorManager] = None
+
+     @property
+     def identity(self) -> WorkerID:
+         return self._ident
+
+     def run(self) -> None:
+         self.__initialize()
+         self.__run_forever()
+
+     def __initialize(self):
+         setup_logger()
+         register_event_loop(self._event_loop)
+
+         self._context = zmq.asyncio.Context()
+         self._connector_external = ZMQAsyncConnector(
+             context=self._context,
+             name=self.name,
+             socket_type=zmq.DEALER,
+             address=self._address,
+             bind_or_connect="connect",
+             callback=self.__on_receive_external,
+             identity=self._ident,
+         )
+
+         self._binder_internal = ZMQAsyncBinder(
+             context=self._context, name=self.name, address=self._address_internal, identity=self._ident
+         )
+         self._binder_internal.register(self.__on_receive_internal)
+
+         self._connector_storage = create_async_object_storage_connector()
+
+         self._heartbeat_manager = VanillaHeartbeatManager(
+             object_storage_address=self._object_storage_address,
+             capabilities=self._capabilities,
+             task_queue_size=self._task_queue_size,
+         )
+
+         self._profiling_manager = VanillaProfilingManager()
+         self._task_manager = VanillaTaskManager(task_timeout_seconds=self._task_timeout_seconds)
+         self._timeout_manager = VanillaTimeoutManager(death_timeout_seconds=self._death_timeout_seconds)
+         self._processor_manager = VanillaProcessorManager(
+             identity=self._ident,
+             event_loop=self._event_loop,
+             address_internal=self._address_internal,
+             scheduler_address=self._address,
+             preload=self._preload,
+             garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
+             trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
+             hard_processor_suspend=self._hard_processor_suspend,
+             logging_paths=self._logging_paths,
+             logging_level=self._logging_level,
+         )
+
+         # register
+         self._task_manager.register(connector=self._connector_external, processor_manager=self._processor_manager)
+         self._heartbeat_manager.register(
+             connector_external=self._connector_external,
+             connector_storage=self._connector_storage,
+             worker_task_manager=self._task_manager,
+             timeout_manager=self._timeout_manager,
+             processor_manager=self._processor_manager,
+         )
+         self._processor_manager.register(
+             heartbeat_manager=self._heartbeat_manager,
+             task_manager=self._task_manager,
+             profiling_manager=self._profiling_manager,
+             connector_external=self._connector_external,
+             binder_internal=self._binder_internal,
+             connector_storage=self._connector_storage,
+         )
+
+         self._loop = asyncio.get_event_loop()
+         self.__register_signal()
+         self._task = self._loop.create_task(self.__get_loops())
+
+     async def __on_receive_external(self, message: Message):
+         if isinstance(message, WorkerHeartbeatEcho):
+             await self._heartbeat_manager.on_heartbeat_echo(message)
+             return
+
+         if isinstance(message, Task):
+             await self._task_manager.on_task_new(message)
+             return
+
+         if isinstance(message, TaskCancel):
+             await self._task_manager.on_cancel_task(message)
+             return
+
+         if isinstance(message, ObjectInstruction):
+             await self._processor_manager.on_external_object_instruction(message)
+             return
+
+         if isinstance(message, ClientDisconnect):
+             if message.disconnect_type == ClientDisconnect.DisconnectType.Shutdown:
+                 raise ClientShutdownException("received client shutdown, quitting")
+             logging.error(f"Worker received invalid ClientDisconnect type, ignoring {message=}")
+             return
+
+         raise TypeError(f"Unknown {message=}")
+
+     async def __on_receive_internal(self, processor_id_bytes: bytes, message: Message):
+         processor_id = ProcessorID(processor_id_bytes)
+
+         if isinstance(message, ProcessorInitialized):
+             await self._processor_manager.on_processor_initialized(processor_id, message)
+             return
+
+         if isinstance(message, ObjectInstruction):
+             await self._processor_manager.on_internal_object_instruction(processor_id, message)
+             return
+
+         if isinstance(message, TaskLog):
+             await self._connector_external.send(message)
+             return
+
+         if isinstance(message, TaskResult):
+             await self._processor_manager.on_task_result(processor_id, message)
+             return
+
+         raise TypeError(f"Unknown message from {processor_id!r}: {message}")
+
+     async def __get_loops(self):
+         if self._object_storage_address is not None:
+             # With a manually set storage address, immediately connect to the object storage server.
+             await self._connector_storage.connect(self._object_storage_address.host, self._object_storage_address.port)
+
+         try:
+             await asyncio.gather(
+                 self._processor_manager.initialize(),
+                 create_async_loop_routine(self._connector_external.routine, 0),
+                 create_async_loop_routine(self._connector_storage.routine, 0),
+                 create_async_loop_routine(self._binder_internal.routine, 0),
+                 create_async_loop_routine(self._heartbeat_manager.routine, self._heartbeat_interval_seconds),
+                 create_async_loop_routine(self._timeout_manager.routine, 1),
+                 create_async_loop_routine(self._task_manager.routine, 0),
+                 create_async_loop_routine(self._profiling_manager.routine, PROFILING_INTERVAL_SECONDS),
+             )
+         except asyncio.CancelledError:
+             pass
+
+         # TODO: Should the object storage connector catch this error?
+         except ymq.YMQException as e:
+             if e.code == ymq.ErrorCode.ConnectorSocketClosedByRemoteEnd:
+                 pass
+             else:
+                 logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")
+         except (ClientShutdownException, TimeoutError) as e:
+             logging.info(f"{self.identity!r}: {str(e)}")
+         except Exception as e:
+             logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")
+
+         await self._connector_external.send(DisconnectRequest.new_msg(self.identity))
+
+         self._connector_external.destroy()
+         self._processor_manager.destroy("quit")
+         self._binder_internal.destroy()
+         os.remove(self._address_path_internal)
+
+         logging.info(f"{self.identity!r}: quit")
+
+     def __run_forever(self):
+         self._loop.run_until_complete(self._task)
+
+     def __register_signal(self):
+         self._loop.add_signal_handler(signal.SIGINT, self.__destroy)
+
+     def __destroy(self):
+         self._task.cancel()
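
Usage note for scaler/worker/worker.py above. A hypothetical sketch of constructing and starting the agent process; the ZMQConfig tcp keywords, the event loop name, and all numeric values below are assumptions rather than values taken from the package:

from scaler.config.types.zmq import ZMQConfig, ZMQType
from scaler.worker.worker import Worker

# Assumed scheduler endpoint; only the ipc form of ZMQConfig appears in this diff,
# so the tcp host/port keywords here are a guess.
scheduler_address = ZMQConfig(ZMQType.tcp, host="127.0.0.1", port=2345)

worker = Worker(
    event_loop="builtin",                 # assumed event loop name
    name="worker-0",
    address=scheduler_address,
    object_storage_address=None,          # when None, the storage connection is deferred (see __get_loops)
    preload="my_pkg.warmup:init()",       # hypothetical preload spec, parsed by preload.py above
    capabilities={},
    io_threads=1,
    task_queue_size=10,
    heartbeat_interval_seconds=2,
    garbage_collect_interval_seconds=30,
    trim_memory_threshold_bytes=1024**3,
    task_timeout_seconds=0,
    death_timeout_seconds=60,
    hard_processor_suspend=False,
    logging_paths=("/dev/stdout",),
    logging_level="INFO",
)
worker.start()  # spawn the "Agent" process; run() sets up the asyncio loops
worker.join()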
scaler/worker_adapter/__init__.py
File without changes
scaler/worker_adapter/common.py
@@ -0,0 +1,26 @@
+ from typing import Dict
+
+ WorkerGroupID = bytes
+
+
+ class CapacityExceededError(Exception):
+     pass
+
+
+ class WorkerGroupNotFoundError(Exception):
+     pass
+
+
+ def format_capabilities(capabilities: Dict[str, int]) -> str:
+     """
+     Reverse of `parse_capabilities`: convert a capabilities dict into a
+     comma-separated capability string (e.g. "linux,cpu=4").
+     Values equal to -1 are emitted as flag-style entries (no `=value`).
+     """
+     parts = []
+     for name, value in capabilities.items():
+         if value == -1:
+             parts.append(name)
+         else:
+             parts.append(f"{name}={value}")
+     return ",".join(parts)
scaler/worker_adapter/ecs.py
@@ -0,0 +1,241 @@
+ import logging
+ import uuid
+ from dataclasses import dataclass
+ from typing import Dict, Set
+
+ import boto3
+ from aiohttp import web
+ from aiohttp.web_request import Request
+
+ from scaler.config.section.ecs_worker_adapter import ECSWorkerAdapterConfig
+ from scaler.utility.identifiers import WorkerID
+ from scaler.worker_adapter.common import (
+     CapacityExceededError,
+     WorkerGroupID,
+     WorkerGroupNotFoundError,
+     format_capabilities,
+ )
+
+
+ @dataclass
+ class WorkerGroupInfo:
+     worker_ids: Set[WorkerID]
+     task_arn: str
+
+
+ class ECSWorkerAdapter:
+     def __init__(self, config: ECSWorkerAdapterConfig):
+         self._address = config.worker_adapter_config.scheduler_address
+         self._object_storage_address = config.worker_adapter_config.object_storage_address
+         self._capabilities = config.worker_config.per_worker_capabilities.capabilities
+         self._io_threads = config.worker_io_threads
+         self._per_worker_task_queue_size = config.worker_config.per_worker_task_queue_size
+         self._max_instances = config.worker_adapter_config.max_workers
+         self._heartbeat_interval_seconds = config.worker_config.heartbeat_interval_seconds
+         self._task_timeout_seconds = config.worker_config.task_timeout_seconds
+         self._death_timeout_seconds = config.worker_config.death_timeout_seconds
+         self._garbage_collect_interval_seconds = config.worker_config.garbage_collect_interval_seconds
+         self._trim_memory_threshold_bytes = config.worker_config.trim_memory_threshold_bytes
+         self._hard_processor_suspend = config.worker_config.hard_processor_suspend
+         self._event_loop = config.event_loop
+
+         self._aws_access_key_id = config.aws_access_key_id
+         self._aws_secret_access_key = config.aws_secret_access_key
+         self._aws_region = config.aws_region
+
+         self._ecs_cluster = config.ecs_cluster
+         self._ecs_task_image = config.ecs_task_image
+         self._ecs_python_requirements = config.ecs_python_requirements
+         self._ecs_python_version = config.ecs_python_version
+         self._ecs_task_definition = config.ecs_task_definition
+         self._ecs_task_cpu = config.ecs_task_cpu
+         self._ecs_task_memory = config.ecs_task_memory
+         self._ecs_subnets = config.ecs_subnets
+
+         aws_session = boto3.Session(
+             aws_access_key_id=self._aws_access_key_id,
+             aws_secret_access_key=self._aws_secret_access_key,
+             region_name=self._aws_region,
+         )
+         self._ecs_client = aws_session.client("ecs")
+
+         resp = self._ecs_client.describe_clusters(clusters=[self._ecs_cluster])
+         clusters = resp.get("clusters") or []
+         if not clusters or clusters[0]["status"] != "ACTIVE":
+             logging.info(f"ECS cluster '{self._ecs_cluster}' missing, creating it.")
+             self._ecs_client.create_cluster(clusterName=self._ecs_cluster)
+
+         self._worker_groups: Dict[WorkerGroupID, WorkerGroupInfo] = {}
+
+         try:
+             resp = self._ecs_client.describe_task_definition(taskDefinition=self._ecs_task_definition)
+         except self._ecs_client.exceptions.ClientException:
+             logging.info(f"ECS task definition '{self._ecs_task_definition}' missing, creating it.")
+             iam_client = aws_session.client("iam")
+             try:
+                 resp = iam_client.get_role(RoleName="ecsTaskExecutionRole")
+                 execution_role_arn = resp["Role"]["Arn"]
+             except iam_client.exceptions.NoSuchEntityException:
+                 resp = iam_client.create_role(
+                     RoleName="ecsTaskExecutionRole",
+                     AssumeRolePolicyDocument=(
+                         '{"Version": "2012-10-17", '
+                         '"Statement": [{"Effect": "Allow", '
+                         '"Principal": {"Service": "ecs-tasks.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
+                     ),
+                 )
+                 execution_role_arn = resp["Role"]["Arn"]
+                 iam_client.attach_role_policy(
+                     RoleName="ecsTaskExecutionRole",
+                     PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy",
+                 )
+
+             resp = self._ecs_client.register_task_definition(
+                 family=self._ecs_task_definition,
+                 cpu=str(self._ecs_task_cpu * 1024),
+                 memory=str(self._ecs_task_memory * 1024),
+                 runtimePlatform={"cpuArchitecture": "X86_64", "operatingSystemFamily": "LINUX"},
+                 networkMode="awsvpc",
+                 containerDefinitions=[{"name": "scaler-container", "image": self._ecs_task_image, "essential": True}],
+                 requiresCompatibilities=["FARGATE"],
+                 executionRoleArn=execution_role_arn,
+             )
+             self._ecs_task_definition = resp["taskDefinition"]["taskDefinitionArn"]
+
+     async def start_worker_group(self) -> WorkerGroupID:
+         if len(self._worker_groups) >= self._max_instances != -1:
+             raise CapacityExceededError(f"Maximum number of instances ({self._max_instances}) reached.")
+
+         worker_names = [f"ECS|{uuid.uuid4().hex}" for _ in range(self._ecs_task_cpu)]
+         command = (
+             f"scaler_cluster {self._address.to_address()} "
+             f"--num-of-workers {self._ecs_task_cpu} "
+             f"--worker-names \"{','.join(worker_names)}\" "
+             f"--per-worker-task-queue-size {self._per_worker_task_queue_size} "
+             f"--heartbeat-interval-seconds {self._heartbeat_interval_seconds} "
+             f"--task-timeout-seconds {self._task_timeout_seconds} "
+             f"--garbage-collect-interval-seconds {self._garbage_collect_interval_seconds} "
+             f"--death-timeout-seconds {self._death_timeout_seconds} "
+             f"--trim-memory-threshold-bytes {self._trim_memory_threshold_bytes} "
+             f"--event-loop {self._event_loop} "
+             f"--worker-io-threads {self._io_threads}"
+         )
+
+         if self._hard_processor_suspend:
+             command += " --hard-processor-suspend"
+
+         if self._object_storage_address:
+             command += f" --object-storage-address {self._object_storage_address.to_string()}"
+
+         if format_capabilities(self._capabilities).strip():
+             command += f" --per-worker-capabilities {format_capabilities(self._capabilities)}"
+
+         resp = self._ecs_client.run_task(
+             cluster=self._ecs_cluster,
+             taskDefinition=self._ecs_task_definition,
+             launchType="FARGATE",
+             overrides={
+                 "containerOverrides": [
+                     {
+                         "name": "scaler-container",
+                         "environment": [
+                             {"name": "COMMAND", "value": command},
+                             {"name": "PYTHON_REQUIREMENTS", "value": self._ecs_python_requirements},
+                             {"name": "PYTHON_VERSION", "value": self._ecs_python_version},
+                         ],
+                     }
+                 ]
+             },
+             networkConfiguration={"awsvpcConfiguration": {"subnets": self._ecs_subnets, "assignPublicIp": "ENABLED"}},
+         )
+
+         failures = resp.get("failures") or []
+         if failures:
+             raise RuntimeError(f"ECS run task failed: {failures}")
+
+         tasks = resp.get("tasks") or []
+         if not tasks:
+             raise RuntimeError("ECS run task returned no tasks")
+         if len(tasks) > 1:
+             raise RuntimeError("ECS run task returned multiple tasks, expected only one")
+
+         task_arn = tasks[0]["taskArn"]
+         worker_group_id = f"ecs-{uuid.uuid4().hex}".encode()
+         self._worker_groups[worker_group_id] = WorkerGroupInfo(
+             worker_ids={WorkerID.generate_worker_id(worker_name) for worker_name in worker_names}, task_arn=task_arn
+         )
+         return worker_group_id
+
+     async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
+         if worker_group_id not in self._worker_groups:
+             raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")
+
+         resp = self._ecs_client.stop_task(
+             cluster=self._ecs_cluster,
+             task=self._worker_groups[worker_group_id].task_arn,
+             reason="Shutdown requested by ecs adapter",
+         )
+         failures = resp.get("failures") or []
+         if failures:
+             raise RuntimeError(f"ECS stop task failed: {failures}")
+
+         self._worker_groups.pop(worker_group_id)
+
+     async def webhook_handler(self, request: Request):
+         request_json = await request.json()
+
+         if "action" not in request_json:
+             return web.json_response({"error": "No action specified"}, status=web.HTTPBadRequest.status_code)
+
+         action = request_json["action"]
+
+         if action == "get_worker_adapter_info":
+             return web.json_response(
+                 {
+                     "max_worker_groups": self._max_instances,
+                     "workers_per_group": self._ecs_task_cpu,
+                     "base_capabilities": self._capabilities,
+                 },
+                 status=web.HTTPOk.status_code,
+             )
+
+         elif action == "start_worker_group":
+             try:
+                 worker_group_id = await self.start_worker_group()
+             except CapacityExceededError as e:
+                 return web.json_response({"error": str(e)}, status=web.HTTPTooManyRequests.status_code)
+             except Exception as e:
+                 return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)
+
+             return web.json_response(
+                 {
+                     "status": "Worker group started",
+                     "worker_group_id": worker_group_id.decode(),
+                     "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id].worker_ids],
+                 },
+                 status=web.HTTPOk.status_code,
+             )
+
+         elif action == "shutdown_worker_group":
+             if "worker_group_id" not in request_json:
+                 return web.json_response(
+                     {"error": "No worker_group_id specified"}, status=web.HTTPBadRequest.status_code
+                 )
+
+             worker_group_id = request_json["worker_group_id"].encode()
+             try:
+                 await self.shutdown_worker_group(worker_group_id)
+             except WorkerGroupNotFoundError as e:
+                 return web.json_response({"error": str(e)}, status=web.HTTPNotFound.status_code)
+             except Exception as e:
+                 return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)
+
+             return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)
+
+         else:
+             return web.json_response({"error": "Unknown action"}, status=web.HTTPBadRequest.status_code)
+
+     def create_app(self):
+         app = web.Application()
+         app.router.add_post("/", self.webhook_handler)
+         return app
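
Usage note for scaler/worker_adapter/ecs.py above. A hedged sketch of serving the adapter's webhook endpoint with aiohttp and exercising one action; building the ECSWorkerAdapterConfig (AWS credentials, cluster, subnets, task sizing) is elided, and the port is arbitrary:

from aiohttp import ClientSession, web

from scaler.config.section.ecs_worker_adapter import ECSWorkerAdapterConfig
from scaler.worker_adapter.ecs import ECSWorkerAdapter


async def serve_and_probe(config: ECSWorkerAdapterConfig) -> None:
    adapter = ECSWorkerAdapter(config)

    # create_app() registers a single POST route at "/"; serve it on an arbitrary port.
    runner = web.AppRunner(adapter.create_app())
    await runner.setup()
    site = web.TCPSite(runner, "127.0.0.1", 8080)
    await site.start()

    # The handler dispatches on the "action" field of the JSON body.
    async with ClientSession() as session:
        async with session.post("http://127.0.0.1:8080/", json={"action": "get_worker_adapter_info"}) as resp:
            print(await resp.json())  # max_worker_groups, workers_per_group, base_capabilities

    await runner.cleanup()

# Run with asyncio.run(serve_and_probe(config)) once a config object has been assembled.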