opengris-scaler 1.12.7__cp311-cp311-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic. Click here for more details.

Files changed (234)
  1. opengris_scaler-1.12.7.dist-info/METADATA +729 -0
  2. opengris_scaler-1.12.7.dist-info/RECORD +234 -0
  3. opengris_scaler-1.12.7.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.7.dist-info/entry_points.txt +9 -0
  5. opengris_scaler-1.12.7.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.7.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.7.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-61c06778.1.0.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-21b63b70.1.0.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/CMakeLists.txt +11 -0
  13. scaler/__init__.py +14 -0
  14. scaler/about.py +5 -0
  15. scaler/client/__init__.py +0 -0
  16. scaler/client/agent/__init__.py +0 -0
  17. scaler/client/agent/client_agent.py +210 -0
  18. scaler/client/agent/disconnect_manager.py +27 -0
  19. scaler/client/agent/future_manager.py +112 -0
  20. scaler/client/agent/heartbeat_manager.py +74 -0
  21. scaler/client/agent/mixins.py +89 -0
  22. scaler/client/agent/object_manager.py +98 -0
  23. scaler/client/agent/task_manager.py +64 -0
  24. scaler/client/client.py +635 -0
  25. scaler/client/future.py +252 -0
  26. scaler/client/object_buffer.py +129 -0
  27. scaler/client/object_reference.py +25 -0
  28. scaler/client/serializer/__init__.py +0 -0
  29. scaler/client/serializer/default.py +16 -0
  30. scaler/client/serializer/mixins.py +38 -0
  31. scaler/cluster/__init__.py +0 -0
  32. scaler/cluster/cluster.py +115 -0
  33. scaler/cluster/combo.py +148 -0
  34. scaler/cluster/object_storage_server.py +45 -0
  35. scaler/cluster/scheduler.py +83 -0
  36. scaler/config/__init__.py +0 -0
  37. scaler/config/defaults.py +87 -0
  38. scaler/config/loader.py +95 -0
  39. scaler/config/mixins.py +15 -0
  40. scaler/config/section/__init__.py +0 -0
  41. scaler/config/section/cluster.py +56 -0
  42. scaler/config/section/native_worker_adapter.py +44 -0
  43. scaler/config/section/object_storage_server.py +7 -0
  44. scaler/config/section/scheduler.py +53 -0
  45. scaler/config/section/symphony_worker_adapter.py +47 -0
  46. scaler/config/section/top.py +13 -0
  47. scaler/config/section/webui.py +16 -0
  48. scaler/config/types/__init__.py +0 -0
  49. scaler/config/types/object_storage_server.py +45 -0
  50. scaler/config/types/worker.py +57 -0
  51. scaler/config/types/zmq.py +79 -0
  52. scaler/entry_points/__init__.py +0 -0
  53. scaler/entry_points/cluster.py +133 -0
  54. scaler/entry_points/object_storage_server.py +41 -0
  55. scaler/entry_points/scheduler.py +135 -0
  56. scaler/entry_points/top.py +286 -0
  57. scaler/entry_points/webui.py +26 -0
  58. scaler/entry_points/worker_adapter_native.py +137 -0
  59. scaler/entry_points/worker_adapter_symphony.py +102 -0
  60. scaler/io/__init__.py +0 -0
  61. scaler/io/async_binder.py +85 -0
  62. scaler/io/async_connector.py +95 -0
  63. scaler/io/async_object_storage_connector.py +185 -0
  64. scaler/io/mixins.py +154 -0
  65. scaler/io/sync_connector.py +68 -0
  66. scaler/io/sync_object_storage_connector.py +185 -0
  67. scaler/io/sync_subscriber.py +83 -0
  68. scaler/io/utility.py +31 -0
  69. scaler/io/ymq/CMakeLists.txt +98 -0
  70. scaler/io/ymq/__init__.py +0 -0
  71. scaler/io/ymq/_ymq.pyi +96 -0
  72. scaler/io/ymq/_ymq.so +0 -0
  73. scaler/io/ymq/bytes.h +114 -0
  74. scaler/io/ymq/common.h +29 -0
  75. scaler/io/ymq/configuration.h +60 -0
  76. scaler/io/ymq/epoll_context.cpp +185 -0
  77. scaler/io/ymq/epoll_context.h +85 -0
  78. scaler/io/ymq/error.h +132 -0
  79. scaler/io/ymq/event_loop.h +55 -0
  80. scaler/io/ymq/event_loop_thread.cpp +64 -0
  81. scaler/io/ymq/event_loop_thread.h +46 -0
  82. scaler/io/ymq/event_manager.h +81 -0
  83. scaler/io/ymq/file_descriptor.h +203 -0
  84. scaler/io/ymq/interruptive_concurrent_queue.h +169 -0
  85. scaler/io/ymq/io_context.cpp +98 -0
  86. scaler/io/ymq/io_context.h +44 -0
  87. scaler/io/ymq/io_socket.cpp +299 -0
  88. scaler/io/ymq/io_socket.h +121 -0
  89. scaler/io/ymq/iocp_context.cpp +102 -0
  90. scaler/io/ymq/iocp_context.h +83 -0
  91. scaler/io/ymq/logging.h +163 -0
  92. scaler/io/ymq/message.h +15 -0
  93. scaler/io/ymq/message_connection.h +16 -0
  94. scaler/io/ymq/message_connection_tcp.cpp +672 -0
  95. scaler/io/ymq/message_connection_tcp.h +96 -0
  96. scaler/io/ymq/network_utils.h +179 -0
  97. scaler/io/ymq/pymod_ymq/bytes.h +113 -0
  98. scaler/io/ymq/pymod_ymq/exception.h +124 -0
  99. scaler/io/ymq/pymod_ymq/gil.h +15 -0
  100. scaler/io/ymq/pymod_ymq/io_context.h +166 -0
  101. scaler/io/ymq/pymod_ymq/io_socket.h +285 -0
  102. scaler/io/ymq/pymod_ymq/message.h +99 -0
  103. scaler/io/ymq/pymod_ymq/python.h +153 -0
  104. scaler/io/ymq/pymod_ymq/ymq.cpp +23 -0
  105. scaler/io/ymq/pymod_ymq/ymq.h +357 -0
  106. scaler/io/ymq/readme.md +114 -0
  107. scaler/io/ymq/simple_interface.cpp +80 -0
  108. scaler/io/ymq/simple_interface.h +24 -0
  109. scaler/io/ymq/tcp_client.cpp +367 -0
  110. scaler/io/ymq/tcp_client.h +75 -0
  111. scaler/io/ymq/tcp_operations.h +41 -0
  112. scaler/io/ymq/tcp_server.cpp +410 -0
  113. scaler/io/ymq/tcp_server.h +79 -0
  114. scaler/io/ymq/third_party/concurrentqueue.h +3747 -0
  115. scaler/io/ymq/timed_queue.h +272 -0
  116. scaler/io/ymq/timestamp.h +102 -0
  117. scaler/io/ymq/typedefs.h +20 -0
  118. scaler/io/ymq/utils.h +34 -0
  119. scaler/io/ymq/ymq.py +130 -0
  120. scaler/object_storage/CMakeLists.txt +50 -0
  121. scaler/object_storage/__init__.py +0 -0
  122. scaler/object_storage/constants.h +11 -0
  123. scaler/object_storage/defs.h +14 -0
  124. scaler/object_storage/io_helper.cpp +44 -0
  125. scaler/object_storage/io_helper.h +9 -0
  126. scaler/object_storage/message.cpp +56 -0
  127. scaler/object_storage/message.h +130 -0
  128. scaler/object_storage/object_manager.cpp +126 -0
  129. scaler/object_storage/object_manager.h +52 -0
  130. scaler/object_storage/object_storage_server.cpp +359 -0
  131. scaler/object_storage/object_storage_server.h +126 -0
  132. scaler/object_storage/object_storage_server.so +0 -0
  133. scaler/object_storage/pymod_object_storage_server.cpp +104 -0
  134. scaler/protocol/__init__.py +0 -0
  135. scaler/protocol/capnp/__init__.py +0 -0
  136. scaler/protocol/capnp/_python.py +6 -0
  137. scaler/protocol/capnp/common.capnp +63 -0
  138. scaler/protocol/capnp/message.capnp +216 -0
  139. scaler/protocol/capnp/object_storage.capnp +52 -0
  140. scaler/protocol/capnp/status.capnp +73 -0
  141. scaler/protocol/introduction.md +105 -0
  142. scaler/protocol/python/__init__.py +0 -0
  143. scaler/protocol/python/common.py +135 -0
  144. scaler/protocol/python/message.py +726 -0
  145. scaler/protocol/python/mixins.py +13 -0
  146. scaler/protocol/python/object_storage.py +118 -0
  147. scaler/protocol/python/status.py +279 -0
  148. scaler/protocol/worker.md +228 -0
  149. scaler/scheduler/__init__.py +0 -0
  150. scaler/scheduler/allocate_policy/__init__.py +0 -0
  151. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  152. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  153. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  154. scaler/scheduler/allocate_policy/mixins.py +55 -0
  155. scaler/scheduler/controllers/__init__.py +0 -0
  156. scaler/scheduler/controllers/balance_controller.py +65 -0
  157. scaler/scheduler/controllers/client_controller.py +131 -0
  158. scaler/scheduler/controllers/config_controller.py +31 -0
  159. scaler/scheduler/controllers/graph_controller.py +424 -0
  160. scaler/scheduler/controllers/information_controller.py +81 -0
  161. scaler/scheduler/controllers/mixins.py +201 -0
  162. scaler/scheduler/controllers/object_controller.py +147 -0
  163. scaler/scheduler/controllers/scaling_controller.py +86 -0
  164. scaler/scheduler/controllers/task_controller.py +373 -0
  165. scaler/scheduler/controllers/worker_controller.py +168 -0
  166. scaler/scheduler/object_usage/__init__.py +0 -0
  167. scaler/scheduler/object_usage/object_tracker.py +131 -0
  168. scaler/scheduler/scheduler.py +253 -0
  169. scaler/scheduler/task/__init__.py +0 -0
  170. scaler/scheduler/task/task_state_machine.py +92 -0
  171. scaler/scheduler/task/task_state_manager.py +61 -0
  172. scaler/ui/__init__.py +0 -0
  173. scaler/ui/constants.py +9 -0
  174. scaler/ui/live_display.py +118 -0
  175. scaler/ui/memory_window.py +146 -0
  176. scaler/ui/setting_page.py +47 -0
  177. scaler/ui/task_graph.py +370 -0
  178. scaler/ui/task_log.py +83 -0
  179. scaler/ui/utility.py +35 -0
  180. scaler/ui/webui.py +125 -0
  181. scaler/ui/worker_processors.py +85 -0
  182. scaler/utility/__init__.py +0 -0
  183. scaler/utility/debug.py +19 -0
  184. scaler/utility/event_list.py +63 -0
  185. scaler/utility/event_loop.py +58 -0
  186. scaler/utility/exceptions.py +42 -0
  187. scaler/utility/formatter.py +44 -0
  188. scaler/utility/graph/__init__.py +0 -0
  189. scaler/utility/graph/optimization.py +27 -0
  190. scaler/utility/graph/topological_sorter.py +11 -0
  191. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  192. scaler/utility/identifiers.py +105 -0
  193. scaler/utility/logging/__init__.py +0 -0
  194. scaler/utility/logging/decorators.py +25 -0
  195. scaler/utility/logging/scoped_logger.py +33 -0
  196. scaler/utility/logging/utility.py +183 -0
  197. scaler/utility/many_to_many_dict.py +123 -0
  198. scaler/utility/metadata/__init__.py +0 -0
  199. scaler/utility/metadata/profile_result.py +31 -0
  200. scaler/utility/metadata/task_flags.py +30 -0
  201. scaler/utility/mixins.py +13 -0
  202. scaler/utility/network_util.py +7 -0
  203. scaler/utility/one_to_many_dict.py +72 -0
  204. scaler/utility/queues/__init__.py +0 -0
  205. scaler/utility/queues/async_indexed_queue.py +37 -0
  206. scaler/utility/queues/async_priority_queue.py +70 -0
  207. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  208. scaler/utility/queues/indexed_queue.py +114 -0
  209. scaler/utility/serialization.py +9 -0
  210. scaler/version.txt +1 -0
  211. scaler/worker/__init__.py +0 -0
  212. scaler/worker/agent/__init__.py +0 -0
  213. scaler/worker/agent/heartbeat_manager.py +107 -0
  214. scaler/worker/agent/mixins.py +137 -0
  215. scaler/worker/agent/processor/__init__.py +0 -0
  216. scaler/worker/agent/processor/object_cache.py +107 -0
  217. scaler/worker/agent/processor/processor.py +279 -0
  218. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  219. scaler/worker/agent/processor_holder.py +145 -0
  220. scaler/worker/agent/processor_manager.py +365 -0
  221. scaler/worker/agent/profiling_manager.py +109 -0
  222. scaler/worker/agent/task_manager.py +150 -0
  223. scaler/worker/agent/timeout_manager.py +19 -0
  224. scaler/worker/preload.py +84 -0
  225. scaler/worker/worker.py +264 -0
  226. scaler/worker_adapter/__init__.py +0 -0
  227. scaler/worker_adapter/native.py +154 -0
  228. scaler/worker_adapter/symphony/__init__.py +0 -0
  229. scaler/worker_adapter/symphony/callback.py +45 -0
  230. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  231. scaler/worker_adapter/symphony/message.py +24 -0
  232. scaler/worker_adapter/symphony/task_manager.py +288 -0
  233. scaler/worker_adapter/symphony/worker.py +205 -0
  234. scaler/worker_adapter/symphony/worker_adapter.py +142 -0
@@ -0,0 +1,84 @@
1
+ import ast
2
+ import importlib
3
+ import logging
4
+ import os
5
+ import traceback
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+
9
class PreloadSpecError(Exception):
    """Raised when a worker preload spec cannot be parsed, resolved, or called."""

    pass
11
+
12
+
13
def execute_preload(spec: str) -> None:
    """
    Import and execute the given preload spec in current interpreter.

    Example: 'foo.bar:preload_function("a", 2)'

    Raises:
        PreloadSpecError: if the spec is malformed, the target attribute is
            missing or not callable, or the call fails with a ``TypeError``
            (typically a signature mismatch).
        ImportError: re-raised as-is when the module cannot be imported and
            the spec does not look like a mistakenly supplied file path.
    """
    module_path, func_name, args, kwargs = _parse_preload_spec(spec)
    logging.info("preloading: %s:%s with args=%s kwargs=%s", module_path, func_name, args, kwargs)

    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        # A common mistake is passing a file path ("pkg/mod.py") instead of a
        # dotted module path; give a targeted hint in that case.
        if module_path.endswith(".py") and os.path.exists(module_path):
            raise PreloadSpecError(
                f"Failed to find module. Did you mean '{module_path.rsplit('.', 1)[0]}:{func_name}'?"
            ) from e
        raise

    try:
        target = getattr(module, func_name)
    except AttributeError as e:
        logging.exception(f"Failed to find attribute {func_name!r} in {module_path!r}.")
        raise PreloadSpecError(f"Failed to find attribute {func_name!r} in {module_path!r}.") from e

    if not callable(target):
        raise PreloadSpecError("Preload target must be callable.")

    try:
        if args is None:
            # Simple name: call with no args
            target()
        else:
            target(*args, **(kwargs or {}))
    except TypeError as e:
        # Surface only the one-line exception text to the caller; chain the
        # original so the full traceback is preserved for debugging.
        raise PreloadSpecError("".join(traceback.format_exception_only(TypeError, e)).strip()) from e
48
+
49
+
50
+ def _parse_preload_spec(spec: str) -> Tuple[str, str, Optional[List[Any]], Optional[Dict[str, Any]]]:
51
+ """
52
+ Parse 'pkg.mod:func(arg1, kw=val)' using AST.
53
+ Returns (module_path, func_name, args_or_None, kwargs_or_None).
54
+ If expression is a simple name (no args), returns args=None, kwargs=None.
55
+ """
56
+ if ":" not in spec:
57
+ raise PreloadSpecError("preload must be in 'module.sub:func(...)' format")
58
+
59
+ module_part, obj_expr = spec.split(":", 1)
60
+
61
+ # Parse the right-hand side as a single expression
62
+ try:
63
+ expression = ast.parse(obj_expr, mode="eval").body
64
+ except SyntaxError:
65
+ raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.")
66
+
67
+ if isinstance(expression, ast.Name):
68
+ func_name = expression.id
69
+ args = None
70
+ kwargs = None
71
+ elif isinstance(expression, ast.Call):
72
+ # Ensure the function name is an attribute name only (no dotted path)
73
+ if not isinstance(expression.func, ast.Name):
74
+ raise PreloadSpecError(f"Function reference must be a simple name: {obj_expr!r}")
75
+ func_name = expression.func.id
76
+ try:
77
+ args = [ast.literal_eval(arg) for arg in expression.args]
78
+ kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in expression.keywords}
79
+ except ValueError:
80
+ raise PreloadSpecError(f"Failed to parse arguments as literal values: {obj_expr!r}")
81
+ else:
82
+ raise PreloadSpecError(f"Failed to parse {obj_expr!r} as an attribute name or function call.")
83
+
84
+ return module_part, func_name, args, kwargs
@@ -0,0 +1,264 @@
1
+ import asyncio
2
+ import logging
3
+ import multiprocessing
4
+ import os
5
+ import signal
6
+ import tempfile
7
+ import uuid
8
+ from typing import Dict, Optional, Tuple
9
+
10
+ import zmq.asyncio
11
+
12
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
13
+ from scaler.config.types.zmq import ZMQConfig, ZMQType
14
+ from scaler.io.async_binder import ZMQAsyncBinder
15
+ from scaler.io.async_connector import ZMQAsyncConnector
16
+ from scaler.io.async_object_storage_connector import PyAsyncObjectStorageConnector
17
+ from scaler.config.defaults import PROFILING_INTERVAL_SECONDS
18
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
19
+ from scaler.io.ymq import ymq
20
+ from scaler.protocol.python.message import (
21
+ ClientDisconnect,
22
+ DisconnectRequest,
23
+ ObjectInstruction,
24
+ ProcessorInitialized,
25
+ Task,
26
+ TaskCancel,
27
+ TaskLog,
28
+ TaskResult,
29
+ WorkerHeartbeatEcho,
30
+ )
31
+ from scaler.protocol.python.mixins import Message
32
+ from scaler.utility.event_loop import create_async_loop_routine, register_event_loop
33
+ from scaler.utility.exceptions import ClientShutdownException
34
+ from scaler.utility.identifiers import ProcessorID, WorkerID
35
+ from scaler.utility.logging.utility import setup_logger
36
+ from scaler.worker.agent.heartbeat_manager import VanillaHeartbeatManager
37
+ from scaler.worker.agent.processor_manager import VanillaProcessorManager
38
+ from scaler.worker.agent.profiling_manager import VanillaProfilingManager
39
+ from scaler.worker.agent.task_manager import VanillaTaskManager
40
+ from scaler.worker.agent.timeout_manager import VanillaTimeoutManager
41
+
42
+
43
class Worker(multiprocessing.get_context("spawn").Process):  # type: ignore
    """
    Worker agent process.

    Runs as a separate 'spawn' process: connects a DEALER socket to the
    scheduler address, binds an internal IPC socket for its processors, and
    drives the task / heartbeat / profiling / timeout managers on a single
    asyncio event loop until cancelled (SIGINT) or shut down by the client.
    """

    def __init__(
        self,
        event_loop: str,
        name: str,
        address: ZMQConfig,
        storage_address: Optional[ObjectStorageConfig],
        preload: Optional[str],
        capabilities: Dict[str, int],
        io_threads: int,
        task_queue_size: int,
        heartbeat_interval_seconds: int,
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        task_timeout_seconds: int,
        death_timeout_seconds: int,
        hard_processor_suspend: bool,
        logging_paths: Tuple[str, ...],
        logging_level: str,
    ):
        multiprocessing.Process.__init__(self, name="Agent")

        self._event_loop = event_loop
        self._name = name
        self._address = address
        self._storage_address = storage_address
        self._preload = preload
        self._capabilities = capabilities
        self._io_threads = io_threads
        self._task_queue_size = task_queue_size

        self._ident = WorkerID.generate_worker_id(name)  # _identity is internal to multiprocessing.Process

        # Per-worker IPC endpoint used by processor subprocesses; unique temp
        # path, removed again in __get_loops() on shutdown.
        self._address_path_internal = os.path.join(tempfile.gettempdir(), f"scaler_worker_{uuid.uuid4().hex}")
        self._address_internal = ZMQConfig(ZMQType.ipc, host=self._address_path_internal)

        # NOTE(review): duplicate assignment — _task_queue_size was already set above.
        self._task_queue_size = task_queue_size
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._task_timeout_seconds = task_timeout_seconds
        self._death_timeout_seconds = death_timeout_seconds
        self._hard_processor_suspend = hard_processor_suspend

        self._logging_paths = logging_paths
        self._logging_level = logging_level

        # All I/O objects and managers are created in __initialize(), which
        # runs inside the child process (spawn) — not in the parent.
        self._context: Optional[zmq.asyncio.Context] = None
        self._connector_external: Optional[AsyncConnector] = None
        self._binder_internal: Optional[AsyncBinder] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._task_manager: Optional[VanillaTaskManager] = None
        self._heartbeat_manager: Optional[VanillaHeartbeatManager] = None
        self._profiling_manager: Optional[VanillaProfilingManager] = None
        self._processor_manager: Optional[VanillaProcessorManager] = None

    @property
    def identity(self) -> WorkerID:
        """This worker's wire identity (also used as the ZMQ socket identity)."""
        return self._ident

    def run(self) -> None:
        """Process entry point: build all components, then run the event loop."""
        self.__initialize()
        self.__run_forever()

    def __initialize(self):
        """Create sockets and managers, wire them together, and schedule the main task."""
        setup_logger()
        register_event_loop(self._event_loop)

        self._context = zmq.asyncio.Context()
        # DEALER socket connecting out to the scheduler; incoming messages are
        # dispatched through __on_receive_external.
        self._connector_external = ZMQAsyncConnector(
            context=self._context,
            name=self.name,
            socket_type=zmq.DEALER,
            address=self._address,
            bind_or_connect="connect",
            callback=self.__on_receive_external,
            identity=self._ident,
        )

        # Internal binder for processor subprocesses (IPC endpoint created above).
        self._binder_internal = ZMQAsyncBinder(
            context=self._context, name=self.name, address=self._address_internal, identity=self._ident
        )
        self._binder_internal.register(self.__on_receive_internal)

        self._connector_storage = PyAsyncObjectStorageConnector()

        self._heartbeat_manager = VanillaHeartbeatManager(
            storage_address=self._storage_address,
            capabilities=self._capabilities,
            task_queue_size=self._task_queue_size,
        )

        self._profiling_manager = VanillaProfilingManager()
        self._task_manager = VanillaTaskManager(task_timeout_seconds=self._task_timeout_seconds)
        self._timeout_manager = VanillaTimeoutManager(death_timeout_seconds=self._death_timeout_seconds)
        self._processor_manager = VanillaProcessorManager(
            identity=self._ident,
            event_loop=self._event_loop,
            address_internal=self._address_internal,
            preload=self._preload,
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
            hard_processor_suspend=self._hard_processor_suspend,
            logging_paths=self._logging_paths,
            logging_level=self._logging_level,
        )

        # register
        self._task_manager.register(connector=self._connector_external, processor_manager=self._processor_manager)
        self._heartbeat_manager.register(
            connector_external=self._connector_external,
            connector_storage=self._connector_storage,
            worker_task_manager=self._task_manager,
            timeout_manager=self._timeout_manager,
            processor_manager=self._processor_manager,
        )
        self._processor_manager.register(
            heartbeat_manager=self._heartbeat_manager,
            task_manager=self._task_manager,
            profiling_manager=self._profiling_manager,
            connector_external=self._connector_external,
            binder_internal=self._binder_internal,
            connector_storage=self._connector_storage,
        )

        self._loop = asyncio.get_event_loop()
        self.__register_signal()
        self._task = self._loop.create_task(self.__get_loops())

    async def __on_receive_external(self, message: Message):
        """Dispatch a message from the scheduler to the appropriate manager.

        Raises TypeError for unrecognized message types and
        ClientShutdownException on a client-initiated shutdown (both are
        handled in __get_loops).
        """
        if isinstance(message, WorkerHeartbeatEcho):
            await self._heartbeat_manager.on_heartbeat_echo(message)
            return

        if isinstance(message, Task):
            await self._task_manager.on_task_new(message)
            return

        if isinstance(message, TaskCancel):
            await self._task_manager.on_cancel_task(message)
            return

        if isinstance(message, ObjectInstruction):
            await self._processor_manager.on_external_object_instruction(message)
            return

        if isinstance(message, ClientDisconnect):
            if message.disconnect_type == ClientDisconnect.DisconnectType.Shutdown:
                raise ClientShutdownException("received client shutdown, quitting")
            logging.error(f"Worker received invalid ClientDisconnect type, ignoring {message=}")
            return

        raise TypeError(f"Unknown {message=}")

    async def __on_receive_internal(self, processor_id_bytes: bytes, message: Message):
        """Dispatch a message received from one of this worker's processors."""
        processor_id = ProcessorID(processor_id_bytes)

        if isinstance(message, ProcessorInitialized):
            await self._processor_manager.on_processor_initialized(processor_id, message)
            return

        if isinstance(message, ObjectInstruction):
            await self._processor_manager.on_internal_object_instruction(processor_id, message)
            return

        if isinstance(message, TaskLog):
            # Task logs are forwarded straight to the scheduler unmodified.
            await self._connector_external.send(message)
            return

        if isinstance(message, TaskResult):
            await self._processor_manager.on_task_result(processor_id, message)
            return

        raise TypeError(f"Unknown message from {processor_id!r}: {message}")

    async def __get_loops(self):
        """Run all component routines concurrently until error/cancel, then clean up."""
        if self._storage_address is not None:
            # With a manually set storage address, immediately connect to the object storage server.
            await self._connector_storage.connect(self._storage_address.host, self._storage_address.port)

        try:
            await asyncio.gather(
                self._processor_manager.initialize(),
                create_async_loop_routine(self._connector_external.routine, 0),
                create_async_loop_routine(self._connector_storage.routine, 0),
                create_async_loop_routine(self._binder_internal.routine, 0),
                create_async_loop_routine(self._heartbeat_manager.routine, self._heartbeat_interval_seconds),
                create_async_loop_routine(self._timeout_manager.routine, 1),
                create_async_loop_routine(self._task_manager.routine, 0),
                create_async_loop_routine(self._profiling_manager.routine, PROFILING_INTERVAL_SECONDS),
            )
        except asyncio.CancelledError:
            # Normal shutdown path: __destroy() cancelled the gathered task.
            pass

        # TODO: Should the object storage connector catch this error?
        except ymq.YMQException as e:
            if e.code == ymq.ErrorCode.ConnectorSocketClosedByRemoteEnd:
                pass
            else:
                logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")
        except (ClientShutdownException, TimeoutError) as e:
            logging.info(f"{self.identity!r}: {str(e)}")
        except Exception as e:
            logging.exception(f"{self.identity!r}: failed with unhandled exception:\n{e}")

        # Tell the scheduler we are leaving, then tear down sockets and the
        # processor pool, and remove the IPC socket file created in __init__.
        await self._connector_external.send(DisconnectRequest.new_msg(self.identity))

        self._connector_external.destroy()
        self._processor_manager.destroy("quit")
        self._binder_internal.destroy()
        os.remove(self._address_path_internal)

        logging.info(f"{self.identity!r}: quit")

    def __run_forever(self):
        # Blocks until __get_loops finishes (including its cleanup section).
        self._loop.run_until_complete(self._task)

    def __register_signal(self):
        # SIGINT triggers a graceful cancellation of the main task.
        self._loop.add_signal_handler(signal.SIGINT, self.__destroy)

    def __destroy(self):
        self._task.cancel()
File without changes
@@ -0,0 +1,154 @@
1
+ import os
2
+ import signal
3
+ import uuid
4
+ from typing import Dict, Optional, Tuple
5
+
6
+ from aiohttp import web
7
+ from aiohttp.web_request import Request
8
+
9
+ from scaler.utility.identifiers import WorkerID
10
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
11
+ from scaler.config.types.zmq import ZMQConfig
12
+ from scaler.worker.worker import Worker
13
+
14
+ WorkerGroupID = bytes
15
+
16
+
17
+ class CapacityExceededError(Exception):
18
+ pass
19
+
20
+
21
+ class WorkerGroupNotFoundError(Exception):
22
+ pass
23
+
24
+
25
class NativeWorkerAdapter:
    """
    HTTP adapter that starts/stops local Worker processes on demand.

    Exposes a single POST webhook (see create_app) with a JSON body of the
    form {"action": "start_worker_group"} or
    {"action": "shutdown_worker_group", "worker_group_id": "..."}.
    """

    def __init__(
        self,
        address: ZMQConfig,
        storage_address: Optional[ObjectStorageConfig],
        capabilities: Dict[str, int],
        io_threads: int,
        task_queue_size: int,
        max_workers: int,
        heartbeat_interval_seconds: int,
        task_timeout_seconds: int,
        death_timeout_seconds: int,
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_processor_suspend: bool,
        event_loop: str,
        logging_paths: Tuple[str, ...],
        logging_level: str,
        logging_config_file: Optional[str],
    ):
        # All Worker construction parameters are stored verbatim and passed
        # through to each Worker created by start_worker_group().
        self._address = address
        self._storage_address = storage_address
        self._capabilities = capabilities
        self._io_threads = io_threads
        self._task_queue_size = task_queue_size
        # max_workers == -1 means unlimited (see the capacity check below).
        self._max_workers = max_workers
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._task_timeout_seconds = task_timeout_seconds
        self._death_timeout_seconds = death_timeout_seconds
        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._hard_processor_suspend = hard_processor_suspend
        self._event_loop = event_loop
        self._logging_paths = logging_paths
        self._logging_level = logging_level
        # NOTE(review): stored but not used anywhere visible in this class.
        self._logging_config_file = logging_config_file

        """
        Although a worker group can contain multiple workers, in this native adapter implementation,
        each worker group will only contain one worker.
        """
        self._worker_groups: Dict[WorkerGroupID, Dict[WorkerID, Worker]] = {}

    async def start_worker_group(self) -> WorkerGroupID:
        """Spawn one Worker in a new group and return the new group's ID.

        Raises CapacityExceededError when the total worker count has reached
        max_workers (unless max_workers is -1, i.e. unlimited).
        """
        num_of_workers = sum(len(workers) for workers in self._worker_groups.values())
        # Chained comparison: num_of_workers >= max_workers AND max_workers != -1.
        if num_of_workers >= self._max_workers != -1:
            raise CapacityExceededError(f"Maximum number of workers ({self._max_workers}) reached.")

        worker = Worker(
            name=uuid.uuid4().hex,
            address=self._address,
            storage_address=self._storage_address,
            preload=None,
            capabilities=self._capabilities,
            io_threads=self._io_threads,
            task_queue_size=self._task_queue_size,
            heartbeat_interval_seconds=self._heartbeat_interval_seconds,
            task_timeout_seconds=self._task_timeout_seconds,
            death_timeout_seconds=self._death_timeout_seconds,
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
            hard_processor_suspend=self._hard_processor_suspend,
            event_loop=self._event_loop,
            logging_paths=self._logging_paths,
            logging_level=self._logging_level,
        )

        worker.start()
        worker_group_id = f"native-{uuid.uuid4().hex}".encode()
        self._worker_groups[worker_group_id] = {worker.identity: worker}
        return worker_group_id

    async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """SIGINT every worker in the group, wait for exit, then forget the group.

        Raises WorkerGroupNotFoundError for an unknown group ID.
        """
        if worker_group_id not in self._worker_groups:
            raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")

        for worker in self._worker_groups[worker_group_id].values():
            # Workers install a SIGINT handler that cancels their main task,
            # so this triggers a graceful shutdown; join() blocks the event
            # loop until the process exits.
            os.kill(worker.pid, signal.SIGINT)
            worker.join()

        self._worker_groups.pop(worker_group_id)

    async def webhook_handler(self, request: Request):
        """Dispatch a JSON webhook request to start/shutdown a worker group."""
        request_json = await request.json()

        if "action" not in request_json:
            return web.json_response({"error": "No action specified"}, status=web.HTTPBadRequest.status_code)

        action = request_json["action"]

        if action == "start_worker_group":
            try:
                worker_group_id = await self.start_worker_group()
            except CapacityExceededError as e:
                # Capacity exhaustion maps to 429 so clients can back off.
                return web.json_response({"error": str(e)}, status=web.HTTPTooManyRequests.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response(
                {
                    "status": "Worker group started",
                    "worker_group_id": worker_group_id.decode(),
                    "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id].keys()],
                },
                status=web.HTTPOk.status_code,
            )

        elif action == "shutdown_worker_group":
            if "worker_group_id" not in request_json:
                return web.json_response(
                    {"error": "No worker_group_id specified"}, status=web.HTTPBadRequest.status_code
                )

            worker_group_id = request_json["worker_group_id"].encode()
            try:
                await self.shutdown_worker_group(worker_group_id)
            except WorkerGroupNotFoundError as e:
                return web.json_response({"error": str(e)}, status=web.HTTPNotFound.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)

        else:
            return web.json_response({"error": "Unknown action"}, status=web.HTTPBadRequest.status_code)

    def create_app(self):
        """Build the aiohttp application with the webhook mounted at POST /."""
        app = web.Application()
        app.router.add_post("/", self.webhook_handler)
        return app
+ return app
File without changes
@@ -0,0 +1,45 @@
1
+ import concurrent.futures
2
+ import threading
3
+ from typing import Dict
4
+
5
+ import cloudpickle
6
+
7
+ from scaler.worker_adapter.symphony.message import SoamMessage
8
+
9
+ try:
10
+ import soamapi
11
+ except ImportError:
12
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
13
+
14
+
15
class SessionCallback(soamapi.SessionCallback):
    """Bridges Symphony session callbacks to ``concurrent.futures`` futures.

    Each submitted task id is mapped to a future via :meth:`submit_task`;
    Symphony invokes :meth:`on_response` / :meth:`on_exception` on its own
    callback thread, and the result or error is forwarded to the matching
    future(s). Callers coordinate with the callback thread through
    :meth:`get_callback_lock`.
    """

    def __init__(self):
        self._callback_lock = threading.Lock()
        self._task_id_to_future: Dict[str, concurrent.futures.Future] = {}

    def on_response(self, task_output_handle):
        """Resolve the future registered for the task that just finished."""
        with self._callback_lock:
            future = self._task_id_to_future.pop(task_output_handle.get_id())

            if not task_output_handle.is_successful():
                # Propagate the remote failure to the waiting caller.
                future.set_exception(task_output_handle.get_exception().get_embedded_exception())
                return

            output_message = SoamMessage()
            task_output_handle.populate_task_output(output_message)
            future.set_result(cloudpickle.loads(output_message.get_payload()))

    def on_exception(self, exception):
        """Fail every outstanding future with the session-level exception."""
        with self._callback_lock:
            while self._task_id_to_future:
                _, pending = self._task_id_to_future.popitem()
                pending.set_exception(exception)

    def submit_task(self, task_id: str, future: concurrent.futures.Future):
        # NOTE(review): no lock taken here — presumably the caller holds
        # get_callback_lock() around submission; confirm at the call site.
        self._task_id_to_future[task_id] = future

    def get_callback_lock(self) -> threading.Lock:
        return self._callback_lock
@@ -0,0 +1,79 @@
1
+ import time
2
+ from typing import Dict, Optional
3
+
4
+ import psutil
5
+
6
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
7
+ from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
8
+ from scaler.protocol.python.message import WorkerHeartbeat, WorkerHeartbeatEcho
9
+ from scaler.protocol.python.status import Resource
10
+ from scaler.utility.mixins import Looper
11
+ from scaler.worker.agent.mixins import HeartbeatManager, TimeoutManager
12
+ from scaler.worker_adapter.symphony.task_manager import SymphonyTaskManager
13
+
14
+
15
class SymphonyHeartbeatManager(Looper, HeartbeatManager):
    """Periodically reports worker health to the scheduler and handles echoes.

    One heartbeat is kept in flight at a time: :meth:`routine` sends a
    ``WorkerHeartbeat`` and records the send timestamp, and
    :meth:`on_heartbeat_echo` clears it once the scheduler answers.
    """

    def __init__(
        self, storage_address: Optional[ObjectStorageConfig], capabilities: Dict[str, int], task_queue_size: int
    ):
        self._capabilities = capabilities
        self._task_queue_size = task_queue_size

        # handle on our own process, sampled for CPU / RSS in each heartbeat
        self._agent_process = psutil.Process()

        # collaborators; wired in later via register()
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._worker_task_manager: Optional[SymphonyTaskManager] = None
        self._timeout_manager: Optional[TimeoutManager] = None

        # non-zero only while a heartbeat is in flight awaiting its echo
        self._start_timestamp_ns = 0
        self._latency_us = 0

        self._storage_address: Optional[ObjectStorageConfig] = storage_address

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        worker_task_manager: SymphonyTaskManager,
        timeout_manager: TimeoutManager,
    ):
        """Wire in the collaborators this manager reports to and reads from."""
        self._connector_external = connector_external
        self._connector_storage = connector_storage
        self._worker_task_manager = worker_task_manager
        self._timeout_manager = timeout_manager

    async def on_heartbeat_echo(self, heartbeat: WorkerHeartbeatEcho):
        """Record round-trip latency and lazily connect to object storage."""
        sent_at_ns = self._start_timestamp_ns
        if sent_at_ns == 0:
            # no heartbeat in flight — nothing to match this echo against
            return

        # one-way latency estimate in microseconds (half the round trip)
        self._latency_us = int(((time.time_ns() - sent_at_ns) / 2) // 1_000)
        self._start_timestamp_ns = 0
        self._timeout_manager.update_last_seen_time()

        if self._storage_address is None:
            # the first echo carries the storage endpoint; connect exactly once
            address_message = heartbeat.object_storage_address()
            self._storage_address = ObjectStorageConfig(address_message.host, address_message.port)
            await self._connector_storage.connect(self._storage_address.host, self._storage_address.port)

    def get_storage_address(self) -> Optional[ObjectStorageConfig]:
        return self._storage_address

    async def routine(self):
        """Send one heartbeat unless the previous one is still unacknowledged."""
        if self._start_timestamp_ns != 0:
            return

        process = self._agent_process
        await self._connector_external.send(
            WorkerHeartbeat.new_msg(
                Resource.new_msg(int(process.cpu_percent() * 10), process.memory_info().rss),
                psutil.virtual_memory().available,
                self._task_queue_size,
                self._worker_task_manager.get_queued_size(),
                self._latency_us,
                self._worker_task_manager.can_accept_task(),
                [],
                self._capabilities,
            )
        )
        self._start_timestamp_ns = time.time_ns()
@@ -0,0 +1,24 @@
1
+ import array
2
+
3
+ try:
4
+ import soamapi
5
+ except ImportError:
6
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
7
+
8
+
9
class SoamMessage(soamapi.Message):
    """A Symphony message carrying a single opaque byte payload."""

    def __init__(self, payload: bytes = b""):
        self.__payload = payload

    def set_payload(self, payload: bytes):
        self.__payload = payload

    def get_payload(self) -> bytes:
        return self.__payload

    def on_serialize(self, stream):
        """Write the payload to the Symphony stream as a signed-byte array."""
        encoded = array.array("b", self.get_payload())
        stream.write_byte_array(encoded, 0, len(encoded))

    def on_deserialize(self, stream):
        """Read the payload back from the Symphony stream."""
        # NOTE(review): read_byte_array("b") may yield an array rather than
        # bytes — confirm against the soamapi docs if the payload type matters.
        self.set_payload(stream.read_byte_array("b"))