opengris-scaler 1.12.28__cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic. Click here for more details.

Files changed (187) hide show
  1. opengris_scaler-1.12.28.dist-info/METADATA +728 -0
  2. opengris_scaler-1.12.28.dist-info/RECORD +187 -0
  3. opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +210 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +658 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +115 -0
  32. scaler/cluster/combo.py +150 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/defaults.py +94 -0
  37. scaler/config/loader.py +96 -0
  38. scaler/config/mixins.py +20 -0
  39. scaler/config/section/__init__.py +0 -0
  40. scaler/config/section/cluster.py +55 -0
  41. scaler/config/section/ecs_worker_adapter.py +85 -0
  42. scaler/config/section/native_worker_adapter.py +43 -0
  43. scaler/config/section/object_storage_server.py +8 -0
  44. scaler/config/section/scheduler.py +54 -0
  45. scaler/config/section/symphony_worker_adapter.py +47 -0
  46. scaler/config/section/top.py +13 -0
  47. scaler/config/section/webui.py +21 -0
  48. scaler/config/types/__init__.py +0 -0
  49. scaler/config/types/network_backend.py +12 -0
  50. scaler/config/types/object_storage_server.py +45 -0
  51. scaler/config/types/worker.py +62 -0
  52. scaler/config/types/zmq.py +83 -0
  53. scaler/entry_points/__init__.py +0 -0
  54. scaler/entry_points/cluster.py +133 -0
  55. scaler/entry_points/object_storage_server.py +45 -0
  56. scaler/entry_points/scheduler.py +144 -0
  57. scaler/entry_points/top.py +286 -0
  58. scaler/entry_points/webui.py +48 -0
  59. scaler/entry_points/worker_adapter_ecs.py +191 -0
  60. scaler/entry_points/worker_adapter_native.py +137 -0
  61. scaler/entry_points/worker_adapter_symphony.py +98 -0
  62. scaler/io/__init__.py +0 -0
  63. scaler/io/async_binder.py +89 -0
  64. scaler/io/async_connector.py +95 -0
  65. scaler/io/async_object_storage_connector.py +225 -0
  66. scaler/io/mixins.py +154 -0
  67. scaler/io/sync_connector.py +68 -0
  68. scaler/io/sync_object_storage_connector.py +247 -0
  69. scaler/io/sync_subscriber.py +83 -0
  70. scaler/io/utility.py +80 -0
  71. scaler/io/ymq/__init__.py +0 -0
  72. scaler/io/ymq/_ymq.pyi +95 -0
  73. scaler/io/ymq/ymq.py +138 -0
  74. scaler/io/ymq_async_object_storage_connector.py +184 -0
  75. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  76. scaler/object_storage/__init__.py +0 -0
  77. scaler/protocol/__init__.py +0 -0
  78. scaler/protocol/capnp/__init__.py +0 -0
  79. scaler/protocol/capnp/_python.py +6 -0
  80. scaler/protocol/capnp/common.capnp +68 -0
  81. scaler/protocol/capnp/message.capnp +218 -0
  82. scaler/protocol/capnp/object_storage.capnp +57 -0
  83. scaler/protocol/capnp/status.capnp +73 -0
  84. scaler/protocol/introduction.md +105 -0
  85. scaler/protocol/python/__init__.py +0 -0
  86. scaler/protocol/python/common.py +140 -0
  87. scaler/protocol/python/message.py +751 -0
  88. scaler/protocol/python/mixins.py +13 -0
  89. scaler/protocol/python/object_storage.py +118 -0
  90. scaler/protocol/python/status.py +279 -0
  91. scaler/protocol/worker.md +228 -0
  92. scaler/scheduler/__init__.py +0 -0
  93. scaler/scheduler/allocate_policy/__init__.py +0 -0
  94. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  95. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  96. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  97. scaler/scheduler/allocate_policy/mixins.py +55 -0
  98. scaler/scheduler/controllers/__init__.py +0 -0
  99. scaler/scheduler/controllers/balance_controller.py +65 -0
  100. scaler/scheduler/controllers/client_controller.py +131 -0
  101. scaler/scheduler/controllers/config_controller.py +31 -0
  102. scaler/scheduler/controllers/graph_controller.py +424 -0
  103. scaler/scheduler/controllers/information_controller.py +81 -0
  104. scaler/scheduler/controllers/mixins.py +194 -0
  105. scaler/scheduler/controllers/object_controller.py +147 -0
  106. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  107. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  108. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  109. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  110. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  111. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  112. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  113. scaler/scheduler/controllers/task_controller.py +376 -0
  114. scaler/scheduler/controllers/worker_controller.py +169 -0
  115. scaler/scheduler/object_usage/__init__.py +0 -0
  116. scaler/scheduler/object_usage/object_tracker.py +131 -0
  117. scaler/scheduler/scheduler.py +251 -0
  118. scaler/scheduler/task/__init__.py +0 -0
  119. scaler/scheduler/task/task_state_machine.py +92 -0
  120. scaler/scheduler/task/task_state_manager.py +61 -0
  121. scaler/ui/__init__.py +0 -0
  122. scaler/ui/constants.py +9 -0
  123. scaler/ui/live_display.py +147 -0
  124. scaler/ui/memory_window.py +146 -0
  125. scaler/ui/setting_page.py +40 -0
  126. scaler/ui/task_graph.py +832 -0
  127. scaler/ui/task_log.py +107 -0
  128. scaler/ui/utility.py +66 -0
  129. scaler/ui/webui.py +147 -0
  130. scaler/ui/worker_processors.py +104 -0
  131. scaler/utility/__init__.py +0 -0
  132. scaler/utility/debug.py +19 -0
  133. scaler/utility/event_list.py +63 -0
  134. scaler/utility/event_loop.py +58 -0
  135. scaler/utility/exceptions.py +42 -0
  136. scaler/utility/formatter.py +44 -0
  137. scaler/utility/graph/__init__.py +0 -0
  138. scaler/utility/graph/optimization.py +27 -0
  139. scaler/utility/graph/topological_sorter.py +11 -0
  140. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  141. scaler/utility/identifiers.py +107 -0
  142. scaler/utility/logging/__init__.py +0 -0
  143. scaler/utility/logging/decorators.py +25 -0
  144. scaler/utility/logging/scoped_logger.py +33 -0
  145. scaler/utility/logging/utility.py +183 -0
  146. scaler/utility/many_to_many_dict.py +123 -0
  147. scaler/utility/metadata/__init__.py +0 -0
  148. scaler/utility/metadata/profile_result.py +31 -0
  149. scaler/utility/metadata/task_flags.py +30 -0
  150. scaler/utility/mixins.py +13 -0
  151. scaler/utility/network_util.py +7 -0
  152. scaler/utility/one_to_many_dict.py +72 -0
  153. scaler/utility/queues/__init__.py +0 -0
  154. scaler/utility/queues/async_indexed_queue.py +37 -0
  155. scaler/utility/queues/async_priority_queue.py +70 -0
  156. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  157. scaler/utility/queues/indexed_queue.py +114 -0
  158. scaler/utility/serialization.py +9 -0
  159. scaler/version.txt +1 -0
  160. scaler/worker/__init__.py +0 -0
  161. scaler/worker/agent/__init__.py +0 -0
  162. scaler/worker/agent/heartbeat_manager.py +107 -0
  163. scaler/worker/agent/mixins.py +137 -0
  164. scaler/worker/agent/processor/__init__.py +0 -0
  165. scaler/worker/agent/processor/object_cache.py +107 -0
  166. scaler/worker/agent/processor/processor.py +285 -0
  167. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  168. scaler/worker/agent/processor_holder.py +147 -0
  169. scaler/worker/agent/processor_manager.py +369 -0
  170. scaler/worker/agent/profiling_manager.py +109 -0
  171. scaler/worker/agent/task_manager.py +150 -0
  172. scaler/worker/agent/timeout_manager.py +19 -0
  173. scaler/worker/preload.py +84 -0
  174. scaler/worker/worker.py +265 -0
  175. scaler/worker_adapter/__init__.py +0 -0
  176. scaler/worker_adapter/common.py +26 -0
  177. scaler/worker_adapter/ecs.py +269 -0
  178. scaler/worker_adapter/native.py +155 -0
  179. scaler/worker_adapter/symphony/__init__.py +0 -0
  180. scaler/worker_adapter/symphony/callback.py +45 -0
  181. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  182. scaler/worker_adapter/symphony/message.py +24 -0
  183. scaler/worker_adapter/symphony/task_manager.py +289 -0
  184. scaler/worker_adapter/symphony/worker.py +204 -0
  185. scaler/worker_adapter/symphony/worker_adapter.py +139 -0
  186. src/scaler/io/ymq/_ymq.so +0 -0
  187. src/scaler/object_storage/object_storage_server.so +0 -0
@@ -0,0 +1,155 @@
1
+ import os
2
+ import signal
3
+ import uuid
4
+ from typing import Dict, Optional, Tuple
5
+
6
+ from aiohttp import web
7
+ from aiohttp.web_request import Request
8
+
9
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
10
+ from scaler.config.types.zmq import ZMQConfig
11
+ from scaler.utility.identifiers import WorkerID
12
+ from scaler.worker.worker import Worker
13
+ from scaler.worker_adapter.common import CapacityExceededError, WorkerGroupID, WorkerGroupNotFoundError
14
+
15
+
16
class NativeWorkerAdapter:
    """Starts and manages local ``Worker`` processes on behalf of a scheduler.

    Worker groups are exposed over an aiohttp webhook (see :meth:`create_app`).
    Although a worker group can contain multiple workers in general, in this
    native adapter implementation each worker group contains exactly one worker.
    """

    def __init__(
        self,
        address: ZMQConfig,
        object_storage_address: Optional[ObjectStorageConfig],
        capabilities: Dict[str, int],
        io_threads: int,
        task_queue_size: int,
        max_workers: int,
        heartbeat_interval_seconds: int,
        task_timeout_seconds: int,
        death_timeout_seconds: int,
        garbage_collect_interval_seconds: int,
        trim_memory_threshold_bytes: int,
        hard_processor_suspend: bool,
        event_loop: str,
        logging_paths: Tuple[str, ...],
        logging_level: str,
        logging_config_file: Optional[str],
    ):
        self._address = address
        self._object_storage_address = object_storage_address
        self._capabilities = capabilities
        self._io_threads = io_threads
        self._task_queue_size = task_queue_size
        # -1 means "no worker limit" (checked in start_worker_group).
        self._max_workers = max_workers
        self._heartbeat_interval_seconds = heartbeat_interval_seconds
        self._task_timeout_seconds = task_timeout_seconds
        self._death_timeout_seconds = death_timeout_seconds
        self._garbage_collect_interval_seconds = garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = trim_memory_threshold_bytes
        self._hard_processor_suspend = hard_processor_suspend
        self._event_loop = event_loop
        self._logging_paths = logging_paths
        self._logging_level = logging_level
        self._logging_config_file = logging_config_file

        # Each worker group maps to exactly one Worker in this implementation.
        self._worker_groups: Dict[WorkerGroupID, Dict[WorkerID, Worker]] = {}

    async def start_worker_group(self) -> WorkerGroupID:
        """Spawn one new worker process wrapped in a fresh worker group.

        Returns:
            The newly generated worker group id.

        Raises:
            CapacityExceededError: if the configured worker limit is reached.
        """
        num_of_workers = sum(len(workers) for workers in self._worker_groups.values())
        # A limit of -1 disables the capacity check entirely.
        if self._max_workers != -1 and num_of_workers >= self._max_workers:
            raise CapacityExceededError(f"Maximum number of workers ({self._max_workers}) reached.")

        worker = Worker(
            name=f"NAT|{uuid.uuid4().hex}",
            address=self._address,
            object_storage_address=self._object_storage_address,
            preload=None,
            capabilities=self._capabilities,
            io_threads=self._io_threads,
            task_queue_size=self._task_queue_size,
            heartbeat_interval_seconds=self._heartbeat_interval_seconds,
            task_timeout_seconds=self._task_timeout_seconds,
            death_timeout_seconds=self._death_timeout_seconds,
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
            hard_processor_suspend=self._hard_processor_suspend,
            event_loop=self._event_loop,
            logging_paths=self._logging_paths,
            logging_level=self._logging_level,
        )

        worker.start()
        worker_group_id = f"native-{uuid.uuid4().hex}".encode()
        self._worker_groups[worker_group_id] = {worker.identity: worker}
        return worker_group_id

    async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """Stop every worker in the group (SIGINT, then join) and forget it.

        Raises:
            WorkerGroupNotFoundError: if the group id is unknown.
        """
        if worker_group_id not in self._worker_groups:
            raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")

        for worker in self._worker_groups[worker_group_id].values():
            # SIGINT lets the worker process run its normal shutdown path.
            os.kill(worker.pid, signal.SIGINT)
            worker.join()

        self._worker_groups.pop(worker_group_id)

    async def webhook_handler(self, request: Request):
        """Dispatch webhook actions: info query, start and shutdown of groups.

        Returns an aiohttp JSON response; errors are mapped to 4xx/5xx codes.
        """
        # A malformed body would otherwise surface as an unhandled 500; report
        # it as a client error instead.
        try:
            request_json = await request.json()
        except Exception:
            return web.json_response({"error": "Invalid JSON body"}, status=web.HTTPBadRequest.status_code)

        if "action" not in request_json:
            return web.json_response({"error": "No action specified"}, status=web.HTTPBadRequest.status_code)

        action = request_json["action"]

        if action == "get_worker_adapter_info":
            return web.json_response(
                {
                    "max_worker_groups": self._max_workers,
                    "workers_per_group": 1,
                    "base_capabilities": self._capabilities,
                },
                status=web.HTTPOk.status_code,
            )

        elif action == "start_worker_group":
            try:
                worker_group_id = await self.start_worker_group()
            except CapacityExceededError as e:
                return web.json_response({"error": str(e)}, status=web.HTTPTooManyRequests.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response(
                {
                    "status": "Worker group started",
                    "worker_group_id": worker_group_id.decode(),
                    "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id].keys()],
                },
                status=web.HTTPOk.status_code,
            )

        elif action == "shutdown_worker_group":
            if "worker_group_id" not in request_json:
                return web.json_response(
                    {"error": "No worker_group_id specified"}, status=web.HTTPBadRequest.status_code
                )

            worker_group_id = request_json["worker_group_id"].encode()
            try:
                await self.shutdown_worker_group(worker_group_id)
            except WorkerGroupNotFoundError as e:
                return web.json_response({"error": str(e)}, status=web.HTTPNotFound.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)

        else:
            return web.json_response({"error": "Unknown action"}, status=web.HTTPBadRequest.status_code)

    def create_app(self):
        """Build the aiohttp application routing POST / to the webhook."""
        app = web.Application()
        app.router.add_post("/", self.webhook_handler)
        return app
File without changes
@@ -0,0 +1,45 @@
1
+ import concurrent.futures
2
+ import threading
3
+ from typing import Dict
4
+
5
+ import cloudpickle
6
+
7
+ from scaler.worker_adapter.symphony.message import SoamMessage
8
+
9
+ try:
10
+ import soamapi
11
+ except ImportError:
12
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
13
+
14
+
15
class SessionCallback(soamapi.SessionCallback):
    """Bridges IBM Spectrum Symphony session callbacks to concurrent futures.

    Symphony invokes :meth:`on_response` / :meth:`on_exception` from its own
    callback thread; the internal lock serializes those callbacks against task
    submission (callers obtain it via :meth:`get_callback_lock`).
    """

    def __init__(self):
        self._callback_lock = threading.Lock()
        # Maps a Symphony task id to the future awaiting its result.
        self._task_id_to_future: Dict[str, concurrent.futures.Future] = {}

    def on_response(self, task_output_handle):
        """Resolve the future registered for a completed Symphony task."""
        with self._callback_lock:
            task_id = task_output_handle.get_id()

            # A late response can arrive after on_exception() already failed
            # and dropped every pending future; ignore it rather than letting
            # a KeyError escape into Symphony's callback thread.
            future = self._task_id_to_future.pop(task_id, None)
            if future is None:
                return

            if task_output_handle.is_successful():
                output_message = SoamMessage()
                task_output_handle.populate_task_output(output_message)
                result = cloudpickle.loads(output_message.get_payload())
                future.set_result(result)
            else:
                future.set_exception(task_output_handle.get_exception().get_embedded_exception())

    def on_exception(self, exception):
        """Fail every pending future when the session reports an exception."""
        with self._callback_lock:
            for future in self._task_id_to_future.values():
                future.set_exception(exception)

            self._task_id_to_future.clear()

    def submit_task(self, task_id: str, future: concurrent.futures.Future):
        """Register *future* as the receiver of the result for *task_id*.

        NOTE(review): callers appear expected to hold the callback lock while
        submitting so a response cannot race the registration — confirm at
        call sites.
        """
        self._task_id_to_future[task_id] = future

    def get_callback_lock(self) -> threading.Lock:
        """Return the lock guarding the task-id → future mapping."""
        return self._callback_lock
@@ -0,0 +1,79 @@
1
+ import time
2
+ from typing import Dict, Optional
3
+
4
+ import psutil
5
+
6
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
7
+ from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
8
+ from scaler.protocol.python.message import WorkerHeartbeat, WorkerHeartbeatEcho
9
+ from scaler.protocol.python.status import Resource
10
+ from scaler.utility.mixins import Looper
11
+ from scaler.worker.agent.mixins import HeartbeatManager, TimeoutManager
12
+ from scaler.worker_adapter.symphony.task_manager import SymphonyTaskManager
13
+
14
+
15
class SymphonyHeartbeatManager(Looper, HeartbeatManager):
    """Sends worker heartbeats to the scheduler and consumes heartbeat echoes.

    A heartbeat is only sent when the previous one has been echoed back; the
    round-trip is used to estimate latency and, on the first echo, to discover
    the object storage address when none was configured.
    """

    def __init__(
        self, object_storage_address: Optional[ObjectStorageConfig], capabilities: Dict[str, int], task_queue_size: int
    ):
        self._capabilities = capabilities
        self._task_queue_size = task_queue_size

        # Used to sample this agent's own CPU/RSS for the heartbeat payload.
        self._agent_process = psutil.Process()

        # Collaborators are wired in later via register().
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._worker_task_manager: Optional[SymphonyTaskManager] = None
        self._timeout_manager: Optional[TimeoutManager] = None

        # Non-zero while a heartbeat is in flight (awaiting its echo).
        self._start_timestamp_ns = 0
        self._latency_us = 0

        self._object_storage_address: Optional[ObjectStorageConfig] = object_storage_address

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        worker_task_manager: SymphonyTaskManager,
        timeout_manager: TimeoutManager,
    ):
        """Attach the connectors and managers this looper depends on."""
        self._connector_external = connector_external
        self._connector_storage = connector_storage
        self._worker_task_manager = worker_task_manager
        self._timeout_manager = timeout_manager

    async def on_heartbeat_echo(self, heartbeat: WorkerHeartbeatEcho):
        """Handle the scheduler's echo of our most recent heartbeat."""
        if self._start_timestamp_ns == 0:
            # not handling echo if we didn't send out heartbeat
            return

        # Half the round trip, converted from nanoseconds to microseconds.
        self._latency_us = int(((time.time_ns() - self._start_timestamp_ns) / 2) // 1_000)
        self._start_timestamp_ns = 0
        self._timeout_manager.update_last_seen_time()

        if self._object_storage_address is None:
            # First echo: adopt the storage address advertised by the
            # scheduler and open the storage connection.
            advertised = heartbeat.object_storage_address()
            self._object_storage_address = ObjectStorageConfig(advertised.host, advertised.port)
            await self._connector_storage.connect(self._object_storage_address.host, self._object_storage_address.port)

    def get_object_storage_address(self) -> Optional[ObjectStorageConfig]:
        """Return the storage address, configured or discovered (may be None)."""
        return self._object_storage_address

    async def routine(self):
        """Send one heartbeat, unless the previous one is still unanswered."""
        if self._start_timestamp_ns != 0:
            return

        cpu_and_rss = Resource.new_msg(
            int(self._agent_process.cpu_percent() * 10), self._agent_process.memory_info().rss
        )
        message = WorkerHeartbeat.new_msg(
            cpu_and_rss,
            psutil.virtual_memory().available,
            self._task_queue_size,
            self._worker_task_manager.get_queued_size(),
            self._latency_us,
            self._worker_task_manager.can_accept_task(),
            [],
            self._capabilities,
        )
        await self._connector_external.send(message)
        self._start_timestamp_ns = time.time_ns()
@@ -0,0 +1,24 @@
1
+ import array
2
+
3
+ try:
4
+ import soamapi
5
+ except ImportError:
6
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
7
+
8
+
9
class SoamMessage(soamapi.Message):
    """A soamapi.Message carrying a single opaque bytes payload."""

    def __init__(self, payload: bytes = b""):
        self.__payload = payload

    def set_payload(self, payload: bytes):
        """Replace the payload bytes."""
        self.__payload = payload

    def get_payload(self) -> bytes:
        """Return the payload bytes."""
        return self.__payload

    def on_serialize(self, stream):
        """Write the payload onto a Symphony stream as a signed byte array."""
        buffer = array.array("b", self.get_payload())
        stream.write_byte_array(buffer, 0, len(buffer))

    def on_deserialize(self, stream):
        """Read the payload back from a Symphony stream."""
        self.set_payload(stream.read_byte_array("b"))
@@ -0,0 +1,289 @@
1
+ import asyncio
2
+ import logging
3
+ from concurrent.futures import Future
4
+ from typing import Dict, Optional, Set, cast
5
+
6
+ import cloudpickle
7
+ from bidict import bidict
8
+
9
+ from scaler import Serializer
10
+ from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
11
+ from scaler.protocol.python.common import ObjectMetadata, ObjectStorageAddress, TaskCancelConfirmType, TaskResultType
12
+ from scaler.protocol.python.message import ObjectInstruction, Task, TaskCancel, TaskCancelConfirm, TaskResult
13
+ from scaler.utility.identifiers import ObjectID, TaskID
14
+ from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
15
+ from scaler.utility.mixins import Looper
16
+ from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
17
+ from scaler.utility.serialization import serialize_failure
18
+ from scaler.worker.agent.mixins import HeartbeatManager, TaskManager
19
+ from scaler.worker_adapter.symphony.callback import SessionCallback
20
+ from scaler.worker_adapter.symphony.message import SoamMessage
21
+
22
+ try:
23
+ import soamapi
24
+ except ImportError:
25
+ raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
26
+
27
+
28
class SymphonyTaskManager(Looper, TaskManager):
    """Executes scheduler tasks by forwarding them to an IBM Spectrum Symphony
    session, with a bounded number of concurrently running tasks.

    Tasks queue by priority behind an asyncio semaphore of size
    ``base_concurrency``; a strictly-higher-priority task may bypass the
    semaphore (see :meth:`on_task_new`). Results are written to object storage
    and reported back over the external connector.
    """

    def __init__(self, base_concurrency: int, service_name: str):
        if isinstance(base_concurrency, int) and base_concurrency <= 0:
            raise ValueError(f"base_concurrency must be a positive integer, got {base_concurrency}")

        self._base_concurrency = base_concurrency
        self._service_name = service_name

        # Bounds the number of tasks admitted through the normal queue path.
        self._executor_semaphore = asyncio.Semaphore(value=self._base_concurrency)

        self._task_id_to_task: Dict[TaskID, Task] = dict()
        # bidict: resolve_tasks() needs the inverse (future -> task id) lookup.
        self._task_id_to_future: bidict[TaskID, asyncio.Future] = bidict()

        # Serializer objects are the only objects cached locally.
        self._serializers: Dict[bytes, Serializer] = dict()

        self._queued_task_id_queue = AsyncSortedPriorityQueue()
        self._queued_task_ids: Set[bytes] = set()

        self._acquiring_task_ids: Set[TaskID] = set()  # tasks contesting the semaphore
        self._processing_task_ids: Set[TaskID] = set()
        self._canceled_task_ids: Set[TaskID] = set()

        self._object_storage_address: Optional[ObjectStorageAddress] = None

        # Collaborators are wired in later via register().
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._heartbeat_manager: Optional[HeartbeatManager] = None

        """
        SOAM specific code
        """
        soamapi.initialize()

        self._session_callback = SessionCallback()

        self._ibm_soam_connection = soamapi.connect(
            self._service_name, soamapi.DefaultSecurityCallback("Guest", "Guest")
        )
        logging.info(f"established IBM Spectrum Symphony connection {self._ibm_soam_connection.get_id()}")

        ibm_soam_session_attr = soamapi.SessionCreationAttributes()
        ibm_soam_session_attr.set_session_type("RecoverableAllHistoricalData")
        ibm_soam_session_attr.set_session_name("ScalerSession")
        ibm_soam_session_attr.set_session_flags(soamapi.SessionFlags.PARTIAL_ASYNC)
        ibm_soam_session_attr.set_session_callback(self._session_callback)
        self._ibm_soam_session = self._ibm_soam_connection.create_session(ibm_soam_session_attr)
        logging.info(f"established IBM Spectrum Symphony session {self._ibm_soam_session.get_id()}")

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        heartbeat_manager: HeartbeatManager,
    ):
        """Attach the connectors and heartbeat manager this looper depends on."""
        self._connector_external = connector_external
        self._connector_storage = connector_storage
        self._heartbeat_manager = heartbeat_manager

    async def routine(self):  # SymphonyTaskManager has two loops
        # The real work happens in process_task() and resolve_tasks().
        pass

    async def on_object_instruction(self, instruction: ObjectInstruction):
        """Handle object instructions; only Delete is meaningful here."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            for object_id in instruction.object_metadata.object_ids:
                self._serializers.pop(object_id, None)  # we only cache serializers

            return

        logging.error(f"worker received unknown object instruction type {instruction=}")

    async def on_task_new(self, task: Task):
        """Queue a new task, or execute it immediately if it outranks every
        task currently holding the semaphore."""
        task_priority = self.__get_task_priority(task)

        # if semaphore is locked, check if task is higher priority than all acquired tasks
        # if so, bypass acquiring and execute the task immediately
        if self._executor_semaphore.locked():
            for acquired_task_id in self._acquiring_task_ids:
                acquired_task = self._task_id_to_task[acquired_task_id]
                acquired_task_priority = self.__get_task_priority(acquired_task)
                if task_priority <= acquired_task_priority:
                    break
            else:
                self._task_id_to_task[task.task_id] = task
                self._processing_task_ids.add(task.task_id)
                self._task_id_to_future[task.task_id] = await self.__execute_task(task)
                return

        self._task_id_to_task[task.task_id] = task
        # Negated priority: the queue pops smallest first, we want highest first.
        self._queued_task_id_queue.put_nowait((-task_priority, task.task_id))
        self._queued_task_ids.add(task.task_id)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        """Cancel a queued or (if forced) processing task and confirm the outcome."""
        task_queued = task_cancel.task_id in self._queued_task_ids
        task_processing = task_cancel.task_id in self._processing_task_ids

        if not task_queued and not task_processing:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
                )
            )
            return

        # A running task can only be canceled when the force flag is set.
        if task_processing and not task_cancel.flags.force:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
                )
            )
            return

        if task_queued:
            self._queued_task_ids.remove(task_cancel.task_id)
            self._queued_task_id_queue.remove(task_cancel.task_id)

            # task can be discarded because task was never submitted
            self._task_id_to_task.pop(task_cancel.task_id)

        if task_processing:
            future = self._task_id_to_future[task_cancel.task_id]
            future.cancel()

            # regardless of the future being canceled, the task is considered canceled and cleanup will occur later
            self._processing_task_ids.remove(task_cancel.task_id)
            self._canceled_task_ids.add(task_cancel.task_id)

        result = TaskCancelConfirm.new_msg(
            task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled
        )
        await self._connector_external.send(result)

    async def on_task_result(self, result: TaskResult):
        """Drop all local state for a task and forward its result upstream."""
        if result.task_id in self._queued_task_ids:
            self._queued_task_ids.remove(result.task_id)
            self._queued_task_id_queue.remove(result.task_id)

        self._processing_task_ids.remove(result.task_id)
        self._task_id_to_task.pop(result.task_id)

        await self._connector_external.send(result)

    def get_queued_size(self):
        """Number of tasks waiting behind the semaphore."""
        return self._queued_task_id_queue.qsize()

    def can_accept_task(self):
        """True while at least one semaphore slot is free."""
        return not self._executor_semaphore.locked()

    async def resolve_tasks(self):
        """Await the next completed future(s), publish results, release slots."""
        if not self._task_id_to_future:
            # Yield to the event loop so this looper does not spin.
            await asyncio.sleep(0)
            return

        done, _ = await asyncio.wait(self._task_id_to_future.values(), return_when=asyncio.FIRST_COMPLETED)
        for future in done:
            task_id = self._task_id_to_future.inv.pop(future)
            task = self._task_id_to_task[task_id]

            if task_id in self._processing_task_ids:
                self._processing_task_ids.remove(task_id)

                if future.exception() is None:
                    serializer_id = ObjectID.generate_serializer_object_id(task.source)
                    serializer = self._serializers[serializer_id]
                    result_bytes = serializer.serialize(future.result())
                    result_type = TaskResultType.Success
                else:
                    result_bytes = serialize_failure(cast(Exception, future.exception()))
                    result_type = TaskResultType.Failed

                result_object_id = ObjectID.generate_object_id(task.source)

                # Store the result payload, announce the new object, then
                # report the task result referencing it.
                await self._connector_storage.set_object(result_object_id, result_bytes)
                await self._connector_external.send(
                    ObjectInstruction.new_msg(
                        ObjectInstruction.ObjectInstructionType.Create,
                        task.source,
                        ObjectMetadata.new_msg(
                            object_ids=(result_object_id,),
                            object_types=(ObjectMetadata.ObjectContentType.Object,),
                            object_names=(f"<res {result_object_id.hex()[:6]}>".encode(),),
                        ),
                    )
                )

                await self._connector_external.send(
                    TaskResult.new_msg(task_id, result_type, metadata=b"", results=[bytes(result_object_id)])
                )

            elif task_id in self._canceled_task_ids:
                self._canceled_task_ids.remove(task_id)

            else:
                raise ValueError(f"task_id {task_id.hex()} not found in processing or canceled tasks")

            # Only tasks admitted through process_task() hold a semaphore slot.
            if task_id in self._acquiring_task_ids:
                self._acquiring_task_ids.remove(task_id)
                self._executor_semaphore.release()

            self._task_id_to_task.pop(task_id)

    async def process_task(self):
        """Admit the highest-priority queued task once a slot frees up."""
        await self._executor_semaphore.acquire()

        _, task_id = await self._queued_task_id_queue.get()
        task = self._task_id_to_task[task_id]

        self._acquiring_task_ids.add(task_id)
        self._processing_task_ids.add(task_id)
        self._task_id_to_future[task.task_id] = await self.__execute_task(task)

    async def __execute_task(self, task: Task) -> asyncio.Future:
        """
        Submit one task to Symphony and return a future for its result.

        This method is not very efficient because it does let objects linger in the cache. Each time inputs are
        requested, all object data are requested.
        """
        serializer_id = ObjectID.generate_serializer_object_id(task.source)

        if serializer_id not in self._serializers:
            serializer_bytes = await self._connector_storage.get_object(serializer_id)
            serializer = cloudpickle.loads(serializer_bytes)
            self._serializers[serializer_id] = serializer
        else:
            serializer = self._serializers[serializer_id]

        # Fetches the function object and the argument objects concurrently

        get_tasks = [
            self._connector_storage.get_object(object_id)
            for object_id in [task.func_object_id, *(cast(ObjectID, arg) for arg in task.function_args)]
        ]

        function_bytes, *arg_bytes = await asyncio.gather(*get_tasks)

        function = serializer.deserialize(function_bytes)
        arg_objects = [serializer.deserialize(object_bytes) for object_bytes in arg_bytes]

        """
        SOAM specific code
        """
        input_message = SoamMessage()
        input_message.set_payload(cloudpickle.dumps((function, *arg_objects)))

        task_attr = soamapi.TaskSubmissionAttributes()
        task_attr.set_task_input(input_message)

        with self._session_callback.get_callback_lock():
            symphony_task = self._ibm_soam_session.send_task_input(task_attr)

        future: Future = Future()
        future.set_running_or_notify_cancel()

        self._session_callback.submit_task(symphony_task.get_id(), future)

        return asyncio.wrap_future(future)

    @staticmethod
    def __get_task_priority(task: Task) -> int:
        """Extract the non-negative priority carried in the task's flags."""
        priority = retrieve_task_flags_from_task(task).priority

        if priority < 0:
            raise ValueError(f"invalid task priority, must be positive or zero, got {priority}")

        return priority