opengris-scaler 1.12.37__cp38-cp38-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_scaler-1.12.37.dist-info/METADATA +730 -0
- opengris_scaler-1.12.37.dist-info/RECORD +196 -0
- opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +218 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +672 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +95 -0
- scaler/cluster/combo.py +157 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/common/__init__.py +0 -0
- scaler/config/common/logging.py +41 -0
- scaler/config/common/web.py +18 -0
- scaler/config/common/worker.py +65 -0
- scaler/config/common/worker_adapter.py +28 -0
- scaler/config/config_class.py +317 -0
- scaler/config/defaults.py +94 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +66 -0
- scaler/config/section/ecs_worker_adapter.py +78 -0
- scaler/config/section/native_worker_adapter.py +30 -0
- scaler/config/section/object_storage_server.py +13 -0
- scaler/config/section/scheduler.py +126 -0
- scaler/config/section/symphony_worker_adapter.py +35 -0
- scaler/config/section/top.py +16 -0
- scaler/config/section/webui.py +16 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +67 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +10 -0
- scaler/entry_points/object_storage_server.py +26 -0
- scaler/entry_points/scheduler.py +51 -0
- scaler/entry_points/top.py +272 -0
- scaler/entry_points/webui.py +6 -0
- scaler/entry_points/worker_adapter_ecs.py +22 -0
- scaler/entry_points/worker_adapter_native.py +31 -0
- scaler/entry_points/worker_adapter_symphony.py +26 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +249 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/_ymq.so +0 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/object_storage/object_storage_server.so +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/common/__init__.py +0 -0
- scaler/ui/common/constants.py +9 -0
- scaler/ui/common/live_display.py +147 -0
- scaler/ui/common/memory_window.py +146 -0
- scaler/ui/common/setting_page.py +40 -0
- scaler/ui/common/task_graph.py +840 -0
- scaler/ui/common/task_log.py +111 -0
- scaler/ui/common/utility.py +66 -0
- scaler/ui/common/webui.py +80 -0
- scaler/ui/common/worker_processors.py +104 -0
- scaler/ui/v1.py +76 -0
- scaler/ui/v2.py +102 -0
- scaler/ui/webui.py +21 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +110 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +241 -0
- scaler/worker_adapter/native.py +138 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +123 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import signal
|
|
3
|
+
import uuid
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
from aiohttp import web
|
|
7
|
+
from aiohttp.web_request import Request
|
|
8
|
+
|
|
9
|
+
from scaler.config.section.native_worker_adapter import NativeWorkerAdapterConfig
|
|
10
|
+
from scaler.utility.identifiers import WorkerID
|
|
11
|
+
from scaler.worker.worker import Worker
|
|
12
|
+
from scaler.worker_adapter.common import CapacityExceededError, WorkerGroupID, WorkerGroupNotFoundError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class NativeWorkerAdapter:
    """Worker adapter that starts and stops local ``Worker`` processes on request.

    Worker groups are created and destroyed through a JSON webhook (see ``webhook_handler``). The total number of
    workers is capped by ``max_workers`` from the configuration; a value of ``-1`` disables the cap.
    """

    def __init__(self, config: NativeWorkerAdapterConfig):
        """Copy the worker, web and logging settings out of ``config``. No worker is started here."""
        self._address = config.worker_adapter_config.scheduler_address
        self._object_storage_address = config.worker_adapter_config.object_storage_address
        self._capabilities = config.worker_config.per_worker_capabilities.capabilities
        self._io_threads = config.worker_io_threads
        self._task_queue_size = config.worker_config.per_worker_task_queue_size
        self._max_workers = config.worker_adapter_config.max_workers
        self._heartbeat_interval_seconds = config.worker_config.heartbeat_interval_seconds
        self._task_timeout_seconds = config.worker_config.task_timeout_seconds
        self._death_timeout_seconds = config.worker_config.death_timeout_seconds
        self._garbage_collect_interval_seconds = config.worker_config.garbage_collect_interval_seconds
        self._trim_memory_threshold_bytes = config.worker_config.trim_memory_threshold_bytes
        self._hard_processor_suspend = config.worker_config.hard_processor_suspend
        self._event_loop = config.event_loop
        self._adapter_web_host = config.web_config.adapter_web_host
        self._adapter_web_port = config.web_config.adapter_web_port
        self._logging_paths = config.logging_config.paths
        self._logging_level = config.logging_config.level
        # NOTE(review): stored but never forwarded to Worker in start_worker_group() - confirm this is intended.
        self._logging_config_file = config.logging_config.config_file

        # Although a worker group can contain multiple workers, in this native adapter implementation,
        # each worker group will only contain one worker.
        self._worker_groups: Dict[WorkerGroupID, Dict[WorkerID, Worker]] = {}

    async def start_worker_group(self) -> WorkerGroupID:
        """Start one new worker process and register it as its own worker group.

        Returns:
            The ID of the newly created worker group.

        Raises:
            CapacityExceededError: if ``max_workers`` is not ``-1`` and that many workers are already registered.
        """
        num_of_workers = sum(len(workers) for workers in self._worker_groups.values())
        # max_workers == -1 means "no limit", so the capacity check is skipped entirely in that case.
        if self._max_workers != -1 and num_of_workers >= self._max_workers:
            raise CapacityExceededError(f"Maximum number of workers ({self._max_workers}) reached.")

        worker = Worker(
            name=f"NAT|{uuid.uuid4().hex}",
            address=self._address,
            object_storage_address=self._object_storage_address,
            preload=None,
            capabilities=self._capabilities,
            io_threads=self._io_threads,
            task_queue_size=self._task_queue_size,
            heartbeat_interval_seconds=self._heartbeat_interval_seconds,
            task_timeout_seconds=self._task_timeout_seconds,
            death_timeout_seconds=self._death_timeout_seconds,
            garbage_collect_interval_seconds=self._garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=self._trim_memory_threshold_bytes,
            hard_processor_suspend=self._hard_processor_suspend,
            event_loop=self._event_loop,
            logging_paths=self._logging_paths,
            logging_level=self._logging_level,
        )

        worker.start()
        worker_group_id = f"native-{uuid.uuid4().hex}".encode()
        self._worker_groups[worker_group_id] = {worker.identity: worker}
        return worker_group_id

    async def shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """Interrupt every worker of the group with SIGINT, wait for it to exit, then forget the group.

        Raises:
            WorkerGroupNotFoundError: if ``worker_group_id`` is not a registered group.
        """
        if worker_group_id not in self._worker_groups:
            raise WorkerGroupNotFoundError(f"Worker group with ID {worker_group_id.decode()} does not exist.")

        for worker in self._worker_groups[worker_group_id].values():
            try:
                os.kill(worker.pid, signal.SIGINT)
            except ProcessLookupError:
                # The worker process has already exited; there is nothing left to signal. Without this guard the
                # group could never be removed and would keep counting against max_workers.
                pass

            # NOTE: join() blocks the event loop until the worker process has exited.
            worker.join()

        self._worker_groups.pop(worker_group_id)

    async def webhook_handler(self, request: Request):
        """Handle one webhook POST whose JSON body selects an ``action``.

        Supported actions are ``get_worker_adapter_info``, ``start_worker_group`` and ``shutdown_worker_group``
        (the latter requires a string ``worker_group_id``). Malformed requests are answered with HTTP 400 rather
        than surfacing as unhandled exceptions.
        """
        try:
            request_json = await request.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            return web.json_response(
                {"error": "Request body is not valid JSON"}, status=web.HTTPBadRequest.status_code
            )

        # The body is untrusted: it may be valid JSON without being an object (e.g. a list or a string).
        if not isinstance(request_json, dict) or "action" not in request_json:
            return web.json_response({"error": "No action specified"}, status=web.HTTPBadRequest.status_code)

        action = request_json["action"]

        if action == "get_worker_adapter_info":
            return web.json_response(
                {
                    "max_worker_groups": self._max_workers,
                    "workers_per_group": 1,
                    "base_capabilities": self._capabilities,
                },
                status=web.HTTPOk.status_code,
            )

        elif action == "start_worker_group":
            try:
                worker_group_id = await self.start_worker_group()
            except CapacityExceededError as e:
                return web.json_response({"error": str(e)}, status=web.HTTPTooManyRequests.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response(
                {
                    "status": "Worker group started",
                    "worker_group_id": worker_group_id.decode(),
                    "worker_ids": [worker_id.decode() for worker_id in self._worker_groups[worker_group_id].keys()],
                },
                status=web.HTTPOk.status_code,
            )

        elif action == "shutdown_worker_group":
            if "worker_group_id" not in request_json:
                return web.json_response(
                    {"error": "No worker_group_id specified"}, status=web.HTTPBadRequest.status_code
                )

            raw_worker_group_id = request_json["worker_group_id"]
            if not isinstance(raw_worker_group_id, str):
                return web.json_response(
                    {"error": "worker_group_id must be a string"}, status=web.HTTPBadRequest.status_code
                )

            worker_group_id = raw_worker_group_id.encode()
            try:
                await self.shutdown_worker_group(worker_group_id)
            except WorkerGroupNotFoundError as e:
                return web.json_response({"error": str(e)}, status=web.HTTPNotFound.status_code)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=web.HTTPInternalServerError.status_code)

            return web.json_response({"status": "Worker group shutdown"}, status=web.HTTPOk.status_code)

        else:
            return web.json_response({"error": "Unknown action"}, status=web.HTTPBadRequest.status_code)

    def create_app(self):
        """Build the aiohttp application that routes ``POST /`` to ``webhook_handler``."""
        app = web.Application()
        app.router.add_post("/", self.webhook_handler)
        return app
|
|
File without changes
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import concurrent.futures
|
|
2
|
+
import threading
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
import cloudpickle
|
|
6
|
+
|
|
7
|
+
from scaler.worker_adapter.symphony.message import SoamMessage
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import soamapi
|
|
11
|
+
except ImportError:
|
|
12
|
+
raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SessionCallback(soamapi.SessionCallback):
    """Symphony session callback that completes the ``concurrent.futures.Future`` registered for each task."""

    def __init__(self):
        # Guards _task_id_to_future. The same lock is exposed through get_callback_lock() so that a caller can
        # hold it while sending a task and registering its future, keeping on_response() from running in between.
        self._callback_lock = threading.Lock()
        self._task_id_to_future: Dict[str, concurrent.futures.Future] = {}

    def on_response(self, task_output_handle):
        """Complete the future of the task that ``task_output_handle`` belongs to.

        A successful task resolves the future with the un-pickled output payload; a failed task resolves it with
        the embedded exception. If reading or decoding the output itself fails, that error is set on the future
        too, so the waiter is always released instead of hanging on a future that was already unregistered.

        Raises:
            KeyError: if no future was registered for the task ID.
        """
        with self._callback_lock:
            task_id = task_output_handle.get_id()

            future = self._task_id_to_future.pop(task_id)

            try:
                succeeded = task_output_handle.is_successful()
                if succeeded:
                    output_message = SoamMessage()
                    task_output_handle.populate_task_output(output_message)
                    # NOTE(security): cloudpickle.loads can execute arbitrary code; the payload must come from a
                    # trusted Symphony service.
                    outcome = cloudpickle.loads(output_message.get_payload())
                else:
                    outcome = task_output_handle.get_exception().get_embedded_exception()
            except Exception as error:
                # The future has already been removed from the registry, so this is the only place left that can
                # report the failure to whoever is waiting on it.
                succeeded = False
                outcome = error

            if succeeded:
                future.set_result(outcome)
            else:
                future.set_exception(outcome)

    def on_exception(self, exception):
        """Fail every pending future with ``exception`` and clear the registry."""
        with self._callback_lock:
            for future in self._task_id_to_future.values():
                future.set_exception(exception)

            self._task_id_to_future.clear()

    def submit_task(self, task_id: str, future: concurrent.futures.Future):
        """Register ``future`` as the receiver of the result of task ``task_id``.

        This method does not take the callback lock itself (``threading.Lock`` is not re-entrant); callers that
        need the registration to be atomic with sending the task should hold ``get_callback_lock()`` around both.
        """
        self._task_id_to_future[task_id] = future

    def get_callback_lock(self) -> threading.Lock:
        """Return the lock that serializes the callbacks and protects the task-to-future registry."""
        return self._callback_lock
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
import psutil
|
|
5
|
+
|
|
6
|
+
from scaler.config.types.object_storage_server import ObjectStorageAddressConfig
|
|
7
|
+
from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
|
|
8
|
+
from scaler.protocol.python.message import WorkerHeartbeat, WorkerHeartbeatEcho
|
|
9
|
+
from scaler.protocol.python.status import Resource
|
|
10
|
+
from scaler.utility.mixins import Looper
|
|
11
|
+
from scaler.worker.agent.mixins import HeartbeatManager, TimeoutManager
|
|
12
|
+
from scaler.worker_adapter.symphony.task_manager import SymphonyTaskManager
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SymphonyHeartbeatManager(Looper, HeartbeatManager):
    """Sends ``WorkerHeartbeat`` messages over the external connector and processes their echoes.

    Only one heartbeat is in flight at a time: ``_start_timestamp_ns`` is non-zero while an echo is awaited and
    is reset to zero once the echo has been handled.
    """

    def __init__(
        self,
        object_storage_address: Optional[ObjectStorageAddressConfig],
        capabilities: Dict[str, int],
        task_queue_size: int,
    ):
        # May be None; in that case it is filled in from the first heartbeat echo.
        self._object_storage_address: Optional[ObjectStorageAddressConfig] = object_storage_address
        self._capabilities = capabilities
        self._task_queue_size = task_queue_size

        # Handle on the current process, used for the CPU / RSS figures reported in each heartbeat.
        self._agent_process = psutil.Process()

        # Collaborators, wired in later through register().
        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._worker_task_manager: Optional[SymphonyTaskManager] = None
        self._timeout_manager: Optional[TimeoutManager] = None

        self._start_timestamp_ns = 0  # 0 means "no heartbeat currently awaiting its echo"
        self._latency_us = 0

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        worker_task_manager: SymphonyTaskManager,
        timeout_manager: TimeoutManager,
    ):
        """Attach the connectors and managers this heartbeat manager depends on."""
        self._timeout_manager = timeout_manager
        self._worker_task_manager = worker_task_manager
        self._connector_storage = connector_storage
        self._connector_external = connector_external

    async def on_heartbeat_echo(self, heartbeat: WorkerHeartbeatEcho):
        """Record the latency of the echoed heartbeat and, if still unknown, adopt the storage address it carries."""
        if self._start_timestamp_ns == 0:
            # an echo without an outstanding heartbeat is ignored
            return

        round_trip_ns = time.time_ns() - self._start_timestamp_ns
        # one-way latency: half of the round trip, converted from nanoseconds to microseconds
        self._latency_us = int((round_trip_ns / 2) // 1_000)
        self._start_timestamp_ns = 0
        self._timeout_manager.update_last_seen_time()

        if self._object_storage_address is not None:
            return

        # No object storage address was configured: take it from the echo and connect the storage connector.
        address_message = heartbeat.object_storage_address()
        self._object_storage_address = ObjectStorageAddressConfig(address_message.host, address_message.port)
        await self._connector_storage.connect(self._object_storage_address.host, self._object_storage_address.port)

    def get_object_storage_address(self) -> Optional[ObjectStorageAddressConfig]:
        """Return the object storage address, or None if it has neither been configured nor received yet."""
        return self._object_storage_address

    async def routine(self):
        """Send one heartbeat unless the previous one is still waiting for its echo."""
        if self._start_timestamp_ns != 0:
            return

        process_usage = Resource.new_msg(
            int(self._agent_process.cpu_percent() * 10), self._agent_process.memory_info().rss
        )
        heartbeat = WorkerHeartbeat.new_msg(
            process_usage,
            psutil.virtual_memory().available,
            self._task_queue_size,
            self._worker_task_manager.get_queued_size(),
            self._latency_us,
            self._worker_task_manager.can_accept_task(),
            [],
            self._capabilities,
        )

        await self._connector_external.send(heartbeat)
        # start the latency clock only after the heartbeat has actually been handed to the connector
        self._start_timestamp_ns = time.time_ns()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import array
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
import soamapi
|
|
5
|
+
except ImportError:
|
|
6
|
+
raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SoamMessage(soamapi.Message):
    """Symphony message whose only content is a single opaque payload."""

    def __init__(self, payload: bytes = b""):
        self.__payload = payload

    def get_payload(self) -> bytes:
        """Return the payload currently held by this message."""
        return self.__payload

    def set_payload(self, payload: bytes):
        """Replace the payload held by this message."""
        self.__payload = payload

    def on_serialize(self, stream):
        """Write the payload to ``stream`` as an array of signed bytes (typecode ``"b"``)."""
        signed_bytes = array.array("b", self.get_payload())
        byte_count = len(signed_bytes)
        stream.write_byte_array(signed_bytes, 0, byte_count)

    def on_deserialize(self, stream):
        """Read a ``"b"`` byte array from ``stream`` and store it as the payload."""
        received = stream.read_byte_array("b")
        self.set_payload(received)
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from concurrent.futures import Future
|
|
4
|
+
from typing import Dict, Optional, Set, cast
|
|
5
|
+
|
|
6
|
+
import cloudpickle
|
|
7
|
+
from bidict import bidict
|
|
8
|
+
|
|
9
|
+
from scaler import Serializer
|
|
10
|
+
from scaler.io.mixins import AsyncConnector, AsyncObjectStorageConnector
|
|
11
|
+
from scaler.protocol.python.common import ObjectMetadata, ObjectStorageAddress, TaskCancelConfirmType, TaskResultType
|
|
12
|
+
from scaler.protocol.python.message import ObjectInstruction, Task, TaskCancel, TaskCancelConfirm, TaskResult
|
|
13
|
+
from scaler.utility.identifiers import ObjectID, TaskID
|
|
14
|
+
from scaler.utility.metadata.task_flags import retrieve_task_flags_from_task
|
|
15
|
+
from scaler.utility.mixins import Looper
|
|
16
|
+
from scaler.utility.queues.async_sorted_priority_queue import AsyncSortedPriorityQueue
|
|
17
|
+
from scaler.utility.serialization import serialize_failure
|
|
18
|
+
from scaler.worker.agent.mixins import HeartbeatManager, TaskManager
|
|
19
|
+
from scaler.worker_adapter.symphony.callback import SessionCallback
|
|
20
|
+
from scaler.worker_adapter.symphony.message import SoamMessage
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
import soamapi
|
|
24
|
+
except ImportError:
|
|
25
|
+
raise ImportError("IBM Spectrum Symphony API not found, please install it with 'pip install soamapi'.")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SymphonyTaskManager(Looper, TaskManager):
    """Task manager that runs Scaler tasks by submitting them to an IBM Spectrum Symphony session.

    Incoming tasks are queued by priority. ``process_task`` moves queued tasks into execution, bounded by a
    semaphore of ``base_concurrency`` permits; ``resolve_tasks`` publishes the results of finished tasks.
    """

    def __init__(self, base_concurrency: int, service_name: str):
        """Validate the concurrency limit, set up the bookkeeping and open the Symphony connection and session.

        Raises:
            ValueError: if ``base_concurrency`` is not a positive integer.
        """
        if not isinstance(base_concurrency, int) or base_concurrency <= 0:
            raise ValueError(f"base_concurrency must be a positive integer, got {base_concurrency}")

        self._base_concurrency = base_concurrency
        self._service_name = service_name

        self._executor_semaphore = asyncio.Semaphore(value=self._base_concurrency)

        self._task_id_to_task: Dict[TaskID, Task] = dict()
        self._task_id_to_future: bidict[TaskID, asyncio.Future] = bidict()

        self._serializers: Dict[bytes, Serializer] = dict()

        self._queued_task_id_queue = AsyncSortedPriorityQueue()
        self._queued_task_ids: Set[bytes] = set()

        self._acquiring_task_ids: Set[TaskID] = set()  # tasks contesting the semaphore
        self._processing_task_ids: Set[TaskID] = set()
        self._canceled_task_ids: Set[TaskID] = set()

        self._object_storage_address: Optional[ObjectStorageAddress] = None

        self._connector_external: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None
        self._heartbeat_manager: Optional[HeartbeatManager] = None

        # SOAM specific code
        soamapi.initialize()

        self._session_callback = SessionCallback()

        self._ibm_soam_connection = soamapi.connect(
            self._service_name, soamapi.DefaultSecurityCallback("Guest", "Guest")
        )
        logging.info(f"established IBM Spectrum Symphony connection {self._ibm_soam_connection.get_id()}")

        ibm_soam_session_attr = soamapi.SessionCreationAttributes()
        ibm_soam_session_attr.set_session_type("RecoverableAllHistoricalData")
        ibm_soam_session_attr.set_session_name("ScalerSession")
        ibm_soam_session_attr.set_session_flags(soamapi.SessionFlags.PARTIAL_ASYNC)
        ibm_soam_session_attr.set_session_callback(self._session_callback)
        self._ibm_soam_session = self._ibm_soam_connection.create_session(ibm_soam_session_attr)
        logging.info(f"established IBM Spectrum Symphony session {self._ibm_soam_session.get_id()}")

    def register(
        self,
        connector_external: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        heartbeat_manager: HeartbeatManager,
    ):
        """Attach the connectors and the heartbeat manager this task manager works with."""
        self._connector_external = connector_external
        self._connector_storage = connector_storage
        self._heartbeat_manager = heartbeat_manager

    async def routine(self):  # SymphonyTaskManager has two loops
        # Intentionally empty; presumably the two loops are process_task() and resolve_tasks() - TODO confirm.
        pass

    async def on_object_instruction(self, instruction: ObjectInstruction):
        """Drop cached serializers for deleted objects; any other instruction type is logged as an error."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            for object_id in instruction.object_metadata.object_ids:
                self._serializers.pop(object_id, None)  # we only cache serializers

            return

        logging.error(f"worker received unknown object instruction type {instruction=}")

    async def on_task_new(self, task: Task):
        """Accept a new task: execute it immediately if it outranks every acquiring task, otherwise queue it.

        Raises:
            ValueError: if the task carries a negative priority.
        """
        task_priority = self.__get_task_priority(task)

        # if semaphore is locked, check if task is higher priority than all acquired tasks
        # if so, bypass acquiring and execute the task immediately
        if self._executor_semaphore.locked():
            for acquired_task_id in self._acquiring_task_ids:
                acquired_task = self._task_id_to_task[acquired_task_id]
                acquired_task_priority = self.__get_task_priority(acquired_task)
                if task_priority <= acquired_task_priority:
                    break
            else:
                # Bypassing tasks never enter _acquiring_task_ids, so resolve_tasks() will not release a
                # semaphore permit on their behalf.
                self._task_id_to_task[task.task_id] = task
                self._processing_task_ids.add(task.task_id)
                self._task_id_to_future[task.task_id] = await self.__execute_task(task)
                return

        self._task_id_to_task[task.task_id] = task
        # priority is negated when enqueued; presumably the queue yields the smallest key first - TODO confirm
        self._queued_task_id_queue.put_nowait((-task_priority, task.task_id))
        self._queued_task_ids.add(task.task_id)

    async def on_cancel_task(self, task_cancel: TaskCancel):
        """Cancel a queued task, or a processing task when ``force`` is set, and confirm the outcome.

        Replies ``CancelNotFound`` for unknown tasks, ``CancelFailed`` for processing tasks without ``force``,
        and ``Canceled`` otherwise.
        """
        task_queued = task_cancel.task_id in self._queued_task_ids
        task_processing = task_cancel.task_id in self._processing_task_ids

        if not task_queued and not task_processing:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelNotFound
                )
            )
            return

        if task_processing and not task_cancel.flags.force:
            await self._connector_external.send(
                TaskCancelConfirm.new_msg(
                    task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.CancelFailed
                )
            )
            return

        if task_queued:
            self._queued_task_ids.remove(task_cancel.task_id)
            self._queued_task_id_queue.remove(task_cancel.task_id)

            # task can be discarded because task was never submitted
            self._task_id_to_task.pop(task_cancel.task_id)

        if task_processing:
            # A task is marked as processing before its future is registered (the registration only happens once
            # __execute_task() has finished awaiting its inputs), so the future may not exist yet.
            future = self._task_id_to_future.get(task_cancel.task_id)
            if future is not None:
                future.cancel()

            # regardless of the future being canceled, the task is considered canceled and cleanup will occur later
            self._processing_task_ids.remove(task_cancel.task_id)
            self._canceled_task_ids.add(task_cancel.task_id)

        result = TaskCancelConfirm.new_msg(
            task_id=task_cancel.task_id, cancel_confirm_type=TaskCancelConfirmType.Canceled
        )
        await self._connector_external.send(result)

    async def on_task_result(self, result: TaskResult):
        """Forget the task that ``result`` belongs to, whether queued or processing, and forward the result."""
        if result.task_id in self._queued_task_ids:
            self._queued_task_ids.remove(result.task_id)
            self._queued_task_id_queue.remove(result.task_id)

        # discard(): a task that was still queued is not in the processing set, so remove() would raise here.
        self._processing_task_ids.discard(result.task_id)
        self._task_id_to_task.pop(result.task_id)

        await self._connector_external.send(result)

    def get_queued_size(self):
        """Return the number of tasks currently waiting in the priority queue."""
        return self._queued_task_id_queue.qsize()

    def can_accept_task(self):
        """Return True while the executor semaphore still has a free permit."""
        return not self._executor_semaphore.locked()

    async def resolve_tasks(self):
        """Wait for at least one in-flight task to finish and publish the outcome of every finished task.

        For a task that is still in the processing set, the result (or the serialized failure) is stored in
        object storage and announced through an ``ObjectInstruction`` followed by a ``TaskResult``. A canceled
        task is only cleaned up.

        Raises:
            ValueError: if a finished task is neither processing nor canceled.
        """
        if not self._task_id_to_future:
            await asyncio.sleep(0)
            return

        # NOTE(review): the set of futures is captured when asyncio.wait() is called, so a future registered
        # while this call is pending is only observed by a later invocation.
        done, _ = await asyncio.wait(self._task_id_to_future.values(), return_when=asyncio.FIRST_COMPLETED)
        for future in done:
            task_id = self._task_id_to_future.inv.pop(future)
            task = self._task_id_to_task[task_id]

            if task_id in self._processing_task_ids:
                self._processing_task_ids.remove(task_id)

                if future.exception() is None:
                    serializer_id = ObjectID.generate_serializer_object_id(task.source)
                    serializer = self._serializers[serializer_id]
                    result_bytes = serializer.serialize(future.result())
                    result_type = TaskResultType.Success
                else:
                    result_bytes = serialize_failure(cast(Exception, future.exception()))
                    result_type = TaskResultType.Failed

                result_object_id = ObjectID.generate_object_id(task.source)

                await self._connector_storage.set_object(result_object_id, result_bytes)
                await self._connector_external.send(
                    ObjectInstruction.new_msg(
                        ObjectInstruction.ObjectInstructionType.Create,
                        task.source,
                        ObjectMetadata.new_msg(
                            object_ids=(result_object_id,),
                            object_types=(ObjectMetadata.ObjectContentType.Object,),
                            object_names=(f"<res {result_object_id.hex()[:6]}>".encode(),),
                        ),
                    )
                )

                await self._connector_external.send(
                    TaskResult.new_msg(task_id, result_type, metadata=b"", results=[bytes(result_object_id)])
                )

            elif task_id in self._canceled_task_ids:
                self._canceled_task_ids.remove(task_id)

            else:
                raise ValueError(f"task_id {task_id.hex()} not found in processing or canceled tasks")

            # Only tasks started by process_task() hold a semaphore permit; bypassing tasks do not.
            if task_id in self._acquiring_task_ids:
                self._acquiring_task_ids.remove(task_id)
                self._executor_semaphore.release()

            self._task_id_to_task.pop(task_id)

    async def process_task(self):
        """Take a semaphore permit, dequeue the next task and start executing it."""
        await self._executor_semaphore.acquire()

        _, task_id = await self._queued_task_id_queue.get()
        # The task has left the queue, so it must stop counting as queued; otherwise a later cancel would try to
        # remove it from the queue again and the set would grow without bound.
        self._queued_task_ids.discard(task_id)
        task = self._task_id_to_task[task_id]

        self._acquiring_task_ids.add(task_id)
        self._processing_task_ids.add(task_id)
        self._task_id_to_future[task.task_id] = await self.__execute_task(task)

    async def __execute_task(self, task: Task) -> asyncio.Future:
        """Fetch the task's function and arguments, submit them to Symphony and return a future for the result.

        This method is not very efficient because it does not let objects linger in the cache. Each time inputs
        are requested, all object data are requested. Only serializers are cached.
        """
        serializer_id = ObjectID.generate_serializer_object_id(task.source)

        if serializer_id not in self._serializers:
            serializer_bytes = await self._connector_storage.get_object(serializer_id)
            # NOTE(security): cloudpickle.loads can execute arbitrary code; object storage must be trusted.
            serializer = cloudpickle.loads(serializer_bytes)
            self._serializers[serializer_id] = serializer
        else:
            serializer = self._serializers[serializer_id]

        # Fetches the function object and the argument objects concurrently

        get_tasks = [
            self._connector_storage.get_object(object_id)
            for object_id in [task.func_object_id, *(cast(ObjectID, arg) for arg in task.function_args)]
        ]

        function_bytes, *arg_bytes = await asyncio.gather(*get_tasks)

        function = serializer.deserialize(function_bytes)
        arg_objects = [serializer.deserialize(object_bytes) for object_bytes in arg_bytes]

        # SOAM specific code
        input_message = SoamMessage()
        input_message.set_payload(cloudpickle.dumps((function, *arg_objects)))

        task_attr = soamapi.TaskSubmissionAttributes()
        task_attr.set_task_input(input_message)

        # Hold the callback lock across "send" and "register" so that the session callback cannot run for this
        # task before its future has been registered.
        with self._session_callback.get_callback_lock():
            symphony_task = self._ibm_soam_session.send_task_input(task_attr)

            future: Future = Future()
            future.set_running_or_notify_cancel()

            self._session_callback.submit_task(symphony_task.get_id(), future)

        return asyncio.wrap_future(future)

    @staticmethod
    def __get_task_priority(task: Task) -> int:
        """Return the priority stored in the task's flags.

        Raises:
            ValueError: if the priority is negative.
        """
        priority = retrieve_task_flags_from_task(task).priority

        if priority < 0:
            raise ValueError(f"invalid task priority, must be positive or zero, got {priority}")

        return priority
|