opengris-scaler 1.12.28: opengris_scaler-1.12.28-cp313-cp313-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opengris-scaler might be problematic.
- opengris_scaler-1.12.28.dist-info/METADATA +728 -0
- opengris_scaler-1.12.28.dist-info/RECORD +187 -0
- opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
- opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
- opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
- opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
- opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
- opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
- opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
- scaler/__init__.py +14 -0
- scaler/about.py +5 -0
- scaler/client/__init__.py +0 -0
- scaler/client/agent/__init__.py +0 -0
- scaler/client/agent/client_agent.py +210 -0
- scaler/client/agent/disconnect_manager.py +27 -0
- scaler/client/agent/future_manager.py +112 -0
- scaler/client/agent/heartbeat_manager.py +74 -0
- scaler/client/agent/mixins.py +89 -0
- scaler/client/agent/object_manager.py +98 -0
- scaler/client/agent/task_manager.py +64 -0
- scaler/client/client.py +658 -0
- scaler/client/future.py +252 -0
- scaler/client/object_buffer.py +129 -0
- scaler/client/object_reference.py +25 -0
- scaler/client/serializer/__init__.py +0 -0
- scaler/client/serializer/default.py +16 -0
- scaler/client/serializer/mixins.py +38 -0
- scaler/cluster/__init__.py +0 -0
- scaler/cluster/cluster.py +115 -0
- scaler/cluster/combo.py +150 -0
- scaler/cluster/object_storage_server.py +45 -0
- scaler/cluster/scheduler.py +86 -0
- scaler/config/__init__.py +0 -0
- scaler/config/defaults.py +94 -0
- scaler/config/loader.py +96 -0
- scaler/config/mixins.py +20 -0
- scaler/config/section/__init__.py +0 -0
- scaler/config/section/cluster.py +55 -0
- scaler/config/section/ecs_worker_adapter.py +85 -0
- scaler/config/section/native_worker_adapter.py +43 -0
- scaler/config/section/object_storage_server.py +8 -0
- scaler/config/section/scheduler.py +54 -0
- scaler/config/section/symphony_worker_adapter.py +47 -0
- scaler/config/section/top.py +13 -0
- scaler/config/section/webui.py +21 -0
- scaler/config/types/__init__.py +0 -0
- scaler/config/types/network_backend.py +12 -0
- scaler/config/types/object_storage_server.py +45 -0
- scaler/config/types/worker.py +62 -0
- scaler/config/types/zmq.py +83 -0
- scaler/entry_points/__init__.py +0 -0
- scaler/entry_points/cluster.py +133 -0
- scaler/entry_points/object_storage_server.py +45 -0
- scaler/entry_points/scheduler.py +144 -0
- scaler/entry_points/top.py +286 -0
- scaler/entry_points/webui.py +48 -0
- scaler/entry_points/worker_adapter_ecs.py +191 -0
- scaler/entry_points/worker_adapter_native.py +137 -0
- scaler/entry_points/worker_adapter_symphony.py +98 -0
- scaler/io/__init__.py +0 -0
- scaler/io/async_binder.py +89 -0
- scaler/io/async_connector.py +95 -0
- scaler/io/async_object_storage_connector.py +225 -0
- scaler/io/mixins.py +154 -0
- scaler/io/sync_connector.py +68 -0
- scaler/io/sync_object_storage_connector.py +247 -0
- scaler/io/sync_subscriber.py +83 -0
- scaler/io/utility.py +80 -0
- scaler/io/ymq/__init__.py +0 -0
- scaler/io/ymq/_ymq.pyi +95 -0
- scaler/io/ymq/ymq.py +138 -0
- scaler/io/ymq_async_object_storage_connector.py +184 -0
- scaler/io/ymq_sync_object_storage_connector.py +184 -0
- scaler/object_storage/__init__.py +0 -0
- scaler/protocol/__init__.py +0 -0
- scaler/protocol/capnp/__init__.py +0 -0
- scaler/protocol/capnp/_python.py +6 -0
- scaler/protocol/capnp/common.capnp +68 -0
- scaler/protocol/capnp/message.capnp +218 -0
- scaler/protocol/capnp/object_storage.capnp +57 -0
- scaler/protocol/capnp/status.capnp +73 -0
- scaler/protocol/introduction.md +105 -0
- scaler/protocol/python/__init__.py +0 -0
- scaler/protocol/python/common.py +140 -0
- scaler/protocol/python/message.py +751 -0
- scaler/protocol/python/mixins.py +13 -0
- scaler/protocol/python/object_storage.py +118 -0
- scaler/protocol/python/status.py +279 -0
- scaler/protocol/worker.md +228 -0
- scaler/scheduler/__init__.py +0 -0
- scaler/scheduler/allocate_policy/__init__.py +0 -0
- scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
- scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
- scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
- scaler/scheduler/allocate_policy/mixins.py +55 -0
- scaler/scheduler/controllers/__init__.py +0 -0
- scaler/scheduler/controllers/balance_controller.py +65 -0
- scaler/scheduler/controllers/client_controller.py +131 -0
- scaler/scheduler/controllers/config_controller.py +31 -0
- scaler/scheduler/controllers/graph_controller.py +424 -0
- scaler/scheduler/controllers/information_controller.py +81 -0
- scaler/scheduler/controllers/mixins.py +194 -0
- scaler/scheduler/controllers/object_controller.py +147 -0
- scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
- scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
- scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
- scaler/scheduler/controllers/scaling_policies/null.py +14 -0
- scaler/scheduler/controllers/scaling_policies/types.py +9 -0
- scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
- scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
- scaler/scheduler/controllers/task_controller.py +376 -0
- scaler/scheduler/controllers/worker_controller.py +169 -0
- scaler/scheduler/object_usage/__init__.py +0 -0
- scaler/scheduler/object_usage/object_tracker.py +131 -0
- scaler/scheduler/scheduler.py +251 -0
- scaler/scheduler/task/__init__.py +0 -0
- scaler/scheduler/task/task_state_machine.py +92 -0
- scaler/scheduler/task/task_state_manager.py +61 -0
- scaler/ui/__init__.py +0 -0
- scaler/ui/constants.py +9 -0
- scaler/ui/live_display.py +147 -0
- scaler/ui/memory_window.py +146 -0
- scaler/ui/setting_page.py +40 -0
- scaler/ui/task_graph.py +832 -0
- scaler/ui/task_log.py +107 -0
- scaler/ui/utility.py +66 -0
- scaler/ui/webui.py +147 -0
- scaler/ui/worker_processors.py +104 -0
- scaler/utility/__init__.py +0 -0
- scaler/utility/debug.py +19 -0
- scaler/utility/event_list.py +63 -0
- scaler/utility/event_loop.py +58 -0
- scaler/utility/exceptions.py +42 -0
- scaler/utility/formatter.py +44 -0
- scaler/utility/graph/__init__.py +0 -0
- scaler/utility/graph/optimization.py +27 -0
- scaler/utility/graph/topological_sorter.py +11 -0
- scaler/utility/graph/topological_sorter_graphblas.py +174 -0
- scaler/utility/identifiers.py +107 -0
- scaler/utility/logging/__init__.py +0 -0
- scaler/utility/logging/decorators.py +25 -0
- scaler/utility/logging/scoped_logger.py +33 -0
- scaler/utility/logging/utility.py +183 -0
- scaler/utility/many_to_many_dict.py +123 -0
- scaler/utility/metadata/__init__.py +0 -0
- scaler/utility/metadata/profile_result.py +31 -0
- scaler/utility/metadata/task_flags.py +30 -0
- scaler/utility/mixins.py +13 -0
- scaler/utility/network_util.py +7 -0
- scaler/utility/one_to_many_dict.py +72 -0
- scaler/utility/queues/__init__.py +0 -0
- scaler/utility/queues/async_indexed_queue.py +37 -0
- scaler/utility/queues/async_priority_queue.py +70 -0
- scaler/utility/queues/async_sorted_priority_queue.py +45 -0
- scaler/utility/queues/indexed_queue.py +114 -0
- scaler/utility/serialization.py +9 -0
- scaler/version.txt +1 -0
- scaler/worker/__init__.py +0 -0
- scaler/worker/agent/__init__.py +0 -0
- scaler/worker/agent/heartbeat_manager.py +107 -0
- scaler/worker/agent/mixins.py +137 -0
- scaler/worker/agent/processor/__init__.py +0 -0
- scaler/worker/agent/processor/object_cache.py +107 -0
- scaler/worker/agent/processor/processor.py +285 -0
- scaler/worker/agent/processor/streaming_buffer.py +28 -0
- scaler/worker/agent/processor_holder.py +147 -0
- scaler/worker/agent/processor_manager.py +369 -0
- scaler/worker/agent/profiling_manager.py +109 -0
- scaler/worker/agent/task_manager.py +150 -0
- scaler/worker/agent/timeout_manager.py +19 -0
- scaler/worker/preload.py +84 -0
- scaler/worker/worker.py +265 -0
- scaler/worker_adapter/__init__.py +0 -0
- scaler/worker_adapter/common.py +26 -0
- scaler/worker_adapter/ecs.py +269 -0
- scaler/worker_adapter/native.py +155 -0
- scaler/worker_adapter/symphony/__init__.py +0 -0
- scaler/worker_adapter/symphony/callback.py +45 -0
- scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
- scaler/worker_adapter/symphony/message.py +24 -0
- scaler/worker_adapter/symphony/task_manager.py +289 -0
- scaler/worker_adapter/symphony/worker.py +204 -0
- scaler/worker_adapter/symphony/worker_adapter.py +139 -0
- src/scaler/io/ymq/_ymq.so +0 -0
- src/scaler/object_storage/object_storage_server.so +0 -0
scaler/cluster/combo.py
ADDED
@@ -0,0 +1,150 @@
+import logging
+import socket
+from typing import Dict, Optional, Tuple
+
+from scaler.cluster.cluster import Cluster
+from scaler.cluster.object_storage_server import ObjectStorageServerProcess
+from scaler.cluster.scheduler import SchedulerProcess
+from scaler.config.defaults import (
+    DEFAULT_CLIENT_TIMEOUT_SECONDS,
+    DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
+    DEFAULT_HARD_PROCESSOR_SUSPEND,
+    DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
+    DEFAULT_IO_THREADS,
+    DEFAULT_LOAD_BALANCE_SECONDS,
+    DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
+    DEFAULT_LOGGING_LEVEL,
+    DEFAULT_LOGGING_PATHS,
+    DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
+    DEFAULT_OBJECT_RETENTION_SECONDS,
+    DEFAULT_PER_WORKER_QUEUE_SIZE,
+    DEFAULT_TASK_TIMEOUT_SECONDS,
+    DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
+    DEFAULT_WORKER_DEATH_TIMEOUT,
+    DEFAULT_WORKER_TIMEOUT_SECONDS,
+)
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.zmq import ZMQConfig
+from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
+from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
+from scaler.utility.network_util import get_available_tcp_port
+
+
+class SchedulerClusterCombo:
+    def __init__(
+        self,
+        n_workers: int,
+        address: Optional[str] = None,
+        object_storage_address: Optional[str] = None,
+        monitor_address: Optional[str] = None,
+        per_worker_capabilities: Optional[Dict[str, int]] = None,
+        worker_io_threads: int = DEFAULT_IO_THREADS,
+        scheduler_io_threads: int = DEFAULT_IO_THREADS,
+        max_number_of_tasks_waiting: int = DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
+        heartbeat_interval_seconds: int = DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
+        client_timeout_seconds: int = DEFAULT_CLIENT_TIMEOUT_SECONDS,
+        worker_timeout_seconds: int = DEFAULT_WORKER_TIMEOUT_SECONDS,
+        object_retention_seconds: int = DEFAULT_OBJECT_RETENTION_SECONDS,
+        task_timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
+        death_timeout_seconds: int = DEFAULT_WORKER_DEATH_TIMEOUT,
+        load_balance_seconds: int = DEFAULT_LOAD_BALANCE_SECONDS,
+        load_balance_trigger_times: int = DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
+        garbage_collect_interval_seconds: int = DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
+        trim_memory_threshold_bytes: int = DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
+        per_worker_task_queue_size: int = DEFAULT_PER_WORKER_QUEUE_SIZE,
+        hard_processor_suspend: bool = DEFAULT_HARD_PROCESSOR_SUSPEND,
+        protected: bool = True,
+        allocate_policy: AllocatePolicy = AllocatePolicy.even,
+        event_loop: str = "builtin",
+        logging_paths: Tuple[str, ...] = DEFAULT_LOGGING_PATHS,
+        logging_level: str = DEFAULT_LOGGING_LEVEL,
+        logging_config_file: Optional[str] = None,
+    ):
+        if address is None:
+            self._address = ZMQConfig.from_string(f"tcp://127.0.0.1:{get_available_tcp_port()}")
+        else:
+            self._address = ZMQConfig.from_string(address)
+
+        if object_storage_address is None:
+            self._object_storage_address = ObjectStorageConfig(self._address.host, get_available_tcp_port())
+        else:
+            self._object_storage_address = ObjectStorageConfig.from_string(object_storage_address)
+
+        if monitor_address is None:
+            self._monitor_address = None
+        else:
+            self._monitor_address = ZMQConfig.from_string(monitor_address)
+
+        self._object_storage = ObjectStorageServerProcess(
+            object_storage_address=self._object_storage_address,
+            logging_paths=logging_paths,
+            logging_level=logging_level,
+            logging_config_file=logging_config_file,
+        )
+        self._object_storage.start()
+        self._object_storage.wait_until_ready()  # object storage should be ready before starting the cluster
+
+        self._cluster = Cluster(
+            address=self._address,
+            object_storage_address=self._object_storage_address,
+            preload=None,
+            worker_io_threads=worker_io_threads,
+            worker_names=[f"{socket.gethostname().split('.')[0]}" for _ in range(n_workers)],
+            per_worker_capabilities=per_worker_capabilities or {},
+            per_worker_task_queue_size=per_worker_task_queue_size,
+            heartbeat_interval_seconds=heartbeat_interval_seconds,
+            task_timeout_seconds=task_timeout_seconds,
+            death_timeout_seconds=death_timeout_seconds,
+            garbage_collect_interval_seconds=garbage_collect_interval_seconds,
+            trim_memory_threshold_bytes=trim_memory_threshold_bytes,
+            hard_processor_suspend=hard_processor_suspend,
+            event_loop=event_loop,
+            logging_paths=logging_paths,
+            logging_config_file=logging_config_file,
+            logging_level=logging_level,
+        )
+
+        self._scheduler = SchedulerProcess(
+            address=self._address,
+            object_storage_address=self._object_storage_address,
+            monitor_address=self._monitor_address,
+            io_threads=scheduler_io_threads,
+            max_number_of_tasks_waiting=max_number_of_tasks_waiting,
+            client_timeout_seconds=client_timeout_seconds,
+            scaling_controller_strategy=ScalingControllerStrategy.NULL,
+            adapter_webhook_urls=(),
+            worker_timeout_seconds=worker_timeout_seconds,
+            object_retention_seconds=object_retention_seconds,
+            load_balance_seconds=load_balance_seconds,
+            load_balance_trigger_times=load_balance_trigger_times,
+            protected=protected,
+            allocate_policy=allocate_policy,
+            event_loop=event_loop,
+            logging_paths=logging_paths,
+            logging_config_file=logging_config_file,
+            logging_level=logging_level,
+        )
+
+        self._cluster.start()
+        self._scheduler.start()
+        logging.info(f"{self.__get_prefix()} started")
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        logging.info(f"{self.__get_prefix()} shutdown")
+        self._cluster.terminate()
+        self._scheduler.terminate()
+        self._cluster.join()
+        self._scheduler.join()
+
+        # object storage should terminate after the cluster and scheduler.
+        self._object_storage.terminate()
+        self._object_storage.join()
+
+    def get_address(self) -> str:
+        return self._address.to_address()
+
+    def __get_prefix(self):
+        return f"{self.__class__.__name__}:"
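For orientation, a minimal usage sketch of the class added above. It exercises only the `get_address()` and `shutdown()` methods visible in this diff; it is illustrative and not part of the package.

# Illustrative only: start an in-process scheduler plus workers, then tear it down.
from scaler.cluster.combo import SchedulerClusterCombo

combo = SchedulerClusterCombo(n_workers=2)  # binds to a free local TCP port by default
print(combo.get_address())                  # e.g. tcp://127.0.0.1:<port>, the address a client would connect to
combo.shutdown()                            # terminates cluster and scheduler, then the object storage server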
scaler/cluster/object_storage_server.py
ADDED
@@ -0,0 +1,45 @@
+import logging
+import multiprocessing
+from typing import Optional, Tuple
+
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.object_storage.object_storage_server import ObjectStorageServer
+from scaler.utility.logging.utility import get_logger_info, setup_logger
+
+
+class ObjectStorageServerProcess(multiprocessing.get_context("fork").Process):  # type: ignore[misc]
+    def __init__(
+        self,
+        object_storage_address: ObjectStorageConfig,
+        logging_paths: Tuple[str, ...],
+        logging_level: str,
+        logging_config_file: Optional[str],
+    ):
+        multiprocessing.Process.__init__(self, name="ObjectStorageServer")
+
+        self._logging_paths = logging_paths
+        self._logging_level = logging_level
+        self._logging_config_file = logging_config_file
+
+        self._object_storage_address = object_storage_address
+
+        self._server = ObjectStorageServer()
+
+    def wait_until_ready(self) -> None:
+        """Blocks until the object storage server is available to server requests."""
+        self._server.wait_until_ready()
+
+    def run(self) -> None:
+        setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
+        logging.info(f"ObjectStorageServer: start and listen to {self._object_storage_address.to_string()}")
+
+        log_format_str, log_level_str, logging_paths = get_logger_info(logging.getLogger())
+
+        self._server.run(
+            self._object_storage_address.host,
+            self._object_storage_address.port,
+            self._object_storage_address.identity,
+            log_level_str,
+            log_format_str,
+            logging_paths,
+        )
scaler/cluster/scheduler.py
ADDED
@@ -0,0 +1,86 @@
+import asyncio
+import multiprocessing
+import signal
+from asyncio import AbstractEventLoop, Task
+from typing import Any, Optional, Tuple
+
+from scaler.config.section.scheduler import SchedulerConfig
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.zmq import ZMQConfig
+from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
+from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
+from scaler.scheduler.scheduler import Scheduler, scheduler_main
+from scaler.utility.event_loop import register_event_loop
+from scaler.utility.logging.utility import setup_logger
+
+
+class SchedulerProcess(multiprocessing.get_context("spawn").Process):  # type: ignore[misc]
+    def __init__(
+        self,
+        address: ZMQConfig,
+        object_storage_address: Optional[ObjectStorageConfig],
+        monitor_address: Optional[ZMQConfig],
+        scaling_controller_strategy: ScalingControllerStrategy,
+        adapter_webhook_urls: Tuple[str, ...],
+        io_threads: int,
+        max_number_of_tasks_waiting: int,
+        client_timeout_seconds: int,
+        worker_timeout_seconds: int,
+        object_retention_seconds: int,
+        load_balance_seconds: int,
+        load_balance_trigger_times: int,
+        protected: bool,
+        allocate_policy: AllocatePolicy,
+        event_loop: str,
+        logging_paths: Tuple[str, ...],
+        logging_config_file: Optional[str],
+        logging_level: str,
+    ):
+        multiprocessing.Process.__init__(self, name="Scheduler")
+        self._scheduler_config = SchedulerConfig(
+            event_loop=event_loop,
+            scheduler_address=address,
+            object_storage_address=object_storage_address,
+            monitor_address=monitor_address,
+            scaling_controller_strategy=scaling_controller_strategy,
+            adapter_webhook_urls=adapter_webhook_urls,
+            io_threads=io_threads,
+            max_number_of_tasks_waiting=max_number_of_tasks_waiting,
+            client_timeout_seconds=client_timeout_seconds,
+            worker_timeout_seconds=worker_timeout_seconds,
+            object_retention_seconds=object_retention_seconds,
+            load_balance_seconds=load_balance_seconds,
+            load_balance_trigger_times=load_balance_trigger_times,
+            protected=protected,
+            allocate_policy=allocate_policy,
+        )
+
+        self._logging_paths = logging_paths
+        self._logging_config_file = logging_config_file
+        self._logging_level = logging_level
+
+        self._scheduler: Optional[Scheduler] = None
+        self._loop: Optional[AbstractEventLoop] = None
+        self._task: Optional[Task[Any]] = None
+
+    def run(self) -> None:
+        # scheduler have its own single process
+        setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
+        register_event_loop(self._scheduler_config.event_loop)
+
+        self._loop = asyncio.get_event_loop()
+        SchedulerProcess.__register_signal(self._loop)
+
+        self._task = self._loop.create_task(scheduler_main(self._scheduler_config))
+
+        self._loop.run_until_complete(self._task)
+
+    @staticmethod
+    def __register_signal(loop):
+        loop.add_signal_handler(signal.SIGINT, SchedulerProcess.__handle_signal)
+        loop.add_signal_handler(signal.SIGTERM, SchedulerProcess.__handle_signal)
+
+    @staticmethod
+    def __handle_signal():
+        for task in asyncio.all_tasks():
+            task.cancel()
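The scheduler process above shuts down by cancelling every asyncio task when SIGINT or SIGTERM arrives. A standalone sketch of that pattern (Unix-only, since loop.add_signal_handler is not available on Windows; the sleep stands in for scheduler_main):

import asyncio
import signal


async def main() -> None:
    loop = asyncio.get_running_loop()

    def cancel_everything() -> None:
        # Same idea as SchedulerProcess.__handle_signal above: cancel all running tasks.
        for task in asyncio.all_tasks():
            task.cancel()

    loop.add_signal_handler(signal.SIGINT, cancel_everything)
    loop.add_signal_handler(signal.SIGTERM, cancel_everything)

    try:
        await asyncio.sleep(3600)  # stand-in for scheduler_main(config)
    except asyncio.CancelledError:
        pass


asyncio.run(main())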
scaler/config/__init__.py
File without changes
scaler/config/defaults.py
ADDED
@@ -0,0 +1,94 @@
+import os
+
+from scaler.config.types.network_backend import NetworkBackend
+
+# ==============
+# SYSTEM OPTIONS
+
+# object clean up time interval
+CLEANUP_INTERVAL_SECONDS = 1
+
+# status report interval, used by poke or scaled monitor
+STATUS_REPORT_INTERVAL_SECONDS = 1
+
+# number of seconds for profiling
+PROFILING_INTERVAL_SECONDS = 1
+
+# cap'n proto only allow Data/Text/Blob size to be as big as 500MB
+CAPNP_DATA_SIZE_LIMIT = 2**29 - 1
+
+# message size limitation, max can be 2**64
+CAPNP_MESSAGE_SIZE_LIMIT = 2**64 - 1
+
+# ==========================
+# SCHEDULER SPECIFIC OPTIONS
+
+# number of threads for zmq socket to handle
+DEFAULT_IO_THREADS = 1
+
+# if all workers are full and busy working, this option determine how many additional tasks scheduler can receive and
+# queued, if additional number of tasks received exceeded this number, scheduler will reject tasks
+DEFAULT_MAX_NUMBER_OF_TASKS_WAITING = -1
+
+# if didn't receive heartbeat for following seconds, then scheduler will treat worker as dead and reschedule unfinished
+# tasks for this worker
+DEFAULT_WORKER_TIMEOUT_SECONDS = 60
+
+# if didn't receive heartbeat for following seconds, then scheduler will treat client as dead and cancel remaining
+# tasks for this client
+DEFAULT_CLIENT_TIMEOUT_SECONDS = 60
+
+# number of seconds for load balance, if value is -1 means disable load balance
+DEFAULT_LOAD_BALANCE_SECONDS = 1
+
+# when load balance advice happened repeatedly and always be the same, we issue load balance request when exact repeated
+# times happened
+DEFAULT_LOAD_BALANCE_TRIGGER_TIMES = 2
+
+# number of tasks can be queued to each worker on scheduler side
+DEFAULT_PER_WORKER_QUEUE_SIZE = 1000
+
+# =======================
+# WORKER SPECIFIC OPTIONS
+
+# number of workers, echo worker use 1 process
+DEFAULT_NUMBER_OF_WORKER = os.cpu_count() - 1
+
+# number of seconds that worker agent send heartbeat to scheduler
+DEFAULT_HEARTBEAT_INTERVAL_SECONDS = 2
+
+# number of seconds the object cache kept in worker's memory
+DEFAULT_OBJECT_RETENTION_SECONDS = 60
+
+# number of seconds worker doing garbage collection
+DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS = 30
+
+# number of bytes threshold for worker process that trigger deep garbage collection
+DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES = 1024 * 1024 * 1024
+
+# default task timeout seconds, 0 means never timeout
+DEFAULT_TASK_TIMEOUT_SECONDS = 0
+
+# number of seconds that worker agent wait for processor to finish before killing it
+DEFAULT_PROCESSOR_KILL_DELAY_SECONDS = 3
+
+# number of seconds without scheduler contact before worker shuts down
+DEFAULT_WORKER_DEATH_TIMEOUT = 5 * 60
+
+# if true, suspended worker's processors will be actively suspended with a SIGTSTP signal, otherwise a synchronization
+# event will be used.
+DEFAULT_HARD_PROCESSOR_SUSPEND = False
+
+# =======================
+# LOGGING SPECIFIC OPTIONS
+
+# default logging level
+DEFAULT_LOGGING_LEVEL = "INFO"
+
+# default logging paths
+DEFAULT_LOGGING_PATHS = ("/dev/stdout",)
+
+# =======================
+# SCALER NETWORK BACKEND SPECIFIC OPTIONS
+
+SCALER_NETWORK_BACKEND = NetworkBackend.tcp_zmq
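As a quick sanity check on the cap'n proto limit above, 2**29 - 1 is one byte short of 512 MiB, which is what the "500MB" comment is rounding to:

print(2**29 - 1)                   # 536870911 bytes
print((2**29 - 1) / (1024 * 1024))  # ~512.0 MiB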
scaler/config/loader.py
ADDED
@@ -0,0 +1,96 @@
+import argparse
+import dataclasses
+import enum
+import sys
+from typing import Any, Dict, Optional, Type, TypeVar, Union, cast, get_args, get_origin
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    import tomli as tomllib
+
+from scaler.config.mixins import ConfigType
+
+T = TypeVar("T")
+
+
+def load_config(
+    config_class: Type[T], config_path: Optional[str], args: argparse.Namespace, section_name: Optional[str] = None
+) -> T:
+    """
+    Loads configuration for a given dataclass from a TOML file and overrides it with command-line arguments.
+    """
+    if not dataclasses.is_dataclass(config_class):
+        raise TypeError(f"{config_class.__name__} is not a dataclass and cannot be used with this config loader.")
+
+    config_from_file = {}
+    if config_path:
+        try:
+            with open(config_path, "rb") as f:
+                try:
+                    full_config = tomllib.load(f)
+                except tomllib.TOMLDecodeError as e:
+                    raise ValueError(f"Error parsing TOML file at {config_path}: {e}") from e
+
+            if section_name:
+                config_from_file = full_config.get(section_name, {})
+            else:
+                config_from_file = full_config
+        except FileNotFoundError:
+            raise FileNotFoundError(f"Configuration file not found at: {config_path}")
+
+    config_from_args = {k: v for k, v in vars(args).items() if v is not None}
+    merged_config_data = {**config_from_file, **config_from_args}
+
+    valid_keys = {f.name for f in dataclasses.fields(config_class)}
+    unknown_keys = set(merged_config_data.keys()) - valid_keys - {"config"}
+    if unknown_keys:
+        raise ValueError(f"Unknown configuration key(s) for {config_class.__name__}: {', '.join(unknown_keys)}")
+
+    final_kwargs: Dict[str, Any] = {}
+    for field in dataclasses.fields(config_class):
+        if field.name in merged_config_data:
+            raw_value = merged_config_data[field.name]
+            field_type = field.type
+            is_optional = get_origin(field_type) is Union
+            if is_optional:
+                possible_types = [t for t in get_args(field_type) if t is not type(None)]
+                actual_type = possible_types[0] if possible_types else field_type
+            else:
+                actual_type = field_type
+
+            if (
+                isinstance(raw_value, str)
+                and isinstance(actual_type, type)
+                and issubclass(actual_type, ConfigType)
+                and not isinstance(raw_value, actual_type)
+            ):
+                final_kwargs[field.name] = actual_type.from_string(raw_value)
+            elif isinstance(raw_value, str) and isinstance(actual_type, type) and issubclass(actual_type, enum.Enum):
+                try:
+                    final_kwargs[field.name] = actual_type[raw_value]
+                except KeyError as e:
+                    raise ValueError(f"'{raw_value}' is not a valid member for {actual_type.__name__}") from e
+            elif isinstance(raw_value, list) and get_origin(field.type) is tuple:
+                final_kwargs[field.name] = tuple(raw_value)
+            else:
+                final_kwargs[field.name] = raw_value
+
+    try:
+        return cast(T, config_class(**final_kwargs))
+    except TypeError as e:
+        missing_fields = [
+            f.name
+            for f in dataclasses.fields(config_class)
+            if f.init
+            and f.name not in final_kwargs
+            and f.default is dataclasses.MISSING
+            and f.default_factory is dataclasses.MISSING
+        ]
+        if missing_fields:
+            raise ValueError(
+                f"Missing required configuration arguments: {', '.join(missing_fields)}. "
+                f"Please provide them via command line or a TOML config file."
+            ) from e
+        else:
+            raise e
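A hedged usage sketch of load_config: the dataclass, flag names, and values below are hypothetical and only illustrate the merge order implemented above (CLI overrides TOML, TOML overrides dataclass defaults).

import argparse
import dataclasses

from scaler.config.loader import load_config


@dataclasses.dataclass
class DemoConfig:  # hypothetical config section, not shipped with the package
    host: str
    port: int = 8080


parser = argparse.ArgumentParser()
parser.add_argument("--config", default=None)  # path to a TOML file, consumed by the loader itself
parser.add_argument("--host", default=None)
parser.add_argument("--port", type=int, default=None)
args = parser.parse_args(["--host", "0.0.0.0"])

# No TOML file here, so values come from the CLI where given and from the
# dataclass defaults otherwise.
config = load_config(DemoConfig, args.config, args)
print(config)  # DemoConfig(host='0.0.0.0', port=8080)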
scaler/config/mixins.py
ADDED
@@ -0,0 +1,20 @@
+import abc
+import sys
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+
+
+class ConfigType(metaclass=abc.ABCMeta):
+    """A base class for composite config values that can be parsed and serialized from/to a string."""
+
+    @classmethod
+    @abc.abstractmethod
+    def from_string(cls, value: str) -> Self:
+        pass
+
+    @abc.abstractmethod
+    def __str__(self) -> str:
+        pass
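An illustrative ConfigType implementation (hypothetical, not part of the package) showing the contract the loader relies on: from_string parses the TOML/CLI string form, and __str__ serializes it back.

import dataclasses

from scaler.config.mixins import ConfigType


@dataclasses.dataclass
class HostPort(ConfigType):  # hypothetical example type
    host: str
    port: int

    @classmethod
    def from_string(cls, value: str) -> "HostPort":
        host, port = value.rsplit(":", 1)
        return cls(host=host, port=int(port))

    def __str__(self) -> str:
        return f"{self.host}:{self.port}"


assert str(HostPort.from_string("127.0.0.1:2345")) == "127.0.0.1:2345"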
scaler/config/section/__init__.py
File without changes
scaler/config/section/cluster.py
ADDED
@@ -0,0 +1,55 @@
+import dataclasses
+from typing import Optional, Tuple
+
+from scaler.config import defaults
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.worker import WorkerCapabilities, WorkerNames
+from scaler.config.types.zmq import ZMQConfig
+from scaler.utility.logging.utility import LoggingLevel
+
+
+@dataclasses.dataclass
+class ClusterConfig:
+    scheduler_address: ZMQConfig
+    object_storage_address: Optional[ObjectStorageConfig] = None
+    preload: Optional[str] = None
+    worker_io_threads: int = defaults.DEFAULT_IO_THREADS
+    worker_names: WorkerNames = dataclasses.field(default_factory=lambda: WorkerNames.from_string(""))
+    num_of_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
+    per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+        default_factory=lambda: WorkerCapabilities.from_string("")
+    )
+    per_worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+    heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+    task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+    death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+    garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+    trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+    hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+    event_loop: str = "builtin"
+    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+    logging_config_file: Optional[str] = None
+    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+
+    def __post_init__(self):
+        if self.worker_io_threads <= 0:
+            raise ValueError("worker_io_threads must be a positive integer.")
+        if self.worker_names.names and len(self.worker_names.names) != self.num_of_workers:
+            raise ValueError(
+                f"The number of worker_names ({len(self.worker_names.names)}) \
+                must match num_of_workers ({self.num_of_workers})."
+            )
+        if self.per_worker_task_queue_size <= 0:
+            raise ValueError("per_worker_task_queue_size must be positive.")
+        if (
+            self.heartbeat_interval_seconds <= 0
+            or self.task_timeout_seconds < 0
+            or self.death_timeout_seconds <= 0
+            or self.garbage_collect_interval_seconds <= 0
+        ):
+            raise ValueError("All interval/timeout second values must be positive.")
+        if self.trim_memory_threshold_bytes < 0:
+            raise ValueError("trim_memory_threshold_bytes cannot be negative.")
+        valid_levels = {level.name for level in LoggingLevel}
+        if self.logging_level.upper() not in valid_levels:
+            raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")
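A minimal construction sketch, assuming ZMQConfig.from_string accepts a tcp:// address as it does in combo.py above; every omitted field falls back to the defaults from scaler/config/defaults.py and is validated in __post_init__.

from scaler.config.section.cluster import ClusterConfig
from scaler.config.types.zmq import ZMQConfig

# Only the required scheduler address is supplied; everything else uses defaults.
config = ClusterConfig(scheduler_address=ZMQConfig.from_string("tcp://127.0.0.1:2345"))
print(config.heartbeat_interval_seconds)  # 2, i.e. DEFAULT_HEARTBEAT_INTERVAL_SECONDS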
scaler/config/section/ecs_worker_adapter.py
ADDED
@@ -0,0 +1,85 @@
+import dataclasses
+from typing import List, Optional, Tuple
+
+from scaler.config import defaults
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.worker import WorkerCapabilities
+from scaler.config.types.zmq import ZMQConfig
+from scaler.utility.logging.utility import LoggingLevel
+
+
+@dataclasses.dataclass
+class ECSWorkerAdapterConfig:
+    # Server (adapter) configuration
+    adapter_web_host: str
+    adapter_web_port: int
+
+    scheduler_address: ZMQConfig
+    object_storage_address: Optional[ObjectStorageConfig] = None
+
+    # AWS / ECS specific configuration
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+    aws_region: str = "us-east-1"
+    ecs_subnets: List[str] = dataclasses.field(default_factory=list)
+    ecs_cluster: str = "scaler-cluster"
+    ecs_task_image: str = "public.ecr.aws/v4u8j8r6/scaler:latest"
+    ecs_python_requirements: str = "tomli;pargraph;parfun;pandas"
+    ecs_python_version: str = "3.12.11"
+    ecs_task_definition: str = "scaler-task-definition"
+    ecs_task_cpu: int = 4
+    ecs_task_memory: int = 30
+
+    # Generic worker adapter options
+    io_threads: int = defaults.DEFAULT_IO_THREADS
+    per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+        default_factory=lambda: WorkerCapabilities.from_string("")
+    )
+    per_worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+    max_instances: int = defaults.DEFAULT_NUMBER_OF_WORKER
+    heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+    task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+    death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+    garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+    trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+    hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+    event_loop: str = "builtin"
+    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+    logging_config_file: Optional[str] = None
+
+    def __post_init__(self):
+        # Validate server fields
+        if not isinstance(self.adapter_web_host, str):
+            raise TypeError(f"adapter_web_host should be string, given {self.adapter_web_host}")
+        if not isinstance(self.adapter_web_port, int) or not (1 <= self.adapter_web_port <= 65535):
+            raise ValueError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
+
+        # Validate numeric and collection values
+        if self.io_threads <= 0:
+            raise ValueError("io_threads must be a positive integer.")
+        if self.per_worker_task_queue_size <= 0:
+            raise ValueError("worker_task_queue_size must be positive.")
+        if self.ecs_task_cpu <= 0:
+            raise ValueError("ecs_task_cpu must be a positive integer.")
+        if self.ecs_task_memory <= 0:
+            raise ValueError("ecs_task_memory must be a positive integer.")
+        if self.heartbeat_interval_seconds <= 0 or self.death_timeout_seconds <= 0:
+            raise ValueError("All interval/timeout second values must be positive.")
+        if self.max_instances != -1 and self.max_instances <= 0:
+            raise ValueError("max_instances must be -1 (no limit) or a positive integer.")
+        if not isinstance(self.ecs_subnets, list) or len(self.ecs_subnets) == 0:
+            raise ValueError("ecs_subnets must be a non-empty list of subnet ids.")
+
+        # Validate required strings
+        if not self.ecs_cluster:
+            raise ValueError("ecs_cluster cannot be an empty string.")
+        if not self.ecs_task_definition:
+            raise ValueError("ecs_task_definition cannot be an empty string.")
+        if not self.ecs_task_image:
+            raise ValueError("ecs_task_image cannot be an empty string.")
+
+        # Validate logging level
+        valid_levels = {level.name for level in LoggingLevel}
+        if self.logging_level.upper() not in valid_levels:
+            raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")
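A construction sketch for the ECS adapter config with hypothetical values (the subnet id and addresses are placeholders); note that __post_init__ above rejects an empty ecs_subnets list and an out-of-range adapter_web_port.

from scaler.config.section.ecs_worker_adapter import ECSWorkerAdapterConfig
from scaler.config.types.zmq import ZMQConfig

# Hypothetical values, for illustration only.
config = ECSWorkerAdapterConfig(
    adapter_web_host="0.0.0.0",
    adapter_web_port=8080,
    scheduler_address=ZMQConfig.from_string("tcp://10.0.0.5:2345"),
    ecs_subnets=["subnet-0123456789abcdef0"],
)
print(config.ecs_cluster)  # "scaler-cluster" (default)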
scaler/config/section/native_worker_adapter.py
ADDED
@@ -0,0 +1,43 @@
+import dataclasses
+from typing import Optional, Tuple
+
+from scaler.config import defaults
+from scaler.config.types.object_storage_server import ObjectStorageConfig
+from scaler.config.types.worker import WorkerCapabilities
+from scaler.config.types.zmq import ZMQConfig
+
+
+@dataclasses.dataclass
+class NativeWorkerAdapterConfig:
+    scheduler_address: ZMQConfig
+    object_storage_address: Optional[ObjectStorageConfig] = None
+    adapter_web_host: str = "localhost"
+    adapter_web_port: int = 8080
+    per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+        default_factory=lambda: WorkerCapabilities.from_string("")
+    )
+    io_threads: int = defaults.DEFAULT_IO_THREADS
+    worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+    max_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
+    heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+    task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+    death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+    garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+    trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+    hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+    event_loop: str = "builtin"
+    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+    logging_config_file: Optional[str] = None
+
+    def __post_init__(self):
+        if not isinstance(self.adapter_web_host, str):
+            raise TypeError(f"adapter_web_host should be string, given {self.adapter_web_host}")
+        if not isinstance(self.adapter_web_port, int):
+            raise TypeError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
+        if self.io_threads <= 0:
+            raise ValueError("io_threads must be a positive integer.")
+        if self.worker_task_queue_size <= 0:
+            raise ValueError("worker_task_queue_size must be positive.")
+        if self.heartbeat_interval_seconds <= 0 or self.task_timeout_seconds < 0 or self.death_timeout_seconds <= 0:
+            raise ValueError("All interval/timeout second values must be positive.")