opengris-scaler 1.12.28: opengris_scaler-1.12.28-cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of opengris-scaler might be problematic.

Files changed (187)
  1. opengris_scaler-1.12.28.dist-info/METADATA +728 -0
  2. opengris_scaler-1.12.28.dist-info/RECORD +187 -0
  3. opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +210 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +658 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +115 -0
  32. scaler/cluster/combo.py +150 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/defaults.py +94 -0
  37. scaler/config/loader.py +96 -0
  38. scaler/config/mixins.py +20 -0
  39. scaler/config/section/__init__.py +0 -0
  40. scaler/config/section/cluster.py +55 -0
  41. scaler/config/section/ecs_worker_adapter.py +85 -0
  42. scaler/config/section/native_worker_adapter.py +43 -0
  43. scaler/config/section/object_storage_server.py +8 -0
  44. scaler/config/section/scheduler.py +54 -0
  45. scaler/config/section/symphony_worker_adapter.py +47 -0
  46. scaler/config/section/top.py +13 -0
  47. scaler/config/section/webui.py +21 -0
  48. scaler/config/types/__init__.py +0 -0
  49. scaler/config/types/network_backend.py +12 -0
  50. scaler/config/types/object_storage_server.py +45 -0
  51. scaler/config/types/worker.py +62 -0
  52. scaler/config/types/zmq.py +83 -0
  53. scaler/entry_points/__init__.py +0 -0
  54. scaler/entry_points/cluster.py +133 -0
  55. scaler/entry_points/object_storage_server.py +45 -0
  56. scaler/entry_points/scheduler.py +144 -0
  57. scaler/entry_points/top.py +286 -0
  58. scaler/entry_points/webui.py +48 -0
  59. scaler/entry_points/worker_adapter_ecs.py +191 -0
  60. scaler/entry_points/worker_adapter_native.py +137 -0
  61. scaler/entry_points/worker_adapter_symphony.py +98 -0
  62. scaler/io/__init__.py +0 -0
  63. scaler/io/async_binder.py +89 -0
  64. scaler/io/async_connector.py +95 -0
  65. scaler/io/async_object_storage_connector.py +225 -0
  66. scaler/io/mixins.py +154 -0
  67. scaler/io/sync_connector.py +68 -0
  68. scaler/io/sync_object_storage_connector.py +247 -0
  69. scaler/io/sync_subscriber.py +83 -0
  70. scaler/io/utility.py +80 -0
  71. scaler/io/ymq/__init__.py +0 -0
  72. scaler/io/ymq/_ymq.pyi +95 -0
  73. scaler/io/ymq/ymq.py +138 -0
  74. scaler/io/ymq_async_object_storage_connector.py +184 -0
  75. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  76. scaler/object_storage/__init__.py +0 -0
  77. scaler/protocol/__init__.py +0 -0
  78. scaler/protocol/capnp/__init__.py +0 -0
  79. scaler/protocol/capnp/_python.py +6 -0
  80. scaler/protocol/capnp/common.capnp +68 -0
  81. scaler/protocol/capnp/message.capnp +218 -0
  82. scaler/protocol/capnp/object_storage.capnp +57 -0
  83. scaler/protocol/capnp/status.capnp +73 -0
  84. scaler/protocol/introduction.md +105 -0
  85. scaler/protocol/python/__init__.py +0 -0
  86. scaler/protocol/python/common.py +140 -0
  87. scaler/protocol/python/message.py +751 -0
  88. scaler/protocol/python/mixins.py +13 -0
  89. scaler/protocol/python/object_storage.py +118 -0
  90. scaler/protocol/python/status.py +279 -0
  91. scaler/protocol/worker.md +228 -0
  92. scaler/scheduler/__init__.py +0 -0
  93. scaler/scheduler/allocate_policy/__init__.py +0 -0
  94. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  95. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  96. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  97. scaler/scheduler/allocate_policy/mixins.py +55 -0
  98. scaler/scheduler/controllers/__init__.py +0 -0
  99. scaler/scheduler/controllers/balance_controller.py +65 -0
  100. scaler/scheduler/controllers/client_controller.py +131 -0
  101. scaler/scheduler/controllers/config_controller.py +31 -0
  102. scaler/scheduler/controllers/graph_controller.py +424 -0
  103. scaler/scheduler/controllers/information_controller.py +81 -0
  104. scaler/scheduler/controllers/mixins.py +194 -0
  105. scaler/scheduler/controllers/object_controller.py +147 -0
  106. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  107. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  108. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  109. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  110. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  111. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  112. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  113. scaler/scheduler/controllers/task_controller.py +376 -0
  114. scaler/scheduler/controllers/worker_controller.py +169 -0
  115. scaler/scheduler/object_usage/__init__.py +0 -0
  116. scaler/scheduler/object_usage/object_tracker.py +131 -0
  117. scaler/scheduler/scheduler.py +251 -0
  118. scaler/scheduler/task/__init__.py +0 -0
  119. scaler/scheduler/task/task_state_machine.py +92 -0
  120. scaler/scheduler/task/task_state_manager.py +61 -0
  121. scaler/ui/__init__.py +0 -0
  122. scaler/ui/constants.py +9 -0
  123. scaler/ui/live_display.py +147 -0
  124. scaler/ui/memory_window.py +146 -0
  125. scaler/ui/setting_page.py +40 -0
  126. scaler/ui/task_graph.py +832 -0
  127. scaler/ui/task_log.py +107 -0
  128. scaler/ui/utility.py +66 -0
  129. scaler/ui/webui.py +147 -0
  130. scaler/ui/worker_processors.py +104 -0
  131. scaler/utility/__init__.py +0 -0
  132. scaler/utility/debug.py +19 -0
  133. scaler/utility/event_list.py +63 -0
  134. scaler/utility/event_loop.py +58 -0
  135. scaler/utility/exceptions.py +42 -0
  136. scaler/utility/formatter.py +44 -0
  137. scaler/utility/graph/__init__.py +0 -0
  138. scaler/utility/graph/optimization.py +27 -0
  139. scaler/utility/graph/topological_sorter.py +11 -0
  140. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  141. scaler/utility/identifiers.py +107 -0
  142. scaler/utility/logging/__init__.py +0 -0
  143. scaler/utility/logging/decorators.py +25 -0
  144. scaler/utility/logging/scoped_logger.py +33 -0
  145. scaler/utility/logging/utility.py +183 -0
  146. scaler/utility/many_to_many_dict.py +123 -0
  147. scaler/utility/metadata/__init__.py +0 -0
  148. scaler/utility/metadata/profile_result.py +31 -0
  149. scaler/utility/metadata/task_flags.py +30 -0
  150. scaler/utility/mixins.py +13 -0
  151. scaler/utility/network_util.py +7 -0
  152. scaler/utility/one_to_many_dict.py +72 -0
  153. scaler/utility/queues/__init__.py +0 -0
  154. scaler/utility/queues/async_indexed_queue.py +37 -0
  155. scaler/utility/queues/async_priority_queue.py +70 -0
  156. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  157. scaler/utility/queues/indexed_queue.py +114 -0
  158. scaler/utility/serialization.py +9 -0
  159. scaler/version.txt +1 -0
  160. scaler/worker/__init__.py +0 -0
  161. scaler/worker/agent/__init__.py +0 -0
  162. scaler/worker/agent/heartbeat_manager.py +107 -0
  163. scaler/worker/agent/mixins.py +137 -0
  164. scaler/worker/agent/processor/__init__.py +0 -0
  165. scaler/worker/agent/processor/object_cache.py +107 -0
  166. scaler/worker/agent/processor/processor.py +285 -0
  167. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  168. scaler/worker/agent/processor_holder.py +147 -0
  169. scaler/worker/agent/processor_manager.py +369 -0
  170. scaler/worker/agent/profiling_manager.py +109 -0
  171. scaler/worker/agent/task_manager.py +150 -0
  172. scaler/worker/agent/timeout_manager.py +19 -0
  173. scaler/worker/preload.py +84 -0
  174. scaler/worker/worker.py +265 -0
  175. scaler/worker_adapter/__init__.py +0 -0
  176. scaler/worker_adapter/common.py +26 -0
  177. scaler/worker_adapter/ecs.py +269 -0
  178. scaler/worker_adapter/native.py +155 -0
  179. scaler/worker_adapter/symphony/__init__.py +0 -0
  180. scaler/worker_adapter/symphony/callback.py +45 -0
  181. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  182. scaler/worker_adapter/symphony/message.py +24 -0
  183. scaler/worker_adapter/symphony/task_manager.py +289 -0
  184. scaler/worker_adapter/symphony/worker.py +204 -0
  185. scaler/worker_adapter/symphony/worker_adapter.py +139 -0
  186. src/scaler/io/ymq/_ymq.so +0 -0
  187. src/scaler/object_storage/object_storage_server.so +0 -0
@@ -0,0 +1,150 @@
+ import logging
+ import socket
+ from typing import Dict, Optional, Tuple
+
+ from scaler.cluster.cluster import Cluster
+ from scaler.cluster.object_storage_server import ObjectStorageServerProcess
+ from scaler.cluster.scheduler import SchedulerProcess
+ from scaler.config.defaults import (
+     DEFAULT_CLIENT_TIMEOUT_SECONDS,
+     DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
+     DEFAULT_HARD_PROCESSOR_SUSPEND,
+     DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
+     DEFAULT_IO_THREADS,
+     DEFAULT_LOAD_BALANCE_SECONDS,
+     DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
+     DEFAULT_LOGGING_LEVEL,
+     DEFAULT_LOGGING_PATHS,
+     DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
+     DEFAULT_OBJECT_RETENTION_SECONDS,
+     DEFAULT_PER_WORKER_QUEUE_SIZE,
+     DEFAULT_TASK_TIMEOUT_SECONDS,
+     DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
+     DEFAULT_WORKER_DEATH_TIMEOUT,
+     DEFAULT_WORKER_TIMEOUT_SECONDS,
+ )
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.config.types.zmq import ZMQConfig
+ from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
+ from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
+ from scaler.utility.network_util import get_available_tcp_port
+
+
+ class SchedulerClusterCombo:
+     def __init__(
+         self,
+         n_workers: int,
+         address: Optional[str] = None,
+         object_storage_address: Optional[str] = None,
+         monitor_address: Optional[str] = None,
+         per_worker_capabilities: Optional[Dict[str, int]] = None,
+         worker_io_threads: int = DEFAULT_IO_THREADS,
+         scheduler_io_threads: int = DEFAULT_IO_THREADS,
+         max_number_of_tasks_waiting: int = DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
+         heartbeat_interval_seconds: int = DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
+         client_timeout_seconds: int = DEFAULT_CLIENT_TIMEOUT_SECONDS,
+         worker_timeout_seconds: int = DEFAULT_WORKER_TIMEOUT_SECONDS,
+         object_retention_seconds: int = DEFAULT_OBJECT_RETENTION_SECONDS,
+         task_timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
+         death_timeout_seconds: int = DEFAULT_WORKER_DEATH_TIMEOUT,
+         load_balance_seconds: int = DEFAULT_LOAD_BALANCE_SECONDS,
+         load_balance_trigger_times: int = DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
+         garbage_collect_interval_seconds: int = DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
+         trim_memory_threshold_bytes: int = DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
+         per_worker_task_queue_size: int = DEFAULT_PER_WORKER_QUEUE_SIZE,
+         hard_processor_suspend: bool = DEFAULT_HARD_PROCESSOR_SUSPEND,
+         protected: bool = True,
+         allocate_policy: AllocatePolicy = AllocatePolicy.even,
+         event_loop: str = "builtin",
+         logging_paths: Tuple[str, ...] = DEFAULT_LOGGING_PATHS,
+         logging_level: str = DEFAULT_LOGGING_LEVEL,
+         logging_config_file: Optional[str] = None,
+     ):
+         if address is None:
+             self._address = ZMQConfig.from_string(f"tcp://127.0.0.1:{get_available_tcp_port()}")
+         else:
+             self._address = ZMQConfig.from_string(address)
+
+         if object_storage_address is None:
+             self._object_storage_address = ObjectStorageConfig(self._address.host, get_available_tcp_port())
+         else:
+             self._object_storage_address = ObjectStorageConfig.from_string(object_storage_address)
+
+         if monitor_address is None:
+             self._monitor_address = None
+         else:
+             self._monitor_address = ZMQConfig.from_string(monitor_address)
+
+         self._object_storage = ObjectStorageServerProcess(
+             object_storage_address=self._object_storage_address,
+             logging_paths=logging_paths,
+             logging_level=logging_level,
+             logging_config_file=logging_config_file,
+         )
+         self._object_storage.start()
+         self._object_storage.wait_until_ready()  # object storage should be ready before starting the cluster
+
+         self._cluster = Cluster(
+             address=self._address,
+             object_storage_address=self._object_storage_address,
+             preload=None,
+             worker_io_threads=worker_io_threads,
+             worker_names=[f"{socket.gethostname().split('.')[0]}" for _ in range(n_workers)],
+             per_worker_capabilities=per_worker_capabilities or {},
+             per_worker_task_queue_size=per_worker_task_queue_size,
+             heartbeat_interval_seconds=heartbeat_interval_seconds,
+             task_timeout_seconds=task_timeout_seconds,
+             death_timeout_seconds=death_timeout_seconds,
+             garbage_collect_interval_seconds=garbage_collect_interval_seconds,
+             trim_memory_threshold_bytes=trim_memory_threshold_bytes,
+             hard_processor_suspend=hard_processor_suspend,
+             event_loop=event_loop,
+             logging_paths=logging_paths,
+             logging_config_file=logging_config_file,
+             logging_level=logging_level,
+         )
+
+         self._scheduler = SchedulerProcess(
+             address=self._address,
+             object_storage_address=self._object_storage_address,
+             monitor_address=self._monitor_address,
+             io_threads=scheduler_io_threads,
+             max_number_of_tasks_waiting=max_number_of_tasks_waiting,
+             client_timeout_seconds=client_timeout_seconds,
+             scaling_controller_strategy=ScalingControllerStrategy.NULL,
+             adapter_webhook_urls=(),
+             worker_timeout_seconds=worker_timeout_seconds,
+             object_retention_seconds=object_retention_seconds,
+             load_balance_seconds=load_balance_seconds,
+             load_balance_trigger_times=load_balance_trigger_times,
+             protected=protected,
+             allocate_policy=allocate_policy,
+             event_loop=event_loop,
+             logging_paths=logging_paths,
+             logging_config_file=logging_config_file,
+             logging_level=logging_level,
+         )
+
+         self._cluster.start()
+         self._scheduler.start()
+         logging.info(f"{self.__get_prefix()} started")
+
+     def __del__(self):
+         self.shutdown()
+
+     def shutdown(self):
+         logging.info(f"{self.__get_prefix()} shutdown")
+         self._cluster.terminate()
+         self._scheduler.terminate()
+         self._cluster.join()
+         self._scheduler.join()
+
+         # object storage should terminate after the cluster and scheduler.
+         self._object_storage.terminate()
+         self._object_storage.join()
+
+     def get_address(self) -> str:
+         return self._address.to_address()
+
+     def __get_prefix(self):
+         return f"{self.__class__.__name__}:"
@@ -0,0 +1,45 @@
+ import logging
+ import multiprocessing
+ from typing import Optional, Tuple
+
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.object_storage.object_storage_server import ObjectStorageServer
+ from scaler.utility.logging.utility import get_logger_info, setup_logger
+
+
+ class ObjectStorageServerProcess(multiprocessing.get_context("fork").Process):  # type: ignore[misc]
+     def __init__(
+         self,
+         object_storage_address: ObjectStorageConfig,
+         logging_paths: Tuple[str, ...],
+         logging_level: str,
+         logging_config_file: Optional[str],
+     ):
+         multiprocessing.Process.__init__(self, name="ObjectStorageServer")
+
+         self._logging_paths = logging_paths
+         self._logging_level = logging_level
+         self._logging_config_file = logging_config_file
+
+         self._object_storage_address = object_storage_address
+
+         self._server = ObjectStorageServer()
+
+     def wait_until_ready(self) -> None:
+         """Blocks until the object storage server is available to serve requests."""
+         self._server.wait_until_ready()
+
+     def run(self) -> None:
+         setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
+         logging.info(f"ObjectStorageServer: start and listen to {self._object_storage_address.to_string()}")
+
+         log_format_str, log_level_str, logging_paths = get_logger_info(logging.getLogger())
+
+         self._server.run(
+             self._object_storage_address.host,
+             self._object_storage_address.port,
+             self._object_storage_address.identity,
+             log_level_str,
+             log_format_str,
+             logging_paths,
+         )
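Note the ordering that makes wait_until_ready work here: ObjectStorageServer is constructed before the fork, so parent and child share whatever readiness primitive it creates internally (the implementation lives in the bundled object_storage_server.so and is not visible in this diff). A pure-Python sketch of the same pattern, using a multiprocessing Event as a stand-in:

import multiprocessing

class ReadySignallingProcess(multiprocessing.get_context("fork").Process):
    def __init__(self):
        super().__init__(name="Server")
        # created before fork, so the Event is shared between parent and child
        self._ready = multiprocessing.get_context("fork").Event()

    def wait_until_ready(self) -> None:
        self._ready.wait()  # the parent blocks here until the child signals

    def run(self) -> None:
        # ... bind sockets, then announce readiness to the parent
        self._ready.set()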
@@ -0,0 +1,86 @@
+ import asyncio
+ import multiprocessing
+ import signal
+ from asyncio import AbstractEventLoop, Task
+ from typing import Any, Optional, Tuple
+
+ from scaler.config.section.scheduler import SchedulerConfig
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.config.types.zmq import ZMQConfig
+ from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
+ from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
+ from scaler.scheduler.scheduler import Scheduler, scheduler_main
+ from scaler.utility.event_loop import register_event_loop
+ from scaler.utility.logging.utility import setup_logger
+
+
+ class SchedulerProcess(multiprocessing.get_context("spawn").Process):  # type: ignore[misc]
+     def __init__(
+         self,
+         address: ZMQConfig,
+         object_storage_address: Optional[ObjectStorageConfig],
+         monitor_address: Optional[ZMQConfig],
+         scaling_controller_strategy: ScalingControllerStrategy,
+         adapter_webhook_urls: Tuple[str, ...],
+         io_threads: int,
+         max_number_of_tasks_waiting: int,
+         client_timeout_seconds: int,
+         worker_timeout_seconds: int,
+         object_retention_seconds: int,
+         load_balance_seconds: int,
+         load_balance_trigger_times: int,
+         protected: bool,
+         allocate_policy: AllocatePolicy,
+         event_loop: str,
+         logging_paths: Tuple[str, ...],
+         logging_config_file: Optional[str],
+         logging_level: str,
+     ):
+         multiprocessing.Process.__init__(self, name="Scheduler")
+         self._scheduler_config = SchedulerConfig(
+             event_loop=event_loop,
+             scheduler_address=address,
+             object_storage_address=object_storage_address,
+             monitor_address=monitor_address,
+             scaling_controller_strategy=scaling_controller_strategy,
+             adapter_webhook_urls=adapter_webhook_urls,
+             io_threads=io_threads,
+             max_number_of_tasks_waiting=max_number_of_tasks_waiting,
+             client_timeout_seconds=client_timeout_seconds,
+             worker_timeout_seconds=worker_timeout_seconds,
+             object_retention_seconds=object_retention_seconds,
+             load_balance_seconds=load_balance_seconds,
+             load_balance_trigger_times=load_balance_trigger_times,
+             protected=protected,
+             allocate_policy=allocate_policy,
+         )
+
+         self._logging_paths = logging_paths
+         self._logging_config_file = logging_config_file
+         self._logging_level = logging_level
+
+         self._scheduler: Optional[Scheduler] = None
+         self._loop: Optional[AbstractEventLoop] = None
+         self._task: Optional[Task[Any]] = None
+
+     def run(self) -> None:
+         # the scheduler runs in its own dedicated process
+         setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
+         register_event_loop(self._scheduler_config.event_loop)
+
+         self._loop = asyncio.get_event_loop()
+         SchedulerProcess.__register_signal(self._loop)
+
+         self._task = self._loop.create_task(scheduler_main(self._scheduler_config))
+
+         self._loop.run_until_complete(self._task)
+
+     @staticmethod
+     def __register_signal(loop):
+         loop.add_signal_handler(signal.SIGINT, SchedulerProcess.__handle_signal)
+         loop.add_signal_handler(signal.SIGTERM, SchedulerProcess.__handle_signal)
+
+     @staticmethod
+     def __handle_signal():
+         for task in asyncio.all_tasks():
+             task.cancel()
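The shutdown path above relies on a common asyncio pattern: the signal handler cancels every task, which makes run_until_complete return once the main coroutine unwinds. A self-contained sketch of that pattern, independent of the scaler internals this diff does not show:

import asyncio
import signal

async def main() -> None:
    try:
        await asyncio.sleep(3600)  # stand-in for the scheduler's main loop
    except asyncio.CancelledError:
        print("cancelled, cleaning up")  # cleanup would happen here
        raise

def handle_signal() -> None:
    # runs inside the event loop thread, so all_tasks() sees the running loop
    for task in asyncio.all_tasks():
        task.cancel()

loop = asyncio.new_event_loop()
loop.add_signal_handler(signal.SIGINT, handle_signal)
loop.add_signal_handler(signal.SIGTERM, handle_signal)
try:
    loop.run_until_complete(loop.create_task(main()))
except asyncio.CancelledError:
    pass  # cancellation propagates out of run_until_complete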
@@ -0,0 +1,94 @@
+ import os
+
+ from scaler.config.types.network_backend import NetworkBackend
+
+ # ==============
+ # SYSTEM OPTIONS
+
+ # interval between object clean-up passes
+ CLEANUP_INTERVAL_SECONDS = 1
+
+ # status report interval, used by poke or scaled monitor
+ STATUS_REPORT_INTERVAL_SECONDS = 1
+
+ # number of seconds for profiling
+ PROFILING_INTERVAL_SECONDS = 1
+
+ # Cap'n Proto only allows a Data/Text/Blob to be as big as 512 MB
+ CAPNP_DATA_SIZE_LIMIT = 2**29 - 1
+
+ # message size limit; the max can be 2**64
+ CAPNP_MESSAGE_SIZE_LIMIT = 2**64 - 1
+
+ # ==========================
+ # SCHEDULER SPECIFIC OPTIONS
+
+ # number of threads for the zmq socket to handle
+ DEFAULT_IO_THREADS = 1
+
+ # if all workers are full and busy working, this option determines how many additional tasks the scheduler can
+ # receive and queue; once the number of queued tasks exceeds this value, the scheduler rejects new tasks
+ DEFAULT_MAX_NUMBER_OF_TASKS_WAITING = -1
+
+ # if no heartbeat is received for this many seconds, the scheduler treats the worker as dead and reschedules its
+ # unfinished tasks
+ DEFAULT_WORKER_TIMEOUT_SECONDS = 60
+
+ # if no heartbeat is received for this many seconds, the scheduler treats the client as dead and cancels its
+ # remaining tasks
+ DEFAULT_CLIENT_TIMEOUT_SECONDS = 60
+
+ # load balance interval in seconds; -1 disables load balancing
+ DEFAULT_LOAD_BALANCE_SECONDS = 1
+
+ # when the same load balance advice is produced repeatedly, issue the load balance request after exactly this many
+ # repetitions
+ DEFAULT_LOAD_BALANCE_TRIGGER_TIMES = 2
+
+ # number of tasks that can be queued to each worker on the scheduler side
+ DEFAULT_PER_WORKER_QUEUE_SIZE = 1000
+
+ # =======================
+ # WORKER SPECIFIC OPTIONS
+
+ # number of workers; each worker uses 1 process
+ DEFAULT_NUMBER_OF_WORKER = os.cpu_count() - 1
+
+ # interval in seconds at which the worker agent sends heartbeats to the scheduler
+ DEFAULT_HEARTBEAT_INTERVAL_SECONDS = 2
+
+ # number of seconds cached objects are kept in the worker's memory
+ DEFAULT_OBJECT_RETENTION_SECONDS = 60
+
+ # interval in seconds at which the worker runs garbage collection
+ DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS = 30
+
+ # byte threshold for the worker process that triggers deep garbage collection
+ DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES = 1024 * 1024 * 1024
+
+ # default task timeout in seconds; 0 means never time out
+ DEFAULT_TASK_TIMEOUT_SECONDS = 0
+
+ # number of seconds the worker agent waits for a processor to finish before killing it
+ DEFAULT_PROCESSOR_KILL_DELAY_SECONDS = 3
+
+ # number of seconds without scheduler contact before the worker shuts down
+ DEFAULT_WORKER_DEATH_TIMEOUT = 5 * 60
+
+ # if true, a suspended worker's processors are actively suspended with a SIGTSTP signal; otherwise a
+ # synchronization event is used
+ DEFAULT_HARD_PROCESSOR_SUSPEND = False
+
+ # ========================
+ # LOGGING SPECIFIC OPTIONS
+
+ # default logging level
+ DEFAULT_LOGGING_LEVEL = "INFO"
+
+ # default logging paths
+ DEFAULT_LOGGING_PATHS = ("/dev/stdout",)
+
+ # =======================================
+ # SCALER NETWORK BACKEND SPECIFIC OPTIONS
+
+ SCALER_NETWORK_BACKEND = NetworkBackend.tcp_zmq
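Most of these constants resurface as keyword arguments elsewhere in the package, for example in the SchedulerClusterCombo signature shown above. A tuning sketch using parameter names taken from that signature; the override values are illustrative only:

from scaler.cluster.combo import SchedulerClusterCombo

# Illustrative overrides of the defaults listed above.
combo = SchedulerClusterCombo(
    n_workers=4,
    heartbeat_interval_seconds=5,   # default: 2
    worker_timeout_seconds=120,     # default: 60
    trim_memory_threshold_bytes=2 * 1024 * 1024 * 1024,  # default: 1 GiB
)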
@@ -0,0 +1,96 @@
+ import argparse
+ import dataclasses
+ import enum
+ import sys
+ from typing import Any, Dict, Optional, Type, TypeVar, Union, cast, get_args, get_origin
+
+ if sys.version_info >= (3, 11):
+     import tomllib
+ else:
+     import tomli as tomllib
+
+ from scaler.config.mixins import ConfigType
+
+ T = TypeVar("T")
+
+
+ def load_config(
+     config_class: Type[T], config_path: Optional[str], args: argparse.Namespace, section_name: Optional[str] = None
+ ) -> T:
+     """
+     Loads configuration for a given dataclass from a TOML file and overrides it with command-line arguments.
+     """
+     if not dataclasses.is_dataclass(config_class):
+         raise TypeError(f"{config_class.__name__} is not a dataclass and cannot be used with this config loader.")
+
+     config_from_file = {}
+     if config_path:
+         try:
+             with open(config_path, "rb") as f:
+                 try:
+                     full_config = tomllib.load(f)
+                 except tomllib.TOMLDecodeError as e:
+                     raise ValueError(f"Error parsing TOML file at {config_path}: {e}") from e
+
+                 if section_name:
+                     config_from_file = full_config.get(section_name, {})
+                 else:
+                     config_from_file = full_config
+         except FileNotFoundError:
+             raise FileNotFoundError(f"Configuration file not found at: {config_path}")
+
+     config_from_args = {k: v for k, v in vars(args).items() if v is not None}
+     merged_config_data = {**config_from_file, **config_from_args}
+
+     valid_keys = {f.name for f in dataclasses.fields(config_class)}
+     unknown_keys = set(merged_config_data.keys()) - valid_keys - {"config"}
+     if unknown_keys:
+         raise ValueError(f"Unknown configuration key(s) for {config_class.__name__}: {', '.join(unknown_keys)}")
+
+     final_kwargs: Dict[str, Any] = {}
+     for field in dataclasses.fields(config_class):
+         if field.name in merged_config_data:
+             raw_value = merged_config_data[field.name]
+             field_type = field.type
+             is_optional = get_origin(field_type) is Union
+             if is_optional:
+                 possible_types = [t for t in get_args(field_type) if t is not type(None)]
+                 actual_type = possible_types[0] if possible_types else field_type
+             else:
+                 actual_type = field_type
+
+             if (
+                 isinstance(raw_value, str)
+                 and isinstance(actual_type, type)
+                 and issubclass(actual_type, ConfigType)
+                 and not isinstance(raw_value, actual_type)
+             ):
+                 final_kwargs[field.name] = actual_type.from_string(raw_value)
+             elif isinstance(raw_value, str) and isinstance(actual_type, type) and issubclass(actual_type, enum.Enum):
+                 try:
+                     final_kwargs[field.name] = actual_type[raw_value]
+                 except KeyError as e:
+                     raise ValueError(f"'{raw_value}' is not a valid member for {actual_type.__name__}") from e
+             elif isinstance(raw_value, list) and get_origin(field.type) is tuple:
+                 final_kwargs[field.name] = tuple(raw_value)
+             else:
+                 final_kwargs[field.name] = raw_value
+
+     try:
+         return cast(T, config_class(**final_kwargs))
+     except TypeError as e:
+         missing_fields = [
+             f.name
+             for f in dataclasses.fields(config_class)
+             if f.init
+             and f.name not in final_kwargs
+             and f.default is dataclasses.MISSING
+             and f.default_factory is dataclasses.MISSING
+         ]
+         if missing_fields:
+             raise ValueError(
+                 f"Missing required configuration arguments: {', '.join(missing_fields)}. "
+                 f"Please provide them via command line or a TOML config file."
+             ) from e
+         else:
+             raise e
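A minimal usage sketch of load_config with a toy dataclass; DemoConfig and its fields are invented for illustration, only load_config itself comes from the package:

import argparse
import dataclasses

from scaler.config.loader import load_config

@dataclasses.dataclass
class DemoConfig:
    host: str
    port: int = 8080

parser = argparse.ArgumentParser()
parser.add_argument("--host", default=None)
parser.add_argument("--port", type=int, default=None)
args = parser.parse_args(["--host", "127.0.0.1", "--port", "9000"])

# CLI values override TOML values; None (unset) arguments are ignored.
config = load_config(DemoConfig, None, args)
assert config.port == 9000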
@@ -0,0 +1,20 @@
+ import abc
+ import sys
+
+ if sys.version_info >= (3, 11):
+     from typing import Self
+ else:
+     from typing_extensions import Self
+
+
+ class ConfigType(metaclass=abc.ABCMeta):
+     """A base class for composite config values that can be parsed and serialized from/to a string."""
+
+     @classmethod
+     @abc.abstractmethod
+     def from_string(cls, value: str) -> Self:
+         pass
+
+     @abc.abstractmethod
+     def __str__(self) -> str:
+         pass
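A hypothetical ConfigType implementation, showing the string round-trip contract the loader above relies on; HostPort is not part of the package and exists only for illustration:

import dataclasses

from scaler.config.mixins import ConfigType

@dataclasses.dataclass
class HostPort(ConfigType):
    host: str
    port: int

    @classmethod
    def from_string(cls, value: str) -> "HostPort":
        host, port = value.rsplit(":", 1)  # split on the last colon
        return cls(host=host, port=int(port))

    def __str__(self) -> str:
        return f"{self.host}:{self.port}"

assert str(HostPort.from_string("127.0.0.1:8080")) == "127.0.0.1:8080"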
@@ -0,0 +1,55 @@
+ import dataclasses
+ from typing import Optional, Tuple
+
+ from scaler.config import defaults
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.config.types.worker import WorkerCapabilities, WorkerNames
+ from scaler.config.types.zmq import ZMQConfig
+ from scaler.utility.logging.utility import LoggingLevel
+
+
+ @dataclasses.dataclass
+ class ClusterConfig:
+     scheduler_address: ZMQConfig
+     object_storage_address: Optional[ObjectStorageConfig] = None
+     preload: Optional[str] = None
+     worker_io_threads: int = defaults.DEFAULT_IO_THREADS
+     worker_names: WorkerNames = dataclasses.field(default_factory=lambda: WorkerNames.from_string(""))
+     num_of_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
+     per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+         default_factory=lambda: WorkerCapabilities.from_string("")
+     )
+     per_worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+     heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+     task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+     death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+     garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+     trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+     hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+     event_loop: str = "builtin"
+     logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+     logging_config_file: Optional[str] = None
+     logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+
+     def __post_init__(self):
+         if self.worker_io_threads <= 0:
+             raise ValueError("worker_io_threads must be a positive integer.")
+         if self.worker_names.names and len(self.worker_names.names) != self.num_of_workers:
+             raise ValueError(
+                 f"The number of worker_names ({len(self.worker_names.names)}) "
+                 f"must match num_of_workers ({self.num_of_workers})."
+             )
+         if self.per_worker_task_queue_size <= 0:
+             raise ValueError("per_worker_task_queue_size must be positive.")
+         if (
+             self.heartbeat_interval_seconds <= 0
+             or self.task_timeout_seconds < 0
+             or self.death_timeout_seconds <= 0
+             or self.garbage_collect_interval_seconds <= 0
+         ):
+             raise ValueError("All interval/timeout second values must be positive.")
+         if self.trim_memory_threshold_bytes < 0:
+             raise ValueError("trim_memory_threshold_bytes cannot be negative.")
+         valid_levels = {level.name for level in LoggingLevel}
+         if self.logging_level.upper() not in valid_levels:
+             raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")
@@ -0,0 +1,85 @@
+ import dataclasses
+ from typing import List, Optional, Tuple
+
+ from scaler.config import defaults
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.config.types.worker import WorkerCapabilities
+ from scaler.config.types.zmq import ZMQConfig
+ from scaler.utility.logging.utility import LoggingLevel
+
+
+ @dataclasses.dataclass
+ class ECSWorkerAdapterConfig:
+     # Server (adapter) configuration
+     adapter_web_host: str
+     adapter_web_port: int
+
+     scheduler_address: ZMQConfig
+     object_storage_address: Optional[ObjectStorageConfig] = None
+
+     # AWS / ECS specific configuration
+     aws_access_key_id: Optional[str] = None
+     aws_secret_access_key: Optional[str] = None
+     aws_region: str = "us-east-1"
+     ecs_subnets: List[str] = dataclasses.field(default_factory=list)
+     ecs_cluster: str = "scaler-cluster"
+     ecs_task_image: str = "public.ecr.aws/v4u8j8r6/scaler:latest"
+     ecs_python_requirements: str = "tomli;pargraph;parfun;pandas"
+     ecs_python_version: str = "3.12.11"
+     ecs_task_definition: str = "scaler-task-definition"
+     ecs_task_cpu: int = 4
+     ecs_task_memory: int = 30
+
+     # Generic worker adapter options
+     io_threads: int = defaults.DEFAULT_IO_THREADS
+     per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+         default_factory=lambda: WorkerCapabilities.from_string("")
+     )
+     per_worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+     max_instances: int = defaults.DEFAULT_NUMBER_OF_WORKER
+     heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+     task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+     death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+     garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+     trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+     hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+     event_loop: str = "builtin"
+     logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+     logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+     logging_config_file: Optional[str] = None
+
+     def __post_init__(self):
+         # Validate server fields
+         if not isinstance(self.adapter_web_host, str):
+             raise TypeError(f"adapter_web_host should be string, given {self.adapter_web_host}")
+         if not isinstance(self.adapter_web_port, int) or not (1 <= self.adapter_web_port <= 65535):
+             raise ValueError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
+
+         # Validate numeric and collection values
+         if self.io_threads <= 0:
+             raise ValueError("io_threads must be a positive integer.")
+         if self.per_worker_task_queue_size <= 0:
+             raise ValueError("per_worker_task_queue_size must be positive.")
+         if self.ecs_task_cpu <= 0:
+             raise ValueError("ecs_task_cpu must be a positive integer.")
+         if self.ecs_task_memory <= 0:
+             raise ValueError("ecs_task_memory must be a positive integer.")
+         if self.heartbeat_interval_seconds <= 0 or self.death_timeout_seconds <= 0:
+             raise ValueError("All interval/timeout second values must be positive.")
+         if self.max_instances != -1 and self.max_instances <= 0:
+             raise ValueError("max_instances must be -1 (no limit) or a positive integer.")
+         if not isinstance(self.ecs_subnets, list) or len(self.ecs_subnets) == 0:
+             raise ValueError("ecs_subnets must be a non-empty list of subnet ids.")
+
+         # Validate required strings
+         if not self.ecs_cluster:
+             raise ValueError("ecs_cluster cannot be an empty string.")
+         if not self.ecs_task_definition:
+             raise ValueError("ecs_task_definition cannot be an empty string.")
+         if not self.ecs_task_image:
+             raise ValueError("ecs_task_image cannot be an empty string.")
+
+         # Validate logging level
+         valid_levels = {level.name for level in LoggingLevel}
+         if self.logging_level.upper() not in valid_levels:
+             raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")
@@ -0,0 +1,43 @@
+ import dataclasses
+ from typing import Optional, Tuple
+
+ from scaler.config import defaults
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
+ from scaler.config.types.worker import WorkerCapabilities
+ from scaler.config.types.zmq import ZMQConfig
+
+
+ @dataclasses.dataclass
+ class NativeWorkerAdapterConfig:
+     scheduler_address: ZMQConfig
+     object_storage_address: Optional[ObjectStorageConfig] = None
+     adapter_web_host: str = "localhost"
+     adapter_web_port: int = 8080
+     per_worker_capabilities: WorkerCapabilities = dataclasses.field(
+         default_factory=lambda: WorkerCapabilities.from_string("")
+     )
+     io_threads: int = defaults.DEFAULT_IO_THREADS
+     worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
+     max_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
+     heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
+     task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
+     death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
+     garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
+     trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
+     hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
+     event_loop: str = "builtin"
+     logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
+     logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
+     logging_config_file: Optional[str] = None
+
+     def __post_init__(self):
+         if not isinstance(self.adapter_web_host, str):
+             raise TypeError(f"adapter_web_host should be string, given {self.adapter_web_host}")
+         if not isinstance(self.adapter_web_port, int) or not (1 <= self.adapter_web_port <= 65535):
+             raise ValueError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
+         if self.io_threads <= 0:
+             raise ValueError("io_threads must be a positive integer.")
+         if self.worker_task_queue_size <= 0:
+             raise ValueError("worker_task_queue_size must be positive.")
+         if self.heartbeat_interval_seconds <= 0 or self.task_timeout_seconds < 0 or self.death_timeout_seconds <= 0:
+             raise ValueError("All interval/timeout second values must be positive.")