opengris-scaler 1.12.7__cp38-cp38-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic. Click here for more details.

Files changed (232) hide show
  1. opengris_scaler-1.12.7.dist-info/METADATA +729 -0
  2. opengris_scaler-1.12.7.dist-info/RECORD +232 -0
  3. opengris_scaler-1.12.7.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.7.dist-info/entry_points.txt +9 -0
  5. opengris_scaler-1.12.7.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.7.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.7.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-b787335c.1.0.so +0 -0
  9. opengris_scaler.libs/libkj-1-094aa318.1.0.so +0 -0
  10. scaler/CMakeLists.txt +11 -0
  11. scaler/__init__.py +14 -0
  12. scaler/about.py +5 -0
  13. scaler/client/__init__.py +0 -0
  14. scaler/client/agent/__init__.py +0 -0
  15. scaler/client/agent/client_agent.py +210 -0
  16. scaler/client/agent/disconnect_manager.py +27 -0
  17. scaler/client/agent/future_manager.py +112 -0
  18. scaler/client/agent/heartbeat_manager.py +74 -0
  19. scaler/client/agent/mixins.py +89 -0
  20. scaler/client/agent/object_manager.py +98 -0
  21. scaler/client/agent/task_manager.py +64 -0
  22. scaler/client/client.py +635 -0
  23. scaler/client/future.py +252 -0
  24. scaler/client/object_buffer.py +129 -0
  25. scaler/client/object_reference.py +25 -0
  26. scaler/client/serializer/__init__.py +0 -0
  27. scaler/client/serializer/default.py +16 -0
  28. scaler/client/serializer/mixins.py +38 -0
  29. scaler/cluster/__init__.py +0 -0
  30. scaler/cluster/cluster.py +115 -0
  31. scaler/cluster/combo.py +148 -0
  32. scaler/cluster/object_storage_server.py +45 -0
  33. scaler/cluster/scheduler.py +83 -0
  34. scaler/config/__init__.py +0 -0
  35. scaler/config/defaults.py +87 -0
  36. scaler/config/loader.py +95 -0
  37. scaler/config/mixins.py +15 -0
  38. scaler/config/section/__init__.py +0 -0
  39. scaler/config/section/cluster.py +56 -0
  40. scaler/config/section/native_worker_adapter.py +44 -0
  41. scaler/config/section/object_storage_server.py +7 -0
  42. scaler/config/section/scheduler.py +53 -0
  43. scaler/config/section/symphony_worker_adapter.py +47 -0
  44. scaler/config/section/top.py +13 -0
  45. scaler/config/section/webui.py +16 -0
  46. scaler/config/types/__init__.py +0 -0
  47. scaler/config/types/object_storage_server.py +45 -0
  48. scaler/config/types/worker.py +57 -0
  49. scaler/config/types/zmq.py +79 -0
  50. scaler/entry_points/__init__.py +0 -0
  51. scaler/entry_points/cluster.py +133 -0
  52. scaler/entry_points/object_storage_server.py +41 -0
  53. scaler/entry_points/scheduler.py +135 -0
  54. scaler/entry_points/top.py +286 -0
  55. scaler/entry_points/webui.py +26 -0
  56. scaler/entry_points/worker_adapter_native.py +137 -0
  57. scaler/entry_points/worker_adapter_symphony.py +102 -0
  58. scaler/io/__init__.py +0 -0
  59. scaler/io/async_binder.py +85 -0
  60. scaler/io/async_connector.py +95 -0
  61. scaler/io/async_object_storage_connector.py +185 -0
  62. scaler/io/mixins.py +154 -0
  63. scaler/io/sync_connector.py +68 -0
  64. scaler/io/sync_object_storage_connector.py +185 -0
  65. scaler/io/sync_subscriber.py +83 -0
  66. scaler/io/utility.py +31 -0
  67. scaler/io/ymq/CMakeLists.txt +98 -0
  68. scaler/io/ymq/__init__.py +0 -0
  69. scaler/io/ymq/_ymq.pyi +96 -0
  70. scaler/io/ymq/_ymq.so +0 -0
  71. scaler/io/ymq/bytes.h +114 -0
  72. scaler/io/ymq/common.h +29 -0
  73. scaler/io/ymq/configuration.h +60 -0
  74. scaler/io/ymq/epoll_context.cpp +185 -0
  75. scaler/io/ymq/epoll_context.h +85 -0
  76. scaler/io/ymq/error.h +132 -0
  77. scaler/io/ymq/event_loop.h +55 -0
  78. scaler/io/ymq/event_loop_thread.cpp +64 -0
  79. scaler/io/ymq/event_loop_thread.h +46 -0
  80. scaler/io/ymq/event_manager.h +81 -0
  81. scaler/io/ymq/file_descriptor.h +203 -0
  82. scaler/io/ymq/interruptive_concurrent_queue.h +169 -0
  83. scaler/io/ymq/io_context.cpp +98 -0
  84. scaler/io/ymq/io_context.h +44 -0
  85. scaler/io/ymq/io_socket.cpp +299 -0
  86. scaler/io/ymq/io_socket.h +121 -0
  87. scaler/io/ymq/iocp_context.cpp +102 -0
  88. scaler/io/ymq/iocp_context.h +83 -0
  89. scaler/io/ymq/logging.h +163 -0
  90. scaler/io/ymq/message.h +15 -0
  91. scaler/io/ymq/message_connection.h +16 -0
  92. scaler/io/ymq/message_connection_tcp.cpp +672 -0
  93. scaler/io/ymq/message_connection_tcp.h +96 -0
  94. scaler/io/ymq/network_utils.h +179 -0
  95. scaler/io/ymq/pymod_ymq/bytes.h +113 -0
  96. scaler/io/ymq/pymod_ymq/exception.h +124 -0
  97. scaler/io/ymq/pymod_ymq/gil.h +15 -0
  98. scaler/io/ymq/pymod_ymq/io_context.h +166 -0
  99. scaler/io/ymq/pymod_ymq/io_socket.h +285 -0
  100. scaler/io/ymq/pymod_ymq/message.h +99 -0
  101. scaler/io/ymq/pymod_ymq/python.h +153 -0
  102. scaler/io/ymq/pymod_ymq/ymq.cpp +23 -0
  103. scaler/io/ymq/pymod_ymq/ymq.h +357 -0
  104. scaler/io/ymq/readme.md +114 -0
  105. scaler/io/ymq/simple_interface.cpp +80 -0
  106. scaler/io/ymq/simple_interface.h +24 -0
  107. scaler/io/ymq/tcp_client.cpp +367 -0
  108. scaler/io/ymq/tcp_client.h +75 -0
  109. scaler/io/ymq/tcp_operations.h +41 -0
  110. scaler/io/ymq/tcp_server.cpp +410 -0
  111. scaler/io/ymq/tcp_server.h +79 -0
  112. scaler/io/ymq/third_party/concurrentqueue.h +3747 -0
  113. scaler/io/ymq/timed_queue.h +272 -0
  114. scaler/io/ymq/timestamp.h +102 -0
  115. scaler/io/ymq/typedefs.h +20 -0
  116. scaler/io/ymq/utils.h +34 -0
  117. scaler/io/ymq/ymq.py +130 -0
  118. scaler/object_storage/CMakeLists.txt +50 -0
  119. scaler/object_storage/__init__.py +0 -0
  120. scaler/object_storage/constants.h +11 -0
  121. scaler/object_storage/defs.h +14 -0
  122. scaler/object_storage/io_helper.cpp +44 -0
  123. scaler/object_storage/io_helper.h +9 -0
  124. scaler/object_storage/message.cpp +56 -0
  125. scaler/object_storage/message.h +130 -0
  126. scaler/object_storage/object_manager.cpp +126 -0
  127. scaler/object_storage/object_manager.h +52 -0
  128. scaler/object_storage/object_storage_server.cpp +359 -0
  129. scaler/object_storage/object_storage_server.h +126 -0
  130. scaler/object_storage/object_storage_server.so +0 -0
  131. scaler/object_storage/pymod_object_storage_server.cpp +104 -0
  132. scaler/protocol/__init__.py +0 -0
  133. scaler/protocol/capnp/__init__.py +0 -0
  134. scaler/protocol/capnp/_python.py +6 -0
  135. scaler/protocol/capnp/common.capnp +63 -0
  136. scaler/protocol/capnp/message.capnp +216 -0
  137. scaler/protocol/capnp/object_storage.capnp +52 -0
  138. scaler/protocol/capnp/status.capnp +73 -0
  139. scaler/protocol/introduction.md +105 -0
  140. scaler/protocol/python/__init__.py +0 -0
  141. scaler/protocol/python/common.py +135 -0
  142. scaler/protocol/python/message.py +726 -0
  143. scaler/protocol/python/mixins.py +13 -0
  144. scaler/protocol/python/object_storage.py +118 -0
  145. scaler/protocol/python/status.py +279 -0
  146. scaler/protocol/worker.md +228 -0
  147. scaler/scheduler/__init__.py +0 -0
  148. scaler/scheduler/allocate_policy/__init__.py +0 -0
  149. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  150. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  151. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  152. scaler/scheduler/allocate_policy/mixins.py +55 -0
  153. scaler/scheduler/controllers/__init__.py +0 -0
  154. scaler/scheduler/controllers/balance_controller.py +65 -0
  155. scaler/scheduler/controllers/client_controller.py +131 -0
  156. scaler/scheduler/controllers/config_controller.py +31 -0
  157. scaler/scheduler/controllers/graph_controller.py +424 -0
  158. scaler/scheduler/controllers/information_controller.py +81 -0
  159. scaler/scheduler/controllers/mixins.py +201 -0
  160. scaler/scheduler/controllers/object_controller.py +147 -0
  161. scaler/scheduler/controllers/scaling_controller.py +86 -0
  162. scaler/scheduler/controllers/task_controller.py +373 -0
  163. scaler/scheduler/controllers/worker_controller.py +168 -0
  164. scaler/scheduler/object_usage/__init__.py +0 -0
  165. scaler/scheduler/object_usage/object_tracker.py +131 -0
  166. scaler/scheduler/scheduler.py +253 -0
  167. scaler/scheduler/task/__init__.py +0 -0
  168. scaler/scheduler/task/task_state_machine.py +92 -0
  169. scaler/scheduler/task/task_state_manager.py +61 -0
  170. scaler/ui/__init__.py +0 -0
  171. scaler/ui/constants.py +9 -0
  172. scaler/ui/live_display.py +118 -0
  173. scaler/ui/memory_window.py +146 -0
  174. scaler/ui/setting_page.py +47 -0
  175. scaler/ui/task_graph.py +370 -0
  176. scaler/ui/task_log.py +83 -0
  177. scaler/ui/utility.py +35 -0
  178. scaler/ui/webui.py +125 -0
  179. scaler/ui/worker_processors.py +85 -0
  180. scaler/utility/__init__.py +0 -0
  181. scaler/utility/debug.py +19 -0
  182. scaler/utility/event_list.py +63 -0
  183. scaler/utility/event_loop.py +58 -0
  184. scaler/utility/exceptions.py +42 -0
  185. scaler/utility/formatter.py +44 -0
  186. scaler/utility/graph/__init__.py +0 -0
  187. scaler/utility/graph/optimization.py +27 -0
  188. scaler/utility/graph/topological_sorter.py +11 -0
  189. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  190. scaler/utility/identifiers.py +105 -0
  191. scaler/utility/logging/__init__.py +0 -0
  192. scaler/utility/logging/decorators.py +25 -0
  193. scaler/utility/logging/scoped_logger.py +33 -0
  194. scaler/utility/logging/utility.py +183 -0
  195. scaler/utility/many_to_many_dict.py +123 -0
  196. scaler/utility/metadata/__init__.py +0 -0
  197. scaler/utility/metadata/profile_result.py +31 -0
  198. scaler/utility/metadata/task_flags.py +30 -0
  199. scaler/utility/mixins.py +13 -0
  200. scaler/utility/network_util.py +7 -0
  201. scaler/utility/one_to_many_dict.py +72 -0
  202. scaler/utility/queues/__init__.py +0 -0
  203. scaler/utility/queues/async_indexed_queue.py +37 -0
  204. scaler/utility/queues/async_priority_queue.py +70 -0
  205. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  206. scaler/utility/queues/indexed_queue.py +114 -0
  207. scaler/utility/serialization.py +9 -0
  208. scaler/version.txt +1 -0
  209. scaler/worker/__init__.py +0 -0
  210. scaler/worker/agent/__init__.py +0 -0
  211. scaler/worker/agent/heartbeat_manager.py +107 -0
  212. scaler/worker/agent/mixins.py +137 -0
  213. scaler/worker/agent/processor/__init__.py +0 -0
  214. scaler/worker/agent/processor/object_cache.py +107 -0
  215. scaler/worker/agent/processor/processor.py +279 -0
  216. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  217. scaler/worker/agent/processor_holder.py +145 -0
  218. scaler/worker/agent/processor_manager.py +365 -0
  219. scaler/worker/agent/profiling_manager.py +109 -0
  220. scaler/worker/agent/task_manager.py +150 -0
  221. scaler/worker/agent/timeout_manager.py +19 -0
  222. scaler/worker/preload.py +84 -0
  223. scaler/worker/worker.py +264 -0
  224. scaler/worker_adapter/__init__.py +0 -0
  225. scaler/worker_adapter/native.py +154 -0
  226. scaler/worker_adapter/symphony/__init__.py +0 -0
  227. scaler/worker_adapter/symphony/callback.py +45 -0
  228. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  229. scaler/worker_adapter/symphony/message.py +24 -0
  230. scaler/worker_adapter/symphony/task_manager.py +288 -0
  231. scaler/worker_adapter/symphony/worker.py +205 -0
  232. scaler/worker_adapter/symphony/worker_adapter.py +142 -0
@@ -0,0 +1,148 @@
1
+ import logging
2
+ import socket
3
+ from typing import Dict, Optional, Tuple
4
+
5
+ from scaler.cluster.cluster import Cluster
6
+ from scaler.cluster.object_storage_server import ObjectStorageServerProcess
7
+ from scaler.cluster.scheduler import SchedulerProcess
8
+ from scaler.config.defaults import (
9
+ DEFAULT_CLIENT_TIMEOUT_SECONDS,
10
+ DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
11
+ DEFAULT_HARD_PROCESSOR_SUSPEND,
12
+ DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
13
+ DEFAULT_IO_THREADS,
14
+ DEFAULT_LOAD_BALANCE_SECONDS,
15
+ DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
16
+ DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
17
+ DEFAULT_OBJECT_RETENTION_SECONDS,
18
+ DEFAULT_PER_WORKER_QUEUE_SIZE,
19
+ DEFAULT_TASK_TIMEOUT_SECONDS,
20
+ DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
21
+ DEFAULT_WORKER_DEATH_TIMEOUT,
22
+ DEFAULT_WORKER_TIMEOUT_SECONDS,
23
+ DEFAULT_LOGGING_LEVEL,
24
+ DEFAULT_LOGGING_PATHS,
25
+ )
26
+ from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
27
+ from scaler.utility.network_util import get_available_tcp_port
28
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
29
+ from scaler.config.types.zmq import ZMQConfig
30
+
31
+
32
class SchedulerClusterCombo:
    """Convenience wrapper that runs a full local Scaler deployment.

    Starts, in order: an object storage server (forked), then a worker cluster
    and a scheduler (separate processes), all wired to talk to each other.
    When ``address``/``storage_address`` are not given, they default to
    127.0.0.1 with a free TCP port.
    """

    def __init__(
        self,
        n_workers: int,
        address: Optional[str] = None,
        storage_address: Optional[str] = None,
        monitor_address: Optional[str] = None,
        per_worker_capabilities: Optional[Dict[str, int]] = None,
        worker_io_threads: int = DEFAULT_IO_THREADS,
        scheduler_io_threads: int = DEFAULT_IO_THREADS,
        max_number_of_tasks_waiting: int = DEFAULT_MAX_NUMBER_OF_TASKS_WAITING,
        heartbeat_interval_seconds: int = DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
        client_timeout_seconds: int = DEFAULT_CLIENT_TIMEOUT_SECONDS,
        worker_timeout_seconds: int = DEFAULT_WORKER_TIMEOUT_SECONDS,
        object_retention_seconds: int = DEFAULT_OBJECT_RETENTION_SECONDS,
        task_timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
        death_timeout_seconds: int = DEFAULT_WORKER_DEATH_TIMEOUT,
        load_balance_seconds: int = DEFAULT_LOAD_BALANCE_SECONDS,
        load_balance_trigger_times: int = DEFAULT_LOAD_BALANCE_TRIGGER_TIMES,
        garbage_collect_interval_seconds: int = DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS,
        trim_memory_threshold_bytes: int = DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES,
        per_worker_task_queue_size: int = DEFAULT_PER_WORKER_QUEUE_SIZE,
        hard_processor_suspend: bool = DEFAULT_HARD_PROCESSOR_SUSPEND,
        protected: bool = True,
        allocate_policy: AllocatePolicy = AllocatePolicy.even,
        event_loop: str = "builtin",
        logging_paths: Tuple[str, ...] = DEFAULT_LOGGING_PATHS,
        logging_level: str = DEFAULT_LOGGING_LEVEL,
        logging_config_file: Optional[str] = None,
    ):
        if address is None:
            self._address = ZMQConfig.from_string(f"tcp://127.0.0.1:{get_available_tcp_port()}")
        else:
            self._address = ZMQConfig.from_string(address)

        if storage_address is None:
            # reuse the scheduler's host so everything binds to the same interface
            self._storage_address = ObjectStorageConfig(self._address.host, get_available_tcp_port())
        else:
            self._storage_address = ObjectStorageConfig.from_string(storage_address)

        if monitor_address is None:
            self._monitor_address = None
        else:
            self._monitor_address = ZMQConfig.from_string(monitor_address)

        self._object_storage = ObjectStorageServerProcess(
            storage_address=self._storage_address,
            logging_paths=logging_paths,
            logging_level=logging_level,
            logging_config_file=logging_config_file,
        )
        self._object_storage.start()
        self._object_storage.wait_until_ready()  # object storage should be ready before starting the cluster

        self._cluster = Cluster(
            address=self._address,
            storage_address=self._storage_address,
            preload=None,
            worker_io_threads=worker_io_threads,
            worker_names=[f"{socket.gethostname().split('.')[0]}_{i}" for i in range(n_workers)],
            per_worker_capabilities=per_worker_capabilities or {},
            per_worker_task_queue_size=per_worker_task_queue_size,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            task_timeout_seconds=task_timeout_seconds,
            death_timeout_seconds=death_timeout_seconds,
            garbage_collect_interval_seconds=garbage_collect_interval_seconds,
            trim_memory_threshold_bytes=trim_memory_threshold_bytes,
            hard_processor_suspend=hard_processor_suspend,
            event_loop=event_loop,
            logging_paths=logging_paths,
            logging_config_file=logging_config_file,
            logging_level=logging_level,
        )

        self._scheduler = SchedulerProcess(
            address=self._address,
            storage_address=self._storage_address,
            monitor_address=self._monitor_address,
            io_threads=scheduler_io_threads,
            max_number_of_tasks_waiting=max_number_of_tasks_waiting,
            client_timeout_seconds=client_timeout_seconds,
            adapter_webhook_url=None,
            worker_timeout_seconds=worker_timeout_seconds,
            object_retention_seconds=object_retention_seconds,
            load_balance_seconds=load_balance_seconds,
            load_balance_trigger_times=load_balance_trigger_times,
            protected=protected,
            allocate_policy=allocate_policy,
            event_loop=event_loop,
            logging_paths=logging_paths,
            logging_config_file=logging_config_file,
            logging_level=logging_level,
        )

        self._cluster.start()
        self._scheduler.start()
        logging.info(f"{self.__get_prefix()} started")

    def __del__(self):
        # __del__ can run on a partially constructed instance (e.g. __init__
        # raised while parsing addresses or starting a subprocess), so
        # shutdown() must tolerate missing attributes.
        self.shutdown()

    def shutdown(self):
        """Terminate and join all child processes; safe on a partially built instance."""
        logging.info(f"{self.__get_prefix()} shutdown")

        cluster = getattr(self, "_cluster", None)
        scheduler = getattr(self, "_scheduler", None)
        object_storage = getattr(self, "_object_storage", None)

        if cluster is not None:
            cluster.terminate()
        if scheduler is not None:
            scheduler.terminate()
        if cluster is not None:
            cluster.join()
        if scheduler is not None:
            scheduler.join()

        # object storage should terminate after the cluster and scheduler.
        if object_storage is not None:
            object_storage.terminate()
            object_storage.join()

    def get_address(self) -> str:
        """Return the scheduler address in its string (connectable) form."""
        return self._address.to_address()

    def __get_prefix(self):
        # log prefix, e.g. "SchedulerClusterCombo:"
        return f"{self.__class__.__name__}:"
@@ -0,0 +1,45 @@
1
+ import logging
2
+ import multiprocessing
3
+ from typing import Optional, Tuple
4
+
5
+ from scaler.object_storage.object_storage_server import ObjectStorageServer
6
+ from scaler.utility.logging.utility import get_logger_info, setup_logger
7
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
8
+
9
+
10
class ObjectStorageServerProcess(multiprocessing.get_context("fork").Process):  # type: ignore[misc]
    """Runs the object storage server in a forked child process.

    The ``ObjectStorageServer`` handle is created in the parent before the
    fork, so ``wait_until_ready()`` can be called from the parent process
    while ``run()`` drives the server in the child.
    """

    def __init__(
        self,
        storage_address: ObjectStorageConfig,
        logging_paths: Tuple[str, ...],
        logging_level: str,
        logging_config_file: Optional[str],
    ):
        # Initialize through super() so the fork-context Process base class is
        # used, rather than reaching for the default-context
        # multiprocessing.Process explicitly (which may differ from "fork").
        super().__init__(name="ObjectStorageServer")

        self._logging_paths = logging_paths
        self._logging_level = logging_level
        self._logging_config_file = logging_config_file

        self._storage_address = storage_address

        self._server = ObjectStorageServer()

    def wait_until_ready(self) -> None:
        """Blocks until the object storage server is available to serve requests."""
        self._server.wait_until_ready()

    def run(self) -> None:
        # Runs in the child process: configure logging, then hand over to the
        # native server loop with the logging settings of the root logger.
        setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
        logging.info(f"ObjectStorageServer: start and listen to {self._storage_address.to_string()}")

        log_format_str, log_level_str, logging_paths = get_logger_info(logging.getLogger())

        self._server.run(
            self._storage_address.host,
            self._storage_address.port,
            self._storage_address.identity,
            log_level_str,
            log_format_str,
            logging_paths,
        )
@@ -0,0 +1,83 @@
1
+ import asyncio
2
+ import multiprocessing
3
+ import signal
4
+ from asyncio import AbstractEventLoop, Task
5
+ from typing import Any, Optional, Tuple
6
+
7
+ from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
8
+ from scaler.config.section.scheduler import SchedulerConfig
9
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
10
+ from scaler.config.types.zmq import ZMQConfig
11
+ from scaler.scheduler.scheduler import Scheduler, scheduler_main
12
+ from scaler.utility.event_loop import register_event_loop
13
+ from scaler.utility.logging.utility import setup_logger
14
+
15
+
16
class SchedulerProcess(multiprocessing.get_context("spawn").Process):  # type: ignore[misc]
    """Runs the Scaler scheduler in a spawned child process.

    The full ``SchedulerConfig`` is assembled in the parent's ``__init__`` and
    carried into the child (spawn start method), where ``run()`` sets up
    logging, installs SIGINT/SIGTERM handlers on the event loop, and drives
    ``scheduler_main`` to completion.
    """

    def __init__(
        self,
        address: ZMQConfig,
        storage_address: Optional[ObjectStorageConfig],
        monitor_address: Optional[ZMQConfig],
        adapter_webhook_url: Optional[str],
        io_threads: int,
        max_number_of_tasks_waiting: int,
        client_timeout_seconds: int,
        worker_timeout_seconds: int,
        object_retention_seconds: int,
        load_balance_seconds: int,
        load_balance_trigger_times: int,
        protected: bool,
        allocate_policy: AllocatePolicy,
        event_loop: str,
        logging_paths: Tuple[str, ...],
        logging_config_file: Optional[str],
        logging_level: str,
    ):
        multiprocessing.Process.__init__(self, name="Scheduler")
        # Bundle every scheduler option into one config object up front; this
        # is what actually travels to the child process.
        self._scheduler_config = SchedulerConfig(
            event_loop=event_loop,
            scheduler_address=address,
            object_storage_address=storage_address,
            monitor_address=monitor_address,
            adapter_webhook_url=adapter_webhook_url,
            io_threads=io_threads,
            max_number_of_tasks_waiting=max_number_of_tasks_waiting,
            client_timeout_seconds=client_timeout_seconds,
            worker_timeout_seconds=worker_timeout_seconds,
            object_retention_seconds=object_retention_seconds,
            load_balance_seconds=load_balance_seconds,
            load_balance_trigger_times=load_balance_trigger_times,
            protected=protected,
            allocate_policy=allocate_policy,
        )

        # Logging settings are kept separate: they are applied inside run(),
        # i.e. in the child process, not here.
        self._logging_paths = logging_paths
        self._logging_config_file = logging_config_file
        self._logging_level = logging_level

        # Populated only in the child process, inside run().
        self._scheduler: Optional[Scheduler] = None
        self._loop: Optional[AbstractEventLoop] = None
        self._task: Optional[Task[Any]] = None

    def run(self) -> None:
        # the scheduler runs in its own dedicated process
        setup_logger(self._logging_paths, self._logging_config_file, self._logging_level)
        # select the event-loop implementation (e.g. builtin vs uvloop) before
        # the loop is obtained below
        register_event_loop(self._scheduler_config.event_loop)

        self._loop = asyncio.get_event_loop()
        # install signal handlers before starting the main task so an early
        # SIGINT/SIGTERM still cancels it
        SchedulerProcess.__register_signal(self._loop)

        self._task = self._loop.create_task(scheduler_main(self._scheduler_config))

        # blocks until scheduler_main finishes or is cancelled by a signal
        self._loop.run_until_complete(self._task)

    @staticmethod
    def __register_signal(loop: AbstractEventLoop):
        # route both interactive interrupt and terminate requests through the
        # same cancellation path
        loop.add_signal_handler(signal.SIGINT, SchedulerProcess.__handle_signal)
        loop.add_signal_handler(signal.SIGTERM, SchedulerProcess.__handle_signal)

    @staticmethod
    def __handle_signal():
        # cancel every task on the running loop; run_until_complete() then
        # returns as the main task unwinds
        for task in asyncio.all_tasks():
            task.cancel()
File without changes
@@ -0,0 +1,87 @@
1
import os

# ==============
# SYSTEM OPTIONS

# object clean up time interval
CLEANUP_INTERVAL_SECONDS = 1

# status report interval, used by poke or scaled monitor
STATUS_REPORT_INTERVAL_SECONDS = 1

# number of seconds for profiling
PROFILING_INTERVAL_SECONDS = 1

# cap'n proto only allow Data/Text/Blob size to be as big as 500MB
CAPNP_DATA_SIZE_LIMIT = 2**29 - 1

# message size limitation, max can be 2**64
CAPNP_MESSAGE_SIZE_LIMIT = 2**64 - 1

# ==========================
# SCHEDULER SPECIFIC OPTIONS

# number of threads for zmq socket to handle
DEFAULT_IO_THREADS = 1

# if all workers are full and busy working, this option determine how many additional tasks scheduler can receive and
# queued, if additional number of tasks received exceeded this number, scheduler will reject tasks
DEFAULT_MAX_NUMBER_OF_TASKS_WAITING = -1

# if didn't receive heartbeat for following seconds, then scheduler will treat worker as dead and reschedule unfinished
# tasks for this worker
DEFAULT_WORKER_TIMEOUT_SECONDS = 60

# if didn't receive heartbeat for following seconds, then scheduler will treat client as dead and cancel remaining
# tasks for this client
DEFAULT_CLIENT_TIMEOUT_SECONDS = 60

# number of seconds for load balance, if value is -1 means disable load balance
DEFAULT_LOAD_BALANCE_SECONDS = 1

# when load balance advice happened repeatedly and always be the same, we issue load balance request when exact repeated
# times happened
DEFAULT_LOAD_BALANCE_TRIGGER_TIMES = 2

# number of tasks can be queued to each worker on scheduler side
DEFAULT_PER_WORKER_QUEUE_SIZE = 1000

# =======================
# WORKER SPECIFIC OPTIONS

# number of workers, each worker uses 1 process; os.cpu_count() can return None
# (count undeterminable), and on a single-core machine "cpu_count - 1" would be
# zero workers, so clamp the default to at least 1
DEFAULT_NUMBER_OF_WORKER = max((os.cpu_count() or 2) - 1, 1)

# number of seconds that worker agent send heartbeat to scheduler
DEFAULT_HEARTBEAT_INTERVAL_SECONDS = 2

# number of seconds the object cache kept in worker's memory
DEFAULT_OBJECT_RETENTION_SECONDS = 60

# number of seconds worker doing garbage collection
DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS = 30

# number of bytes threshold for worker process that trigger deep garbage collection
DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES = 1024 * 1024 * 1024

# default task timeout seconds, 0 means never timeout
DEFAULT_TASK_TIMEOUT_SECONDS = 0

# number of seconds that worker agent wait for processor to finish before killing it
DEFAULT_PROCESSOR_KILL_DELAY_SECONDS = 3

# number of seconds without scheduler contact before worker shuts down
DEFAULT_WORKER_DEATH_TIMEOUT = 5 * 60

# if true, suspended worker's processors will be actively suspended with a SIGTSTP signal, otherwise a synchronization
# event will be used.
DEFAULT_HARD_PROCESSOR_SUSPEND = False

# =======================
# LOGGING SPECIFIC OPTIONS

# default logging level
DEFAULT_LOGGING_LEVEL = "INFO"

# default logging paths
DEFAULT_LOGGING_PATHS = ("/dev/stdout",)
@@ -0,0 +1,95 @@
1
+ import argparse
2
+ import dataclasses
3
+ import enum
4
+ from typing import Any, cast, Dict, Optional, Type, TypeVar, Union, get_args, get_origin
5
+
6
+ try:
7
+ import tomllib
8
+ except ImportError:
9
+ import tomli as tomllib
10
+
11
+ from scaler.config.mixins import ConfigType
12
+
13
+ T = TypeVar("T")
14
+
15
+
16
def load_config(
    config_class: Type[T], config_path: Optional[str], args: argparse.Namespace, section_name: Optional[str] = None
) -> T:
    """
    Loads configuration for a given dataclass from a TOML file and overrides it with command-line arguments.

    Precedence: command-line arguments (non-None entries of ``args``) win over
    the TOML file.  Values are coerced per-field: strings become ``ConfigType``
    instances via ``from_string``, strings become enum members by name, and
    TOML lists become tuples for tuple-typed fields.

    :param config_class: the target dataclass type to instantiate
    :param config_path: optional path to a TOML file; None skips file loading
    :param args: parsed CLI namespace; only non-None attributes are applied
    :param section_name: optional top-level TOML table to read instead of the whole file
    :raises TypeError: if config_class is not a dataclass
    :raises ValueError: on TOML parse errors, unknown keys, invalid enum names,
        or missing required fields
    :raises FileNotFoundError: if config_path is given but does not exist
    """
    if not dataclasses.is_dataclass(config_class):
        raise TypeError(f"{config_class.__name__} is not a dataclass and cannot be used with this config loader.")

    config_from_file = {}
    if config_path:
        try:
            with open(config_path, "rb") as f:
                try:
                    full_config = tomllib.load(f)
                except tomllib.TOMLDecodeError as e:
                    raise ValueError(f"Error parsing TOML file at {config_path}: {e}") from e

                if section_name:
                    # missing section is not an error: fall back to empty config
                    config_from_file = full_config.get(section_name, {})
                else:
                    config_from_file = full_config
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found at: {config_path}")

    # CLI args with value None are treated as "not provided" so they don't
    # shadow file values; args win over the file for everything else.
    config_from_args = {k: v for k, v in vars(args).items() if v is not None}
    merged_config_data = {**config_from_file, **config_from_args}

    # reject keys that the dataclass doesn't declare ("config" is the CLI flag
    # naming the TOML file itself, so it is always tolerated)
    valid_keys = {f.name for f in dataclasses.fields(config_class)}
    unknown_keys = set(merged_config_data.keys()) - valid_keys - {"config"}
    if unknown_keys:
        raise ValueError(f"Unknown configuration key(s) for {config_class.__name__}: {', '.join(unknown_keys)}")

    final_kwargs: Dict[str, Any] = {}
    for field in dataclasses.fields(config_class):
        if field.name in merged_config_data:
            raw_value = merged_config_data[field.name]
            field_type = field.type
            # Unwrap Optional[X] (Union[X, None]) to the underlying X so the
            # coercion checks below see the concrete type.
            # NOTE(review): this treats any Union as Optional and keeps only
            # the first non-None member — multi-member unions lose precision.
            is_optional = get_origin(field_type) is Union
            if is_optional:
                possible_types = [t for t in get_args(field_type) if t is not type(None)]
                actual_type = possible_types[0] if possible_types else field_type
            else:
                actual_type = field_type

            # string -> ConfigType via the type's own parser
            if (
                isinstance(raw_value, str)
                and isinstance(actual_type, type)
                and issubclass(actual_type, ConfigType)
                and not isinstance(raw_value, actual_type)
            ):
                final_kwargs[field.name] = actual_type.from_string(raw_value)
            # string -> enum member, looked up by member name (not value)
            elif isinstance(raw_value, str) and isinstance(actual_type, type) and issubclass(actual_type, enum.Enum):
                try:
                    final_kwargs[field.name] = actual_type[raw_value]
                except KeyError as e:
                    raise ValueError(f"'{raw_value}' is not a valid member for {actual_type.__name__}") from e
            # TOML arrays arrive as lists; tuple-typed fields expect tuples
            elif isinstance(raw_value, list) and get_origin(field.type) is tuple:
                final_kwargs[field.name] = tuple(raw_value)
            else:
                final_kwargs[field.name] = raw_value

    try:
        return cast(T, config_class(**final_kwargs))
    except TypeError as e:
        # Translate the dataclass's TypeError into a friendlier message when
        # the cause is missing required fields; otherwise re-raise as-is.
        missing_fields = [
            f.name
            for f in dataclasses.fields(config_class)
            if f.init
            and f.name not in final_kwargs
            and f.default is dataclasses.MISSING
            and f.default_factory is dataclasses.MISSING
        ]
        if missing_fields:
            raise ValueError(
                f"Missing required configuration arguments: {', '.join(missing_fields)}. "
                f"Please provide them via command line or a TOML config file."
            ) from e
        else:
            raise e
@@ -0,0 +1,15 @@
1
+ import abc
2
+ from typing_extensions import Self
3
+
4
+
5
+ class ConfigType(metaclass=abc.ABCMeta):
6
+ """A base class for composite config values that can be parsed and serialized from/to a string."""
7
+
8
+ @classmethod
9
+ @abc.abstractmethod
10
+ def from_string(cls, value: str) -> Self:
11
+ pass
12
+
13
+ @abc.abstractmethod
14
+ def __str__(self) -> str:
15
+ pass
File without changes
@@ -0,0 +1,56 @@
1
+ import dataclasses
2
+ from typing import Optional, Tuple
3
+
4
+ from scaler.config import defaults
5
+ from scaler.utility.logging.utility import LoggingLevel
6
+
7
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
8
+ from scaler.config.types.worker import WorkerCapabilities, WorkerNames
9
+ from scaler.config.types.zmq import ZMQConfig
10
+
11
+
12
@dataclasses.dataclass
class ClusterConfig:
    """Configuration for a worker cluster; validated in ``__post_init__``.

    All timeout/interval values are in seconds; ``task_timeout_seconds`` may
    be 0 (meaning no timeout), the other intervals must be strictly positive.
    """

    scheduler_address: ZMQConfig
    storage_address: Optional[ObjectStorageConfig] = None
    preload: Optional[str] = None
    worker_io_threads: int = defaults.DEFAULT_IO_THREADS
    worker_names: WorkerNames = dataclasses.field(default_factory=lambda: WorkerNames.from_string(""))
    num_of_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
    per_worker_capabilities: WorkerCapabilities = dataclasses.field(
        default_factory=lambda: WorkerCapabilities.from_string("")
    )
    per_worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
    heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
    task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
    death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
    garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
    trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
    hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
    event_loop: str = "builtin"
    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
    logging_config_file: Optional[str] = None
    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL

    def __post_init__(self):
        if self.worker_io_threads <= 0:
            raise ValueError("worker_io_threads must be a positive integer.")
        # Explicit worker names, when given, must be one per worker.
        if self.worker_names.names and len(self.worker_names.names) != self.num_of_workers:
            # Implicit string concatenation here: the original used a backslash
            # continuation inside the f-string, which embedded a run of
            # indentation spaces into the error message.
            raise ValueError(
                f"The number of worker_names ({len(self.worker_names.names)}) "
                f"must match num_of_workers ({self.num_of_workers})."
            )
        if self.per_worker_task_queue_size <= 0:
            raise ValueError("per_worker_task_queue_size must be positive.")
        # task_timeout_seconds may be 0 ("never time out"); the rest must be > 0.
        if (
            self.heartbeat_interval_seconds <= 0
            or self.task_timeout_seconds < 0
            or self.death_timeout_seconds <= 0
            or self.garbage_collect_interval_seconds <= 0
        ):
            raise ValueError("All interval/timeout second values must be positive.")
        if self.trim_memory_threshold_bytes < 0:
            raise ValueError("trim_memory_threshold_bytes cannot be negative.")
        valid_levels = {level.name for level in LoggingLevel}
        if self.logging_level.upper() not in valid_levels:
            raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")
@@ -0,0 +1,44 @@
1
+ import dataclasses
2
+ from typing import Optional, Tuple
3
+
4
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
5
+ from scaler.config.types.worker import WorkerCapabilities
6
+ from scaler.config.types.zmq import ZMQConfig
7
+
8
+ from scaler.config import defaults
9
+
10
+
11
@dataclasses.dataclass
class NativeWorkerAdapterConfig:
    """Configuration for the native worker adapter.

    Connects workers to a scheduler over ZMQ and exposes a small web
    endpoint on ``adapter_web_host:adapter_web_port``.
    """

    # Required: ZMQ address of the scheduler to connect to.
    scheduler_address: ZMQConfig
    storage_address: Optional[ObjectStorageConfig] = None
    adapter_web_host: str = "localhost"
    adapter_web_port: int = 8080
    per_worker_capabilities: WorkerCapabilities = dataclasses.field(
        default_factory=lambda: WorkerCapabilities.from_string("")
    )
    io_threads: int = defaults.DEFAULT_IO_THREADS
    worker_task_queue_size: int = defaults.DEFAULT_PER_WORKER_QUEUE_SIZE
    max_workers: int = defaults.DEFAULT_NUMBER_OF_WORKER
    heartbeat_interval_seconds: int = defaults.DEFAULT_HEARTBEAT_INTERVAL_SECONDS
    task_timeout_seconds: int = defaults.DEFAULT_TASK_TIMEOUT_SECONDS
    death_timeout_seconds: int = defaults.DEFAULT_WORKER_DEATH_TIMEOUT
    garbage_collect_interval_seconds: int = defaults.DEFAULT_GARBAGE_COLLECT_INTERVAL_SECONDS
    trim_memory_threshold_bytes: int = defaults.DEFAULT_TRIM_MEMORY_THRESHOLD_BYTES
    hard_processor_suspend: bool = defaults.DEFAULT_HARD_PROCESSOR_SUSPEND
    event_loop: str = "builtin"
    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL
    logging_config_file: Optional[str] = None

    def __post_init__(self):
        """Validate configuration values.

        Raises:
            TypeError: if adapter_web_host/adapter_web_port have the wrong type.
            ValueError: if adapter_web_port is outside 1-65535 or any numeric
                setting is out of range.
        """
        if not isinstance(self.adapter_web_host, str):
            raise TypeError(f"adapter_web_host should be string, given {self.adapter_web_host}")
        if not isinstance(self.adapter_web_port, int):
            raise TypeError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
        # BUG FIX: the original error message promised a 1-65535 range check
        # but the code only verified the type; enforce the valid TCP port range.
        if not 1 <= self.adapter_web_port <= 65535:
            raise ValueError(f"adapter_web_port must be between 1 and 65535, but got {self.adapter_web_port}")
        if self.io_threads <= 0:
            raise ValueError("io_threads must be a positive integer.")
        if self.worker_task_queue_size <= 0:
            raise ValueError("worker_task_queue_size must be positive.")
        # NOTE(review): task_timeout_seconds uses `< 0`, so 0 is accepted
        # (presumably meaning "no timeout" — confirm against callers).
        if self.heartbeat_interval_seconds <= 0 or self.task_timeout_seconds < 0 or self.death_timeout_seconds <= 0:
            raise ValueError("All interval/timeout second values must be positive.")
@@ -0,0 +1,7 @@
1
+ import dataclasses
2
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
3
+
4
+
5
@dataclasses.dataclass
class ObjectStorageServerConfig:
    """Configuration for a standalone object storage server.

    Holds only the address the server binds to; no validation is
    performed beyond what the dataclass machinery provides.
    """

    # Required: address/port settings for the object storage server.
    object_storage_address: ObjectStorageConfig
@@ -0,0 +1,53 @@
1
+ import dataclasses
2
+ from typing import Optional, Tuple
3
+ from urllib.parse import urlparse
4
+
5
+ from scaler.config import defaults
6
+ from scaler.scheduler.allocate_policy.allocate_policy import AllocatePolicy
7
+ from scaler.utility.logging.utility import LoggingLevel
8
+
9
+ from scaler.config.types.object_storage_server import ObjectStorageConfig
10
+ from scaler.config.types.zmq import ZMQConfig
11
+
12
+
13
@dataclasses.dataclass
class SchedulerConfig:
    """Configuration for the scheduler process."""

    # Required: ZMQ address the scheduler listens on.
    # (The original wrote `= dataclasses.field()`; a no-argument field() is
    # identical to having no default at all, so the redundant call is removed.)
    scheduler_address: ZMQConfig
    object_storage_address: Optional[ObjectStorageConfig] = None
    monitor_address: Optional[ZMQConfig] = None
    adapter_webhook_url: Optional[str] = None
    protected: bool = True
    allocate_policy: AllocatePolicy = AllocatePolicy.even
    event_loop: str = "builtin"
    io_threads: int = defaults.DEFAULT_IO_THREADS
    # -1 means unlimited; see __post_init__.
    max_number_of_tasks_waiting: int = defaults.DEFAULT_MAX_NUMBER_OF_TASKS_WAITING
    client_timeout_seconds: int = defaults.DEFAULT_CLIENT_TIMEOUT_SECONDS
    worker_timeout_seconds: int = defaults.DEFAULT_WORKER_TIMEOUT_SECONDS
    object_retention_seconds: int = defaults.DEFAULT_OBJECT_RETENTION_SECONDS
    load_balance_seconds: int = defaults.DEFAULT_LOAD_BALANCE_SECONDS
    load_balance_trigger_times: int = defaults.DEFAULT_LOAD_BALANCE_TRIGGER_TIMES
    logging_paths: Tuple[str, ...] = defaults.DEFAULT_LOGGING_PATHS
    logging_config_file: Optional[str] = None
    logging_level: str = defaults.DEFAULT_LOGGING_LEVEL

    def __post_init__(self):
        """Validate configuration values.

        Raises:
            ValueError: if any numeric setting is out of range, if
                adapter_webhook_url is not a well-formed URL, or if
                logging_level is not a recognized level name.
        """
        if self.io_threads <= 0:
            raise ValueError("io_threads must be a positive integer.")
        if self.max_number_of_tasks_waiting < -1:
            raise ValueError("max_number_of_tasks_waiting must be -1 (for unlimited) or non-negative.")
        if (
            self.client_timeout_seconds <= 0
            or self.worker_timeout_seconds <= 0
            or self.object_retention_seconds <= 0
            or self.load_balance_seconds <= 0
        ):
            raise ValueError("All timeout/retention/balance second values must be positive.")
        if self.load_balance_trigger_times <= 0:
            raise ValueError("load_balance_trigger_times must be a positive integer.")
        if self.adapter_webhook_url:
            # A usable URL must have both a scheme (e.g. http) and a network
            # location (host[:port]).
            parsed_url = urlparse(self.adapter_webhook_url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                raise ValueError(f"adapter_webhook_url '{self.adapter_webhook_url}' is not a valid URL.")
        # Compare case-insensitively against the names declared on LoggingLevel.
        valid_levels = {level.name for level in LoggingLevel}
        if self.logging_level.upper() not in valid_levels:
            raise ValueError(f"logging_level must be one of {valid_levels}, but got '{self.logging_level}'")