opengris-scaler 1.12.28__cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opengris-scaler might be problematic. Click here for more details.

Files changed (187) hide show
  1. opengris_scaler-1.12.28.dist-info/METADATA +728 -0
  2. opengris_scaler-1.12.28.dist-info/RECORD +187 -0
  3. opengris_scaler-1.12.28.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.28.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.28.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.28.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.28.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +210 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +658 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +115 -0
  32. scaler/cluster/combo.py +150 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/defaults.py +94 -0
  37. scaler/config/loader.py +96 -0
  38. scaler/config/mixins.py +20 -0
  39. scaler/config/section/__init__.py +0 -0
  40. scaler/config/section/cluster.py +55 -0
  41. scaler/config/section/ecs_worker_adapter.py +85 -0
  42. scaler/config/section/native_worker_adapter.py +43 -0
  43. scaler/config/section/object_storage_server.py +8 -0
  44. scaler/config/section/scheduler.py +54 -0
  45. scaler/config/section/symphony_worker_adapter.py +47 -0
  46. scaler/config/section/top.py +13 -0
  47. scaler/config/section/webui.py +21 -0
  48. scaler/config/types/__init__.py +0 -0
  49. scaler/config/types/network_backend.py +12 -0
  50. scaler/config/types/object_storage_server.py +45 -0
  51. scaler/config/types/worker.py +62 -0
  52. scaler/config/types/zmq.py +83 -0
  53. scaler/entry_points/__init__.py +0 -0
  54. scaler/entry_points/cluster.py +133 -0
  55. scaler/entry_points/object_storage_server.py +45 -0
  56. scaler/entry_points/scheduler.py +144 -0
  57. scaler/entry_points/top.py +286 -0
  58. scaler/entry_points/webui.py +48 -0
  59. scaler/entry_points/worker_adapter_ecs.py +191 -0
  60. scaler/entry_points/worker_adapter_native.py +137 -0
  61. scaler/entry_points/worker_adapter_symphony.py +98 -0
  62. scaler/io/__init__.py +0 -0
  63. scaler/io/async_binder.py +89 -0
  64. scaler/io/async_connector.py +95 -0
  65. scaler/io/async_object_storage_connector.py +225 -0
  66. scaler/io/mixins.py +154 -0
  67. scaler/io/sync_connector.py +68 -0
  68. scaler/io/sync_object_storage_connector.py +247 -0
  69. scaler/io/sync_subscriber.py +83 -0
  70. scaler/io/utility.py +80 -0
  71. scaler/io/ymq/__init__.py +0 -0
  72. scaler/io/ymq/_ymq.pyi +95 -0
  73. scaler/io/ymq/ymq.py +138 -0
  74. scaler/io/ymq_async_object_storage_connector.py +184 -0
  75. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  76. scaler/object_storage/__init__.py +0 -0
  77. scaler/protocol/__init__.py +0 -0
  78. scaler/protocol/capnp/__init__.py +0 -0
  79. scaler/protocol/capnp/_python.py +6 -0
  80. scaler/protocol/capnp/common.capnp +68 -0
  81. scaler/protocol/capnp/message.capnp +218 -0
  82. scaler/protocol/capnp/object_storage.capnp +57 -0
  83. scaler/protocol/capnp/status.capnp +73 -0
  84. scaler/protocol/introduction.md +105 -0
  85. scaler/protocol/python/__init__.py +0 -0
  86. scaler/protocol/python/common.py +140 -0
  87. scaler/protocol/python/message.py +751 -0
  88. scaler/protocol/python/mixins.py +13 -0
  89. scaler/protocol/python/object_storage.py +118 -0
  90. scaler/protocol/python/status.py +279 -0
  91. scaler/protocol/worker.md +228 -0
  92. scaler/scheduler/__init__.py +0 -0
  93. scaler/scheduler/allocate_policy/__init__.py +0 -0
  94. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  95. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  96. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  97. scaler/scheduler/allocate_policy/mixins.py +55 -0
  98. scaler/scheduler/controllers/__init__.py +0 -0
  99. scaler/scheduler/controllers/balance_controller.py +65 -0
  100. scaler/scheduler/controllers/client_controller.py +131 -0
  101. scaler/scheduler/controllers/config_controller.py +31 -0
  102. scaler/scheduler/controllers/graph_controller.py +424 -0
  103. scaler/scheduler/controllers/information_controller.py +81 -0
  104. scaler/scheduler/controllers/mixins.py +194 -0
  105. scaler/scheduler/controllers/object_controller.py +147 -0
  106. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  107. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  108. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  109. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  110. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  111. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  112. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  113. scaler/scheduler/controllers/task_controller.py +376 -0
  114. scaler/scheduler/controllers/worker_controller.py +169 -0
  115. scaler/scheduler/object_usage/__init__.py +0 -0
  116. scaler/scheduler/object_usage/object_tracker.py +131 -0
  117. scaler/scheduler/scheduler.py +251 -0
  118. scaler/scheduler/task/__init__.py +0 -0
  119. scaler/scheduler/task/task_state_machine.py +92 -0
  120. scaler/scheduler/task/task_state_manager.py +61 -0
  121. scaler/ui/__init__.py +0 -0
  122. scaler/ui/constants.py +9 -0
  123. scaler/ui/live_display.py +147 -0
  124. scaler/ui/memory_window.py +146 -0
  125. scaler/ui/setting_page.py +40 -0
  126. scaler/ui/task_graph.py +832 -0
  127. scaler/ui/task_log.py +107 -0
  128. scaler/ui/utility.py +66 -0
  129. scaler/ui/webui.py +147 -0
  130. scaler/ui/worker_processors.py +104 -0
  131. scaler/utility/__init__.py +0 -0
  132. scaler/utility/debug.py +19 -0
  133. scaler/utility/event_list.py +63 -0
  134. scaler/utility/event_loop.py +58 -0
  135. scaler/utility/exceptions.py +42 -0
  136. scaler/utility/formatter.py +44 -0
  137. scaler/utility/graph/__init__.py +0 -0
  138. scaler/utility/graph/optimization.py +27 -0
  139. scaler/utility/graph/topological_sorter.py +11 -0
  140. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  141. scaler/utility/identifiers.py +107 -0
  142. scaler/utility/logging/__init__.py +0 -0
  143. scaler/utility/logging/decorators.py +25 -0
  144. scaler/utility/logging/scoped_logger.py +33 -0
  145. scaler/utility/logging/utility.py +183 -0
  146. scaler/utility/many_to_many_dict.py +123 -0
  147. scaler/utility/metadata/__init__.py +0 -0
  148. scaler/utility/metadata/profile_result.py +31 -0
  149. scaler/utility/metadata/task_flags.py +30 -0
  150. scaler/utility/mixins.py +13 -0
  151. scaler/utility/network_util.py +7 -0
  152. scaler/utility/one_to_many_dict.py +72 -0
  153. scaler/utility/queues/__init__.py +0 -0
  154. scaler/utility/queues/async_indexed_queue.py +37 -0
  155. scaler/utility/queues/async_priority_queue.py +70 -0
  156. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  157. scaler/utility/queues/indexed_queue.py +114 -0
  158. scaler/utility/serialization.py +9 -0
  159. scaler/version.txt +1 -0
  160. scaler/worker/__init__.py +0 -0
  161. scaler/worker/agent/__init__.py +0 -0
  162. scaler/worker/agent/heartbeat_manager.py +107 -0
  163. scaler/worker/agent/mixins.py +137 -0
  164. scaler/worker/agent/processor/__init__.py +0 -0
  165. scaler/worker/agent/processor/object_cache.py +107 -0
  166. scaler/worker/agent/processor/processor.py +285 -0
  167. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  168. scaler/worker/agent/processor_holder.py +147 -0
  169. scaler/worker/agent/processor_manager.py +369 -0
  170. scaler/worker/agent/profiling_manager.py +109 -0
  171. scaler/worker/agent/task_manager.py +150 -0
  172. scaler/worker/agent/timeout_manager.py +19 -0
  173. scaler/worker/preload.py +84 -0
  174. scaler/worker/worker.py +265 -0
  175. scaler/worker_adapter/__init__.py +0 -0
  176. scaler/worker_adapter/common.py +26 -0
  177. scaler/worker_adapter/ecs.py +269 -0
  178. scaler/worker_adapter/native.py +155 -0
  179. scaler/worker_adapter/symphony/__init__.py +0 -0
  180. scaler/worker_adapter/symphony/callback.py +45 -0
  181. scaler/worker_adapter/symphony/heartbeat_manager.py +79 -0
  182. scaler/worker_adapter/symphony/message.py +24 -0
  183. scaler/worker_adapter/symphony/task_manager.py +289 -0
  184. scaler/worker_adapter/symphony/worker.py +204 -0
  185. scaler/worker_adapter/symphony/worker_adapter.py +139 -0
  186. src/scaler/io/ymq/_ymq.so +0 -0
  187. src/scaler/object_storage/object_storage_server.so +0 -0
@@ -0,0 +1,194 @@
1
+ import abc
2
+ from typing import Any, Optional, Set
3
+
4
+ from scaler.protocol.python.common import ObjectMetadata
5
+ from scaler.protocol.python.message import (
6
+ ClientDisconnect,
7
+ ClientHeartbeat,
8
+ DisconnectRequest,
9
+ GraphTask,
10
+ InformationRequest,
11
+ ObjectInstruction,
12
+ Task,
13
+ TaskCancel,
14
+ TaskCancelConfirm,
15
+ TaskResult,
16
+ WorkerHeartbeat,
17
+ )
18
+ from scaler.utility.identifiers import ClientID, ObjectID, TaskID, WorkerID
19
+ from scaler.utility.mixins import Reporter
20
+
21
+
22
class ConfigController(metaclass=abc.ABCMeta):
    """Interface for reading and mutating scheduler configuration values addressed by path."""

    @abc.abstractmethod
    def get_config(self, path: str) -> Any:
        """Return the configuration value stored under *path*."""
        raise NotImplementedError

    @abc.abstractmethod
    def update_config(self, path: str, value: Any):
        """Store *value* under *path*, replacing any previous value."""
        raise NotImplementedError
30
+
31
+
32
class ObjectController(Reporter):
    """Interface for scheduler-side object bookkeeping: creation, deletion and lookup."""

    @abc.abstractmethod
    async def on_object_instruction(self, source: bytes, request: ObjectInstruction):
        """Handle an ObjectInstruction message received from *source*."""
        raise NotImplementedError

    @abc.abstractmethod
    def on_add_object(
        self,
        client_id: ClientID,
        object_id: ObjectID,
        object_type: ObjectMetadata.ObjectContentType,
        object_name: bytes,
    ):
        """Record a newly created object owned by *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    def on_del_objects(self, client_id: ClientID, object_ids: Set[ObjectID]):
        """Drop *client_id*'s references to every object in *object_ids*."""
        raise NotImplementedError

    @abc.abstractmethod
    def clean_client(self, client_id: ClientID):
        """Remove all object references held by *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    def has_object(self, object_id: ObjectID) -> bool:
        """Return True when *object_id* is currently tracked."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_object_name(self, object_id: ObjectID) -> bytes:
        """Return the recorded name of *object_id*."""
        raise NotImplementedError
62
+
63
+
64
class ClientController(Reporter):
    """Interface tracking connected clients and the tasks each of them owns."""

    @abc.abstractmethod
    def get_client_task_ids(self, client_id: ClientID) -> Set[TaskID]:
        """Return the IDs of every task currently owned by *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    def has_client_id(self, client_id: ClientID) -> bool:
        """Return True when *client_id* is a known, connected client."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_client_id(self, task_id: TaskID) -> Optional[ClientID]:
        """Return the client that owns *task_id*, or None if unknown."""
        raise NotImplementedError

    @abc.abstractmethod
    def on_task_begin(self, client_id: ClientID, task_id: TaskID):
        """Associate *task_id* with *client_id* when the task starts."""
        raise NotImplementedError

    @abc.abstractmethod
    def on_task_finish(self, task_id: TaskID) -> bytes:
        """Detach *task_id* from its owner and return that owner's identity."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_heartbeat(self, client_id: ClientID, info: ClientHeartbeat):
        """Process a heartbeat message from *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_client_disconnect(self, client_id: ClientID, request: ClientDisconnect):
        """Process a disconnect request from *client_id*."""
        raise NotImplementedError
92
+
93
+
94
class GraphTaskController(Reporter):
    """Interface for managing graph tasks and the lifecycle of their sub-tasks."""

    @abc.abstractmethod
    async def on_graph_task(self, client_id: ClientID, graph_task: GraphTask):
        """Accept a new graph task submitted by *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_graph_task_cancel(self, graph_task_cancel: TaskCancel):
        """Handle a cancel request targeting a whole graph task."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_graph_sub_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
        """Handle a cancel confirmation for one sub-task of a graph."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_graph_sub_task_result(self, result: TaskResult) -> bool:
        """Consume a sub-task result; the bool indicates whether it was handled here."""
        raise NotImplementedError

    @abc.abstractmethod
    def is_graph_subtask(self, task_id: TaskID) -> bool:
        """Return True when *task_id* belongs to a graph task."""
        raise NotImplementedError
114
+
115
+
116
class TaskController(Reporter):
    """Interface for the scheduler's task lifecycle: submission, cancellation, results."""

    @abc.abstractmethod
    async def on_task_new(self, task: Task):
        """Accept a newly submitted task."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_cancel(self, client_id: ClientID, task_cancel: TaskCancel):
        """Handle a cancel request issued by *client_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_balance_cancel(self, task_id: TaskID):
        """Cancel *task_id* as part of load balancing (not client initiated)."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_cancel_confirm(self, task_cancel_confirm: TaskCancelConfirm):
        """Handle a worker's confirmation of a task cancellation."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_result(self, result: TaskResult):
        """Handle a task result reported by a worker."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_worker_connect(self, worker_id: WorkerID):
        """React to *worker_id* joining the pool."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_worker_disconnect(self, task_id: TaskID, worker_id: WorkerID):
        """React to *worker_id* leaving while *task_id* was assigned to it."""
        raise NotImplementedError
144
+
145
+
146
class WorkerController(Reporter):
    """Interface tracking connected workers and assigning tasks to them."""

    @abc.abstractmethod
    def acquire_worker(self, task: Task) -> Optional[WorkerID]:
        """Atomically reserve a worker for *task*, or return None if none is available.

        Must remain synchronous (not async): an await point between picking the
        worker and sending the task would open a gap where the worker is acquired
        but the task is not yet recorded in the worker's state.
        """

        # TODO: this function should return things that expose 3 kinds of information:
        # TODO: 1. worker id as bytes if have capacity and able to assign to worker id
        # TODO: 2. capacity is full, and unable to add new task
        # TODO: 3. capacity is not full, but all the workers are busy right now, so tasks will be queued
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_cancel(self, task_cancel: TaskCancel) -> bytes:
        """Forward a task cancel to the owning worker; returns that worker's identity."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_task_done(self, task_id: TaskID):
        """Mark *task_id* as finished on its assigned worker."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_heartbeat(self, worker_id: WorkerID, info: WorkerHeartbeat):
        """Process a heartbeat message from *worker_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_client_shutdown(self, client_id: ClientID):
        """React to *client_id* requesting a cluster shutdown."""
        raise NotImplementedError

    @abc.abstractmethod
    async def on_disconnect(self, worker_id: WorkerID, request: DisconnectRequest):
        """Process a disconnect request from *worker_id*."""
        raise NotImplementedError

    @abc.abstractmethod
    def has_available_worker(self) -> bool:
        """Return True when at least one worker can accept a task."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
        """Return the worker that *task_id* is assigned to."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_worker_ids(self) -> Set[WorkerID]:
        """Return the IDs of all known workers."""
        raise NotImplementedError
189
+
190
+
191
class InformationController(metaclass=abc.ABCMeta):
    """Interface for answering scheduler information requests."""

    @abc.abstractmethod
    async def on_request(self, request: InformationRequest):
        """Handle a single InformationRequest message."""
        raise NotImplementedError
@@ -0,0 +1,147 @@
1
+ import dataclasses
2
+ import logging
3
+ from asyncio import Queue
4
+ from typing import Optional, Set
5
+
6
+ from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
7
+ from scaler.protocol.python.common import ObjectMetadata
8
+ from scaler.protocol.python.message import ObjectInstruction
9
+ from scaler.protocol.python.status import ObjectManagerStatus
10
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
11
+ from scaler.scheduler.controllers.mixins import ClientController, ObjectController, WorkerController
12
+ from scaler.scheduler.object_usage.object_tracker import ObjectTracker, ObjectUsage
13
+ from scaler.utility.identifiers import ClientID, ObjectID
14
+ from scaler.utility.mixins import Looper, Reporter
15
+
16
+
17
@dataclasses.dataclass
class _ObjectCreation(ObjectUsage):
    # One tracked object registration: its identity, the client that created it,
    # its content type and human-readable name. Stored in the ObjectTracker.
    object_id: ObjectID
    object_creator: ClientID
    object_type: ObjectMetadata.ObjectContentType
    object_name: bytes

    def get_object_key(self) -> ObjectID:
        # The ObjectTracker keys records by this value (the object's ID).
        return self.object_id
26
+
27
+
28
class VanillaObjectController(ObjectController, Looper, Reporter):
    """Tracks which clients/workers hold blocks on each object and, once an
    object's last block is released, batches Delete instructions to all workers
    and removes the object from storage."""

    def __init__(self, config_controller: VanillaConfigController):
        self._config_controller = config_controller

        # Tracker fires __finished_object_storage when an object loses its last block.
        self._object_tracker: ObjectTracker[ClientID, ObjectID, _ObjectCreation] = ObjectTracker(
            "object_usage", self.__finished_object_storage
        )

        # Objects whose deletion still has to be broadcast by the routine loop.
        self._queue_deleted_object_ids: Queue[ObjectID] = Queue()

        # Wired up later via register(); None until then.
        self._binder: Optional[AsyncBinder] = None
        self._binder_monitor: Optional[AsyncConnector] = None
        self._connector_storage: Optional[AsyncObjectStorageConnector] = None

        self._client_manager: Optional[ClientController] = None
        self._worker_manager: Optional[WorkerController] = None

    def register(
        self,
        binder: AsyncBinder,
        binder_monitor: AsyncConnector,
        connector_storage: AsyncObjectStorageConnector,
        client_manager: ClientController,
        worker_manager: WorkerController,
    ):
        """Inject the connectors and sibling controllers this controller depends on."""
        self._binder = binder
        self._binder_monitor = binder_monitor
        self._connector_storage = connector_storage
        self._client_manager = client_manager
        self._worker_manager = worker_manager

    async def on_object_instruction(self, source: bytes, instruction: ObjectInstruction):
        """Dispatch a Create/Delete ObjectInstruction; log anything else as an error."""
        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Create:
            self.__on_object_create(source, instruction)
            return

        if instruction.instruction_type == ObjectInstruction.ObjectInstructionType.Delete:
            self.on_del_objects(instruction.object_user, set(instruction.object_metadata.object_ids))
            return

        logging.error(f"received unknown object instruction_type={instruction.instruction_type} from {source=}")

    def on_add_object(
        self,
        client_id: ClientID,
        object_id: ObjectID,
        object_type: ObjectMetadata.ObjectContentType,
        object_name: bytes,
    ):
        """Track a new object and give its creating client the initial block on it."""
        creation = _ObjectCreation(object_id, client_id, object_type, object_name)
        logging.debug(
            f"add object cache "
            f"object_name={creation.object_name!r}, "
            f"object_type={creation.object_type}, "
            f"object_id={creation.object_id!r}"
        )

        self._object_tracker.add_object(creation)
        # The creator holds the first block; deletion happens when all blocks are gone.
        self._object_tracker.add_blocks_for_one_object(creation.get_object_key(), {creation.object_creator})

    def on_del_objects(self, client_id: ClientID, object_ids: Set[ObjectID]):
        """Release *client_id*'s block on each of *object_ids*."""
        for object_id in object_ids:
            self._object_tracker.remove_one_block_for_objects({object_id}, client_id)

    def clean_client(self, client_id: ClientID):
        """Release every block held by *client_id* (e.g. on disconnect)."""
        self._object_tracker.remove_blocks({client_id})

    async def routine(self):
        # Looper entry point: flush pending deletions each cycle.
        await self.__routine_send_objects_deletions()

    def has_object(self, object_id: ObjectID) -> bool:
        """Return True when *object_id* is currently tracked."""
        return self._object_tracker.has_object(object_id)

    def get_object_name(self, object_id: ObjectID) -> bytes:
        """Return the tracked name of *object_id*, or b"<Unknown>" if untracked."""
        if not self.has_object(object_id):
            return b"<Unknown>"

        return self._object_tracker.get_object(object_id).object_name

    def get_status(self) -> ObjectManagerStatus:
        """Report the number of tracked objects."""
        return ObjectManagerStatus.new_msg(self._object_tracker.object_count())

    async def __routine_send_objects_deletions(self):
        # Block until at least one deletion is pending, then drain the queue so a
        # whole batch goes out in a single Delete instruction per worker.
        deleted_object_ids = [await self._queue_deleted_object_ids.get()]
        self._queue_deleted_object_ids.task_done()

        while not self._queue_deleted_object_ids.empty():
            deleted_object_ids.append(self._queue_deleted_object_ids.get_nowait())
            self._queue_deleted_object_ids.task_done()

        for worker in self._worker_manager.get_worker_ids():
            await self._binder.send(
                worker,
                ObjectInstruction.new_msg(
                    ObjectInstruction.ObjectInstructionType.Delete,
                    # TODO: ideally object_user should be set to the owning client ID, but then we cannot batch these
                    # Delete instructions.
                    None,
                    ObjectMetadata.new_msg(tuple(deleted_object_ids)),
                ),
            )

        # Also delete the payloads from the object storage server.
        for object_id in deleted_object_ids:
            await self._connector_storage.delete_object(object_id)

    def __on_object_create(self, source: bytes, instruction: ObjectInstruction):
        # Only accept creations from clients the client manager knows about.
        if not self._client_manager.has_client_id(instruction.object_user):
            logging.error(f"received object creation from {source!r} for unknown client {instruction.object_user!r}")
            return

        for object_id, object_type, object_name in zip(
            instruction.object_metadata.object_ids,
            instruction.object_metadata.object_types,
            instruction.object_metadata.object_names,
        ):
            self.on_add_object(instruction.object_user, object_id, object_type, object_name)

    def __finished_object_storage(self, creation: _ObjectCreation):
        # ObjectTracker callback: last block released -> queue the object for deletion.
        logging.debug(f"del object cache object_name={creation.object_name!r}, object_id={creation.object_id!r}")
        self._queue_deleted_object_ids.put_nowait(creation.object_id)
@@ -0,0 +1,145 @@
1
+ import logging
2
+ import math
3
+ from typing import Dict, List, Literal, Optional
4
+
5
+ import aiohttp
6
+ from aiohttp import web
7
+
8
+ from scaler.protocol.python.message import InformationSnapshot
9
+ from scaler.protocol.python.status import ScalingManagerStatus
10
+ from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
11
+ from scaler.scheduler.controllers.scaling_policies.types import WorkerGroupID
12
+ from scaler.utility.identifiers import WorkerID
13
+
14
+ WorkerAdapterLabel = Literal["primary", "secondary"]
15
+
16
+
17
class FixedElasticScalingController(ScalingController):
    """Scaling policy with a size-capped "primary" adapter and an elastic "secondary" one.

    New worker groups are started on the primary adapter until its group limit is
    reached, then overflow onto the secondary adapter. When scaling down,
    secondary-adapter groups are retired first.
    """

    def __init__(self, primary_adapter_webhook_url: str, secondary_adapter_webhook_url: str):
        self._primary_webhook = primary_adapter_webhook_url
        self._secondary_webhook = secondary_adapter_webhook_url
        self._primary_group_limit = 1
        self._lower_task_ratio = 1
        self._upper_task_ratio = 10

        # worker group id -> member worker ids
        self._worker_groups: Dict[WorkerGroupID, List[WorkerID]] = {}
        # worker group id -> which adapter ("primary"/"secondary") started it
        self._worker_group_source: Dict[WorkerGroupID, WorkerAdapterLabel] = {}

    def get_status(self):
        """Report the currently known worker groups."""
        return ScalingManagerStatus.new_msg(worker_groups=self._worker_groups)

    async def on_snapshot(self, information_snapshot: InformationSnapshot):
        """Scale up when tasks-per-worker exceeds the upper ratio; scale down below the lower one."""
        if not information_snapshot.workers:
            # No workers at all: start a group only if work is pending.
            if information_snapshot.tasks:
                await self._start_worker_group()
            return

        task_ratio = len(information_snapshot.tasks) / len(information_snapshot.workers)
        if task_ratio > self._upper_task_ratio:
            await self._start_worker_group()
        elif task_ratio < self._lower_task_ratio:
            worker_group_task_counts = {
                worker_group_id: sum(
                    information_snapshot.workers[worker_id].queued_tasks
                    for worker_id in worker_ids
                    if worker_id in information_snapshot.workers
                )
                for worker_group_id, worker_ids in self._worker_groups.items()
            }
            if not worker_group_task_counts:
                logging.warning("No worker groups available to shut down.")
                return

            # Prefer shutting down secondary adapter groups first
            secondary_groups = [
                (group_id, task_count)
                for group_id, task_count in worker_group_task_counts.items()
                if self._worker_group_source.get(group_id) == "secondary"
            ]
            if secondary_groups:
                worker_group_id = min(secondary_groups, key=lambda item: item[1])[0]
            else:
                worker_group_id = min(worker_group_task_counts, key=worker_group_task_counts.get)

            await self._shutdown_worker_group(worker_group_id)

    async def _start_worker_group(self):
        """Start a worker group on the first adapter that still has capacity."""
        # Select adapter: use primary if under limit, otherwise use secondary
        adapter: Optional[WorkerAdapterLabel] = None
        webhook = None

        if self._primary_webhook:
            primary_count = sum(source == "primary" for source in self._worker_group_source.values())
            if self._primary_group_limit is None or primary_count < self._primary_group_limit:
                adapter = "primary"
                webhook = self._primary_webhook
            else:
                logging.debug(f"Primary adapter worker group limit reached ({self._primary_group_limit}).")

        if adapter is None and self._secondary_webhook:
            adapter = "secondary"
            webhook = self._secondary_webhook

        if adapter is None:
            logging.warning("All worker adapters have reached their capacity; cannot start a new worker group.")
            return

        response, status = await self._make_request(webhook, {"action": "get_worker_adapter_info"})
        if status != web.HTTPOk.status_code:
            logging.warning("Failed to get worker adapter info.")
            return

        # FIX: compare the *selected* adapter's own group count against the limit that
        # this adapter advertised. The previous code always counted "secondary" groups
        # (even when starting on primary), and its generator variable shadowed `adapter`.
        adapter_group_count = sum(source == adapter for source in self._worker_group_source.values())
        if adapter_group_count >= response.get("max_worker_groups", math.inf):
            return

        response, status = await self._make_request(webhook, {"action": "start_worker_group"})
        if status == web.HTTPTooManyRequests.status_code:
            logging.warning(f"{adapter.capitalize()} adapter capacity exceeded, cannot start new worker group.")
            return
        if status == web.HTTPInternalServerError.status_code:
            logging.error(
                f"{adapter.capitalize()} adapter failed to start worker group:"
                f" {response.get('error', 'Unknown error')}"
            )
            return

        worker_group_id = response["worker_group_id"].encode()
        self._worker_groups[worker_group_id] = [WorkerID(worker_id.encode()) for worker_id in response["worker_ids"]]
        self._worker_group_source[worker_group_id] = adapter
        logging.info(f"Started worker group {worker_group_id.decode()} on {adapter} adapter.")

    async def _shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """Ask the adapter that started *worker_group_id* to shut it down, then forget it."""
        if worker_group_id not in self._worker_groups:
            logging.error(f"Worker group with ID {worker_group_id.decode()} does not exist.")
            return

        adapter = self._worker_group_source.get(worker_group_id)
        if adapter is None:
            logging.error(f"Worker group {worker_group_id.decode()} has no associated adapter recorded.")
            return

        webhook = self._primary_webhook if adapter == "primary" else self._secondary_webhook
        response, status = await self._make_request(
            webhook, {"action": "shutdown_worker_group", "worker_group_id": worker_group_id.decode()}
        )
        if status == web.HTTPNotFound.status_code:
            logging.error(f"Worker group with ID {worker_group_id.decode()} not found in {adapter} adapter.")
            return
        if status == web.HTTPInternalServerError.status_code:
            logging.error(
                f"{adapter.capitalize()} adapter failed to shutdown worker group:"
                f" {response.get('error', 'Unknown error')}"
            )
            return

        self._worker_groups.pop(worker_group_id)
        self._worker_group_source.pop(worker_group_id)
        logging.info(f"Shutdown worker group {worker_group_id.decode()} on {adapter} adapter.")

    @staticmethod
    async def _make_request(webhook_url: str, payload):
        """POST *payload* as JSON to *webhook_url*; return (decoded JSON body, HTTP status)."""
        async with aiohttp.ClientSession() as session:
            async with session.post(webhook_url, json=payload) as response:
                return await response.json(), response.status
@@ -0,0 +1,10 @@
1
+ import abc
2
+
3
+ from scaler.protocol.python.message import InformationSnapshot
4
+ from scaler.utility.mixins import Reporter
5
+
6
+
7
class ScalingController(Reporter):
    """Interface for policies that react to scheduler snapshots by scaling workers."""

    @abc.abstractmethod
    async def on_snapshot(self, snapshot: InformationSnapshot):
        """Consume one InformationSnapshot and apply the scaling decision."""
        raise NotImplementedError
@@ -0,0 +1,14 @@
1
+ from scaler.protocol.python.message import InformationSnapshot
2
+ from scaler.protocol.python.status import ScalingManagerStatus
3
+ from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
4
+
5
+
6
class NullScalingController(ScalingController):
    """No-op scaling policy: reports an empty worker-group set and ignores snapshots."""

    def __init__(self, *adapter_webhook_urls: str):
        # FIX: accept (and ignore) adapter webhook URLs so the factory in
        # scaling_policies.utility can unpack its URL tuple into every strategy's
        # constructor uniformly; previously a non-empty tuple raised TypeError.
        pass

    def get_status(self):
        """Return a status message with no worker groups."""
        return ScalingManagerStatus.new_msg(worker_groups={})

    async def on_snapshot(self, information_snapshot: InformationSnapshot):
        """Ignore the snapshot; this policy never scales."""
        pass
@@ -0,0 +1,9 @@
1
+ import enum
2
+
3
+ WorkerGroupID = bytes
4
+
5
+
6
class ScalingControllerStrategy(enum.Enum):
    # Configuration values selecting which ScalingController implementation the
    # scheduler instantiates (see scaling_policies/utility.py).
    NULL = "null"  # no-op: never scales
    VANILLA = "vanilla"  # single adapter, task-ratio-based scaling
    FIXED_ELASTIC = "fixed_elastic"  # capped primary adapter plus elastic secondary
@@ -0,0 +1,20 @@
1
+ from typing import Tuple
2
+
3
+ from scaler.scheduler.controllers.scaling_policies.fixed_elastic import FixedElasticScalingController
4
+ from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
5
+ from scaler.scheduler.controllers.scaling_policies.null import NullScalingController
6
+ from scaler.scheduler.controllers.scaling_policies.types import ScalingControllerStrategy
7
+ from scaler.scheduler.controllers.scaling_policies.vanilla import VanillaScalingController
8
+
9
+
10
def create_scaling_controller(
    scaling_controller_strategy: ScalingControllerStrategy, adapter_webhook_urls: Tuple[str, ...]
) -> ScalingController:
    """Instantiate the ScalingController matching *scaling_controller_strategy*.

    *adapter_webhook_urls* supplies the positional constructor arguments for the
    URL-based strategies (one URL for VANILLA, two for FIXED_ELASTIC); it is not
    used for NULL.

    Raises:
        ValueError: if the strategy is not recognized.
    """
    if scaling_controller_strategy == ScalingControllerStrategy.NULL:
        # FIX: NullScalingController's constructor takes no arguments; unpacking a
        # non-empty URL tuple here used to raise TypeError.
        return NullScalingController()
    elif scaling_controller_strategy == ScalingControllerStrategy.VANILLA:
        return VanillaScalingController(*adapter_webhook_urls)
    elif scaling_controller_strategy == ScalingControllerStrategy.FIXED_ELASTIC:
        return FixedElasticScalingController(*adapter_webhook_urls)

    raise ValueError(f"unsupported scaling controller strategy: {scaling_controller_strategy}")
@@ -0,0 +1,95 @@
1
+ import logging
2
+ import math
3
+ from typing import Dict, List
4
+
5
+ import aiohttp
6
+ from aiohttp import web
7
+
8
+ from scaler.protocol.python.message import InformationSnapshot
9
+ from scaler.protocol.python.status import ScalingManagerStatus
10
+ from scaler.scheduler.controllers.scaling_policies.mixins import ScalingController
11
+ from scaler.scheduler.controllers.scaling_policies.types import WorkerGroupID
12
+ from scaler.utility.identifiers import WorkerID
13
+
14
+
15
class VanillaScalingController(ScalingController):
    """Single-adapter scaling policy driven by the tasks-per-worker ratio."""

    def __init__(self, adapter_webhook_url: str):
        self._adapter_webhook_url = adapter_webhook_url
        self._lower_task_ratio = 1
        self._upper_task_ratio = 10

        # worker group id -> member worker ids
        self._worker_groups: Dict[WorkerGroupID, List[WorkerID]] = {}

    def get_status(self):
        """Report the currently known worker groups."""
        return ScalingManagerStatus.new_msg(worker_groups=self._worker_groups)

    async def on_snapshot(self, information_snapshot: InformationSnapshot):
        """Scale up above the upper task ratio; retire the idlest group below the lower one."""
        workers = information_snapshot.workers
        tasks = information_snapshot.tasks

        if not workers:
            # No capacity at all: spin up a group only when work is pending.
            if tasks:
                await self._start_worker_group()
            return

        load = len(tasks) / len(workers)
        if load > self._upper_task_ratio:
            await self._start_worker_group()
            return
        if load >= self._lower_task_ratio:
            return

        # Under-loaded: find the group with the fewest queued tasks and shut it down.
        queued_per_group = {}
        for group_id, member_ids in self._worker_groups.items():
            queued_per_group[group_id] = sum(
                workers[member_id].queued_tasks for member_id in member_ids if member_id in workers
            )

        if not queued_per_group:
            logging.warning(
                "No worker groups available to shut down. There might be statically provisioned workers."
            )
            return

        idlest_group = min(queued_per_group, key=queued_per_group.get)
        await self._shutdown_worker_group(idlest_group)

    async def _start_worker_group(self):
        """Ask the adapter webhook for a new worker group, honoring its advertised limit."""
        info, status = await self._make_request({"action": "get_worker_adapter_info"})
        if status != web.HTTPOk.status_code:
            logging.warning("Failed to get worker adapter info.")
            return

        if len(self._worker_groups) >= info.get("max_worker_groups", math.inf):
            return

        body, status = await self._make_request({"action": "start_worker_group"})
        if status == web.HTTPTooManyRequests.status_code:
            logging.warning("Capacity exceeded, cannot start new worker group.")
            return
        if status == web.HTTPInternalServerError.status_code:
            logging.error(f"Failed to start worker group: {body.get('error', 'Unknown error')}")
            return

        group_id = body["worker_group_id"].encode()
        self._worker_groups[group_id] = [WorkerID(raw_id.encode()) for raw_id in body["worker_ids"]]
        logging.info(f"Started worker group: {group_id.decode()}")

    async def _shutdown_worker_group(self, worker_group_id: WorkerGroupID):
        """Ask the adapter webhook to shut down *worker_group_id*, then forget it."""
        if worker_group_id not in self._worker_groups:
            logging.error(f"Worker group with ID {worker_group_id.decode()} does not exist.")
            return

        body, status = await self._make_request(
            {"action": "shutdown_worker_group", "worker_group_id": worker_group_id.decode()}
        )
        if status == web.HTTPNotFound.status_code:
            logging.error(f"Worker group with ID {worker_group_id.decode()} not found in adapter.")
            return
        if status == web.HTTPInternalServerError.status_code:
            logging.error(f"Failed to shutdown worker group: {body.get('error', 'Unknown error')}")
            return

        self._worker_groups.pop(worker_group_id)
        logging.info(f"Shutdown worker group: {worker_group_id.decode()}")

    async def _make_request(self, payload):
        """POST *payload* as JSON to the adapter webhook; return (decoded body, HTTP status)."""
        async with aiohttp.ClientSession() as session:
            async with session.post(self._adapter_webhook_url, json=payload) as response:
                return await response.json(), response.status