opengris-scaler 1.12.37 cp38-cp38-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. opengris_scaler-1.12.37.dist-info/METADATA +730 -0
  2. opengris_scaler-1.12.37.dist-info/RECORD +196 -0
  3. opengris_scaler-1.12.37.dist-info/WHEEL +5 -0
  4. opengris_scaler-1.12.37.dist-info/entry_points.txt +10 -0
  5. opengris_scaler-1.12.37.dist-info/licenses/LICENSE +201 -0
  6. opengris_scaler-1.12.37.dist-info/licenses/LICENSE.spdx +7 -0
  7. opengris_scaler-1.12.37.dist-info/licenses/NOTICE +8 -0
  8. opengris_scaler.libs/libcapnp-1-e88d5415.0.1.so +0 -0
  9. opengris_scaler.libs/libgcc_s-2298274a.so.1 +0 -0
  10. opengris_scaler.libs/libkj-1-9bebd8ac.0.1.so +0 -0
  11. opengris_scaler.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
  12. scaler/__init__.py +14 -0
  13. scaler/about.py +5 -0
  14. scaler/client/__init__.py +0 -0
  15. scaler/client/agent/__init__.py +0 -0
  16. scaler/client/agent/client_agent.py +218 -0
  17. scaler/client/agent/disconnect_manager.py +27 -0
  18. scaler/client/agent/future_manager.py +112 -0
  19. scaler/client/agent/heartbeat_manager.py +74 -0
  20. scaler/client/agent/mixins.py +89 -0
  21. scaler/client/agent/object_manager.py +98 -0
  22. scaler/client/agent/task_manager.py +64 -0
  23. scaler/client/client.py +672 -0
  24. scaler/client/future.py +252 -0
  25. scaler/client/object_buffer.py +129 -0
  26. scaler/client/object_reference.py +25 -0
  27. scaler/client/serializer/__init__.py +0 -0
  28. scaler/client/serializer/default.py +16 -0
  29. scaler/client/serializer/mixins.py +38 -0
  30. scaler/cluster/__init__.py +0 -0
  31. scaler/cluster/cluster.py +95 -0
  32. scaler/cluster/combo.py +157 -0
  33. scaler/cluster/object_storage_server.py +45 -0
  34. scaler/cluster/scheduler.py +86 -0
  35. scaler/config/__init__.py +0 -0
  36. scaler/config/common/__init__.py +0 -0
  37. scaler/config/common/logging.py +41 -0
  38. scaler/config/common/web.py +18 -0
  39. scaler/config/common/worker.py +65 -0
  40. scaler/config/common/worker_adapter.py +28 -0
  41. scaler/config/config_class.py +317 -0
  42. scaler/config/defaults.py +94 -0
  43. scaler/config/mixins.py +20 -0
  44. scaler/config/section/__init__.py +0 -0
  45. scaler/config/section/cluster.py +66 -0
  46. scaler/config/section/ecs_worker_adapter.py +78 -0
  47. scaler/config/section/native_worker_adapter.py +30 -0
  48. scaler/config/section/object_storage_server.py +13 -0
  49. scaler/config/section/scheduler.py +126 -0
  50. scaler/config/section/symphony_worker_adapter.py +35 -0
  51. scaler/config/section/top.py +16 -0
  52. scaler/config/section/webui.py +16 -0
  53. scaler/config/types/__init__.py +0 -0
  54. scaler/config/types/network_backend.py +12 -0
  55. scaler/config/types/object_storage_server.py +45 -0
  56. scaler/config/types/worker.py +67 -0
  57. scaler/config/types/zmq.py +83 -0
  58. scaler/entry_points/__init__.py +0 -0
  59. scaler/entry_points/cluster.py +10 -0
  60. scaler/entry_points/object_storage_server.py +26 -0
  61. scaler/entry_points/scheduler.py +51 -0
  62. scaler/entry_points/top.py +272 -0
  63. scaler/entry_points/webui.py +6 -0
  64. scaler/entry_points/worker_adapter_ecs.py +22 -0
  65. scaler/entry_points/worker_adapter_native.py +31 -0
  66. scaler/entry_points/worker_adapter_symphony.py +26 -0
  67. scaler/io/__init__.py +0 -0
  68. scaler/io/async_binder.py +89 -0
  69. scaler/io/async_connector.py +95 -0
  70. scaler/io/async_object_storage_connector.py +225 -0
  71. scaler/io/mixins.py +154 -0
  72. scaler/io/sync_connector.py +68 -0
  73. scaler/io/sync_object_storage_connector.py +249 -0
  74. scaler/io/sync_subscriber.py +83 -0
  75. scaler/io/utility.py +80 -0
  76. scaler/io/ymq/__init__.py +0 -0
  77. scaler/io/ymq/_ymq.pyi +95 -0
  78. scaler/io/ymq/_ymq.so +0 -0
  79. scaler/io/ymq/ymq.py +138 -0
  80. scaler/io/ymq_async_object_storage_connector.py +184 -0
  81. scaler/io/ymq_sync_object_storage_connector.py +184 -0
  82. scaler/object_storage/__init__.py +0 -0
  83. scaler/object_storage/object_storage_server.so +0 -0
  84. scaler/protocol/__init__.py +0 -0
  85. scaler/protocol/capnp/__init__.py +0 -0
  86. scaler/protocol/capnp/_python.py +6 -0
  87. scaler/protocol/capnp/common.capnp +68 -0
  88. scaler/protocol/capnp/message.capnp +218 -0
  89. scaler/protocol/capnp/object_storage.capnp +57 -0
  90. scaler/protocol/capnp/status.capnp +73 -0
  91. scaler/protocol/introduction.md +105 -0
  92. scaler/protocol/python/__init__.py +0 -0
  93. scaler/protocol/python/common.py +140 -0
  94. scaler/protocol/python/message.py +751 -0
  95. scaler/protocol/python/mixins.py +13 -0
  96. scaler/protocol/python/object_storage.py +118 -0
  97. scaler/protocol/python/status.py +279 -0
  98. scaler/protocol/worker.md +228 -0
  99. scaler/scheduler/__init__.py +0 -0
  100. scaler/scheduler/allocate_policy/__init__.py +0 -0
  101. scaler/scheduler/allocate_policy/allocate_policy.py +9 -0
  102. scaler/scheduler/allocate_policy/capability_allocate_policy.py +280 -0
  103. scaler/scheduler/allocate_policy/even_load_allocate_policy.py +159 -0
  104. scaler/scheduler/allocate_policy/mixins.py +55 -0
  105. scaler/scheduler/controllers/__init__.py +0 -0
  106. scaler/scheduler/controllers/balance_controller.py +65 -0
  107. scaler/scheduler/controllers/client_controller.py +131 -0
  108. scaler/scheduler/controllers/config_controller.py +31 -0
  109. scaler/scheduler/controllers/graph_controller.py +424 -0
  110. scaler/scheduler/controllers/information_controller.py +81 -0
  111. scaler/scheduler/controllers/mixins.py +194 -0
  112. scaler/scheduler/controllers/object_controller.py +147 -0
  113. scaler/scheduler/controllers/scaling_policies/__init__.py +0 -0
  114. scaler/scheduler/controllers/scaling_policies/fixed_elastic.py +145 -0
  115. scaler/scheduler/controllers/scaling_policies/mixins.py +10 -0
  116. scaler/scheduler/controllers/scaling_policies/null.py +14 -0
  117. scaler/scheduler/controllers/scaling_policies/types.py +9 -0
  118. scaler/scheduler/controllers/scaling_policies/utility.py +20 -0
  119. scaler/scheduler/controllers/scaling_policies/vanilla.py +95 -0
  120. scaler/scheduler/controllers/task_controller.py +376 -0
  121. scaler/scheduler/controllers/worker_controller.py +169 -0
  122. scaler/scheduler/object_usage/__init__.py +0 -0
  123. scaler/scheduler/object_usage/object_tracker.py +131 -0
  124. scaler/scheduler/scheduler.py +251 -0
  125. scaler/scheduler/task/__init__.py +0 -0
  126. scaler/scheduler/task/task_state_machine.py +92 -0
  127. scaler/scheduler/task/task_state_manager.py +61 -0
  128. scaler/ui/__init__.py +0 -0
  129. scaler/ui/common/__init__.py +0 -0
  130. scaler/ui/common/constants.py +9 -0
  131. scaler/ui/common/live_display.py +147 -0
  132. scaler/ui/common/memory_window.py +146 -0
  133. scaler/ui/common/setting_page.py +40 -0
  134. scaler/ui/common/task_graph.py +840 -0
  135. scaler/ui/common/task_log.py +111 -0
  136. scaler/ui/common/utility.py +66 -0
  137. scaler/ui/common/webui.py +80 -0
  138. scaler/ui/common/worker_processors.py +104 -0
  139. scaler/ui/v1.py +76 -0
  140. scaler/ui/v2.py +102 -0
  141. scaler/ui/webui.py +21 -0
  142. scaler/utility/__init__.py +0 -0
  143. scaler/utility/debug.py +19 -0
  144. scaler/utility/event_list.py +63 -0
  145. scaler/utility/event_loop.py +58 -0
  146. scaler/utility/exceptions.py +42 -0
  147. scaler/utility/formatter.py +44 -0
  148. scaler/utility/graph/__init__.py +0 -0
  149. scaler/utility/graph/optimization.py +27 -0
  150. scaler/utility/graph/topological_sorter.py +11 -0
  151. scaler/utility/graph/topological_sorter_graphblas.py +174 -0
  152. scaler/utility/identifiers.py +107 -0
  153. scaler/utility/logging/__init__.py +0 -0
  154. scaler/utility/logging/decorators.py +25 -0
  155. scaler/utility/logging/scoped_logger.py +33 -0
  156. scaler/utility/logging/utility.py +183 -0
  157. scaler/utility/many_to_many_dict.py +123 -0
  158. scaler/utility/metadata/__init__.py +0 -0
  159. scaler/utility/metadata/profile_result.py +31 -0
  160. scaler/utility/metadata/task_flags.py +30 -0
  161. scaler/utility/mixins.py +13 -0
  162. scaler/utility/network_util.py +7 -0
  163. scaler/utility/one_to_many_dict.py +72 -0
  164. scaler/utility/queues/__init__.py +0 -0
  165. scaler/utility/queues/async_indexed_queue.py +37 -0
  166. scaler/utility/queues/async_priority_queue.py +70 -0
  167. scaler/utility/queues/async_sorted_priority_queue.py +45 -0
  168. scaler/utility/queues/indexed_queue.py +114 -0
  169. scaler/utility/serialization.py +9 -0
  170. scaler/version.txt +1 -0
  171. scaler/worker/__init__.py +0 -0
  172. scaler/worker/agent/__init__.py +0 -0
  173. scaler/worker/agent/heartbeat_manager.py +110 -0
  174. scaler/worker/agent/mixins.py +137 -0
  175. scaler/worker/agent/processor/__init__.py +0 -0
  176. scaler/worker/agent/processor/object_cache.py +107 -0
  177. scaler/worker/agent/processor/processor.py +285 -0
  178. scaler/worker/agent/processor/streaming_buffer.py +28 -0
  179. scaler/worker/agent/processor_holder.py +147 -0
  180. scaler/worker/agent/processor_manager.py +369 -0
  181. scaler/worker/agent/profiling_manager.py +109 -0
  182. scaler/worker/agent/task_manager.py +150 -0
  183. scaler/worker/agent/timeout_manager.py +19 -0
  184. scaler/worker/preload.py +84 -0
  185. scaler/worker/worker.py +265 -0
  186. scaler/worker_adapter/__init__.py +0 -0
  187. scaler/worker_adapter/common.py +26 -0
  188. scaler/worker_adapter/ecs.py +241 -0
  189. scaler/worker_adapter/native.py +138 -0
  190. scaler/worker_adapter/symphony/__init__.py +0 -0
  191. scaler/worker_adapter/symphony/callback.py +45 -0
  192. scaler/worker_adapter/symphony/heartbeat_manager.py +82 -0
  193. scaler/worker_adapter/symphony/message.py +24 -0
  194. scaler/worker_adapter/symphony/task_manager.py +289 -0
  195. scaler/worker_adapter/symphony/worker.py +204 -0
  196. scaler/worker_adapter/symphony/worker_adapter.py +123 -0
@@ -0,0 +1,280 @@
+ import dataclasses
+ import logging
+ import typing
+ from collections import OrderedDict, defaultdict
+ from itertools import takewhile
+ from typing import Dict, Iterable, List, Optional, Set
+
+ from sortedcontainers import SortedList
+
+ from scaler.protocol.python.message import Task
+ from scaler.scheduler.allocate_policy.mixins import TaskAllocatePolicy
+ from scaler.utility.identifiers import TaskID, WorkerID
+
+
+ @dataclasses.dataclass(frozen=True)
+ class _TaskHolder:
+     task_id: TaskID = dataclasses.field()
+     capabilities: Set[str] = dataclasses.field()
+
+
+ @dataclasses.dataclass(frozen=True)
+ class _WorkerHolder:
+     worker_id: WorkerID = dataclasses.field()
+
+     capabilities: Set[str] = dataclasses.field()
+     queue_size: int = dataclasses.field()
+
+     # Queued tasks, ordered from oldest to youngest.
+     task_id_to_task: typing.OrderedDict[TaskID, _TaskHolder] = dataclasses.field(default_factory=OrderedDict)
+
+     def n_tasks(self) -> int:
+         return len(self.task_id_to_task)
+
+     def n_free(self) -> int:
+         return self.queue_size - self.n_tasks()
+
+     def copy(self) -> "_WorkerHolder":
+         return _WorkerHolder(self.worker_id, self.capabilities, self.queue_size, self.task_id_to_task.copy())
+
+
+ class CapabilityAllocatePolicy(TaskAllocatePolicy):
+     """
+     This allocation policy assigns tasks to workers that support the requested task capabilities, while trying to
+     keep the load across workers as equal as possible.
+     """
+
+     def __init__(self):
+         self._worker_id_to_worker: Dict[WorkerID, _WorkerHolder] = {}
+
+         self._task_id_to_worker_id: Dict[TaskID, WorkerID] = {}
+         self._capability_to_worker_ids: Dict[str, Set[WorkerID]] = {}
+
+     def add_worker(self, worker: WorkerID, capabilities: Dict[str, int], queue_size: int) -> bool:
+         if any(capability_value != -1 for capability_value in capabilities.values()):
+             logging.warning(f"allocate policy ignores non-infinite worker capabilities: {capabilities!r}.")
+
+         if worker in self._worker_id_to_worker:
+             return False
+
+         worker_holder = _WorkerHolder(worker_id=worker, capabilities=set(capabilities.keys()), queue_size=queue_size)
+         self._worker_id_to_worker[worker] = worker_holder
+
+         for capability in worker_holder.capabilities:
+             if capability not in self._capability_to_worker_ids:
+                 self._capability_to_worker_ids[capability] = set()
+
+             self._capability_to_worker_ids[capability].add(worker)
+
+         return True
+
+     def remove_worker(self, worker: WorkerID) -> List[TaskID]:
+         worker_holder = self._worker_id_to_worker.pop(worker, None)
+
+         if worker_holder is None:
+             return []
+
+         for capability in worker_holder.capabilities:
+             self._capability_to_worker_ids[capability].discard(worker)
+             if len(self._capability_to_worker_ids[capability]) == 0:
+                 self._capability_to_worker_ids.pop(capability)
+
+         task_ids = list(worker_holder.task_id_to_task.keys())
+         for task_id in task_ids:
+             self._task_id_to_worker_id.pop(task_id)
+
+         return task_ids
+
+     def get_worker_ids(self) -> Set[WorkerID]:
+         return set(self._worker_id_to_worker.keys())
+
+     def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
+         return self._task_id_to_worker_id.get(task_id, WorkerID.invalid_worker_id())
+
+     def balance(self) -> Dict[WorkerID, List[TaskID]]:
+         """Returns, for every worker id, the list of task ids to balance out."""
+
+         has_idle_workers = any(worker.n_tasks() == 0 for worker in self._worker_id_to_worker.values())
+
+         if not has_idle_workers:
+             return {}
+
+         # The balancing algorithm works by trying to move tasks from workers that have more queued tasks than the
+         # average (high-load workers) to workers that have fewer tasks than the average (low-load workers).
+         #
+         # The overall worst-case time complexity of the balancing algorithm is:
+         #
+         #     O(n_workers * log(n_workers) + n_tasks * n_workers * n_capabilities)
+         #
+         # However, if the cluster does not use any capability, time complexity is always:
+         #
+         #     O(n_workers * log(n_workers) + n_tasks * log(n_workers))
+         #
+         # If capability constraints are used, this might result in less than optimal balancing. That's because, in
+         # some cases, the optimal balancing might require moving tasks between more than two workers. Consider this
+         # cluster's state:
+         #
+         #     Worker 1
+         #         Supported capabilities: {Linux, GPU}
+         #         Tasks:
+         #             Task 1: {Linux}
+         #
+         #     Worker 2
+         #         Supported capabilities: {Linux}
+         #         Tasks: None
+         #
+         #     Worker 3
+         #         Supported capabilities: {GPU}
+         #         Tasks:
+         #             Task 2: {GPU}
+         #             Task 3: {GPU}
+         #
+         # Here, the algorithm will not be able to rebalance the cluster, while ideally we could move Task 1 to
+         # Worker 2 and then Task 3 to Worker 1.
+         #
+         # Balancing algorithms that can find this optimal balancing exist (assignment problem), but these are
+         # complex and slow. These might also cause a lot of messages to be propagated through the cluster.
+         #
+         # See <https://github.com/finos/opengris-scaler/issues/32#issuecomment-2541897645> for more details.
+
+         n_tasks = sum(worker.n_tasks() for worker in self._worker_id_to_worker.values())
+         avg_tasks_per_worker = n_tasks / len(self._worker_id_to_worker)
+
+         def is_balanced(worker: _WorkerHolder) -> bool:
+             return abs(worker.n_tasks() - avg_tasks_per_worker) < 1
+
+         # First, we create a copy of the current worker objects so that we can modify their respective task queues.
+         # We also filter out workers that are already balanced, as we will not touch these.
+         #
+         # Time complexity is O(n_workers + n_tasks)
+
+         workers = [worker.copy() for worker in self._worker_id_to_worker.values() if not is_balanced(worker)]
+
+         # Then, we sort the remaining workers by the number of queued tasks.
+         #
+         # Time complexity is O(n_workers * log(n_workers))
+
+         sorted_workers: SortedList[_WorkerHolder] = SortedList(workers, key=lambda worker: worker.n_tasks())
+
+         # Finally, we repeatedly remove one task from the most loaded worker until either:
+         #
+         # - all workers are balanced;
+         # - we cannot find a low-load worker that can accept tasks from a high-load worker.
+         #
+         # Worst-case time complexity is O(n_tasks * n_workers * n_capabilities).
+         # If no capability is used in the cluster, complexity is always O(n_tasks * log(n_workers)).
+
+         balancing_advice: Dict[WorkerID, List[TaskID]] = defaultdict(list)
+         unbalanceable_tasks: Set[bytes] = set()
+
+         while len(sorted_workers) >= 2:
+             most_loaded_worker: _WorkerHolder = sorted_workers.pop(-1)
+
+             if is_balanced(most_loaded_worker):
+                 # Most loaded worker is not high-load, stop
+                 break
+
+             # Go through all of the most loaded worker's tasks, trying to find a low-load worker that can accept it.
+
+             receiving_worker: Optional[_WorkerHolder] = None
+             moved_task: Optional[_TaskHolder] = None
+
+             for task in reversed(most_loaded_worker.task_id_to_task.values()):  # Try to balance youngest tasks first.
+                 if task.task_id in unbalanceable_tasks:
+                     continue
+
+                 worker_candidates = takewhile(lambda worker: worker.n_tasks() < avg_tasks_per_worker, sorted_workers)
+
+                 receiving_worker_index = self.__balance_try_reassign_task(task, worker_candidates)
+
+                 if receiving_worker_index is not None:
+                     receiving_worker = sorted_workers.pop(receiving_worker_index)
+                     moved_task = task
+                     break
+                 else:
+                     # We could not find a receiving worker for this task; remember the task as unbalanceable in case
+                     # the worker pops up again. This greatly reduces the worst-case big-O complexity of the algorithm.
+                     unbalanceable_tasks.add(task.task_id)
+
+             # Re-insert the workers into the sorted list if they can be balanced further.
+
+             if moved_task is not None:
+                 assert receiving_worker is not None
+
+                 balancing_advice[most_loaded_worker.worker_id].append(moved_task.task_id)
+
+                 most_loaded_worker.task_id_to_task.pop(moved_task.task_id)
+                 receiving_worker.task_id_to_task[moved_task.task_id] = moved_task
+
+                 if not is_balanced(most_loaded_worker):
+                     sorted_workers.add(most_loaded_worker)
+
+                 if not is_balanced(receiving_worker):
+                     sorted_workers.add(receiving_worker)
+
+         return balancing_advice
+
+     @staticmethod
+     def __balance_try_reassign_task(task: _TaskHolder, worker_candidates: Iterable[_WorkerHolder]) -> Optional[int]:
+         """Returns the index of the first worker that can accept the task."""
+
+         # Time complexity is O(n_workers * len(task.capabilities))
+
+         for worker_index, worker in enumerate(worker_candidates):
+             if task.capabilities.issubset(worker.capabilities):
+                 return worker_index
+
+         return None
+
+     def assign_task(self, task: Task) -> WorkerID:
+         # Worst-case time complexity is O(n_workers * len(task.capabilities))
+
+         available_workers = self.__get_available_workers_for_capabilities(task.capabilities)
+
+         if len(available_workers) == 0:
+             return WorkerID.invalid_worker_id()
+
+         # Select the worker that has the least amount of queued tasks. We could select the worker that has the most
+         # free queue task slots, but that might needlessly idle workers that have a smaller queue.
+
+         min_loaded_worker = min(available_workers, key=lambda worker: worker.n_tasks())
+         min_loaded_worker.task_id_to_task[task.task_id] = _TaskHolder(task.task_id, set(task.capabilities.keys()))
+
+         self._task_id_to_worker_id[task.task_id] = min_loaded_worker.worker_id
+
+         return min_loaded_worker.worker_id
+
+     def remove_task(self, task_id: TaskID) -> WorkerID:
+         worker_id = self._task_id_to_worker_id.pop(task_id, None)
+
+         if worker_id is None:
+             return WorkerID.invalid_worker_id()
+
+         worker = self._worker_id_to_worker[worker_id]
+         worker.task_id_to_task.pop(task_id)
+
+         return worker_id
+
+     def has_available_worker(self, capabilities: Optional[Dict[str, int]] = None) -> bool:
+         return len(self.__get_available_workers_for_capabilities(capabilities or {})) > 0
+
+     def statistics(self) -> Dict:
+         return {
+             worker.worker_id: {"free": worker.n_free(), "sent": worker.n_tasks(), "capabilities": worker.capabilities}
+             for worker in self._worker_id_to_worker.values()
+         }
+
+     def __get_available_workers_for_capabilities(self, capabilities: Dict[str, int]) -> List[_WorkerHolder]:
+         # Worst-case time complexity is O(n_workers * len(capabilities))
+
+         if any(capability not in self._capability_to_worker_ids for capability in capabilities.keys()):
+             return []
+
+         matching_worker_ids = set(self._worker_id_to_worker.keys())
+
+         for capability in capabilities.keys():
+             matching_worker_ids.intersection_update(self._capability_to_worker_ids[capability])
+
+         matching_workers = [self._worker_id_to_worker[worker_id] for worker_id in matching_worker_ids]
+
+         return [worker for worker in matching_workers if worker.n_free() > 0]
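Editor's note: the comment in balance() above describes a three-worker cluster that the pairwise algorithm cannot rebalance. As a standalone illustration (not part of the published wheel), the sketch below reproduces that scenario with plain sets and the same subset check used by __balance_try_reassign_task; the worker and task names are made up for the example.

worker_capabilities = {
    "worker-1": {"Linux", "GPU"},
    "worker-2": {"Linux"},
    "worker-3": {"GPU"},
}
worker_tasks = {
    "worker-1": ["task-1"],            # task-1 requires {Linux}
    "worker-2": [],
    "worker-3": ["task-2", "task-3"],  # both require {GPU}
}

def can_accept(task_capabilities: set, worker: str) -> bool:
    # Same rule as __balance_try_reassign_task: the worker must support every requested capability.
    return task_capabilities.issubset(worker_capabilities[worker])

# The average load is 1 task per worker, so worker-3 is overloaded and only worker-2 (0 tasks) is a
# candidate receiver. worker-2 cannot take a GPU task, so no pairwise move exists, even though
# moving task-1 to worker-2 and then task-3 to worker-1 would balance the cluster.
print(can_accept({"GPU"}, "worker-2"))  # False
print(can_accept({"GPU"}, "worker-1"))  # True, but worker-1 is not below the average load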
@@ -0,0 +1,159 @@
+ import logging
+ import math
+ from typing import Dict, List, Optional, Set
+
+ from scaler.protocol.python.message import Task
+ from scaler.scheduler.allocate_policy.mixins import TaskAllocatePolicy
+ from scaler.utility.identifiers import TaskID, WorkerID
+ from scaler.utility.queues.async_priority_queue import AsyncPriorityQueue
+ from scaler.utility.queues.indexed_queue import IndexedQueue
+
+
+ class EvenLoadAllocatePolicy(TaskAllocatePolicy):
+     """This allocation policy tries to keep the load across all workers as equal as possible."""
+
+     def __init__(self):
+         self._workers_to_queue_size: Dict[bytes, int] = dict()
+         self._workers_to_task_ids: Dict[WorkerID, IndexedQueue] = dict()
+         self._task_id_to_worker: Dict[TaskID, WorkerID] = {}
+
+         self._worker_queue: AsyncPriorityQueue = AsyncPriorityQueue()
+
+     def add_worker(self, worker: WorkerID, capabilities: Dict[str, int], queue_size: int) -> bool:
+         if len(capabilities) > 0:
+             logging.warning(f"allocate policy ignores worker capabilities: {capabilities!r}.")
+
+         # TODO: handle uneven queue size for each worker
+         if worker in self._workers_to_task_ids:
+             return False
+
+         self._workers_to_task_ids[worker] = IndexedQueue()
+         self._workers_to_queue_size[worker] = queue_size
+
+         self._worker_queue.put_nowait([0, worker])
+         return True
+
+     def remove_worker(self, worker: WorkerID) -> List[TaskID]:
+         if worker not in self._workers_to_task_ids:
+             return []
+
+         self._worker_queue.remove(worker)
+
+         task_ids = list(self._workers_to_task_ids.pop(worker))
+         for task_id in task_ids:
+             self._task_id_to_worker.pop(task_id)
+         return task_ids
+
+     def get_worker_ids(self) -> Set[WorkerID]:
+         return set(self._workers_to_task_ids.keys())
+
+     def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
+         return self._task_id_to_worker.get(task_id, WorkerID.invalid_worker_id())
+
+     def balance(self) -> Dict[WorkerID, List[TaskID]]:
+         """Returns, for every worker, the list of tasks to balance out."""
+
+         # TODO: handle uneven queue size for each worker
+         balance_count = self.__get_balance_count_by_worker()
+
+         balance_result = {}
+
+         for worker, count in balance_count.items():
+             if count == 0:
+                 continue
+
+             tasks = list(self._workers_to_task_ids[worker])
+             balance_result[worker] = tasks[-count:]  # balance out the most recently queued tasks
+
+         return balance_result
+
+     def __get_balance_count_by_worker(self) -> Dict[WorkerID, int]:
+         """Returns, for every worker, the number of tasks to balance out."""
+
+         queued_tasks_per_worker = {
+             worker: max(0, len(tasks) - 1) for worker, tasks in self._workers_to_task_ids.items()
+         }
+
+         any_worker_has_queued_task = any(queued_tasks_per_worker.values())
+
+         if not any_worker_has_queued_task:
+             return {}
+
+         number_of_idle_workers = sum(1 for tasks in self._workers_to_task_ids.values() if len(tasks) == 0)
+
+         if number_of_idle_workers == 0:
+             return {}
+
+         mean_queued = math.ceil(sum(queued_tasks_per_worker.values()) / len(queued_tasks_per_worker))
+
+         balance_count = {worker: max(0, count - mean_queued) for worker, count in queued_tasks_per_worker.items()}
+
+         over_mean_advice_total = sum(balance_count.values())
+         minimal_allocate = min(number_of_idle_workers, sum(queued_tasks_per_worker.values()))
+
+         if over_mean_advice_total >= minimal_allocate:
+             return balance_count
+
+         total_to_be_balance = minimal_allocate - over_mean_advice_total
+         for worker, count in queued_tasks_per_worker.items():
+             assert total_to_be_balance >= 0, "total_to_be_balance must be positive"
+             if total_to_be_balance == 0:
+                 break
+
+             leftover = count - balance_count[worker]
+             if leftover < 1:
+                 continue
+
+             to_to_balance = min(leftover, total_to_be_balance)
+             balance_count[worker] += to_to_balance
+             total_to_be_balance -= to_to_balance
+
+         return balance_count
+
+     def assign_task(self, task: Task) -> WorkerID:
+         if len(task.capabilities) > 0:
+             logging.warning(f"allocate policy ignores task capabilities: {task.capabilities!r}.")
+
+         task_id = task.task_id
+
+         if task_id in self._task_id_to_worker:
+             return self._task_id_to_worker[task_id]
+
+         if self._worker_queue.empty():
+             return WorkerID.invalid_worker_id()
+
+         count, worker = self._worker_queue.get_nowait()
+         if count == self._workers_to_queue_size[worker]:
+             self._worker_queue.put_nowait([count, worker])
+             return WorkerID.invalid_worker_id()
+
+         self._workers_to_task_ids[worker].put(task_id)
+         self._task_id_to_worker[task_id] = worker
+         self._worker_queue.put_nowait([count + 1, worker])
+         return worker
+
+     def remove_task(self, task_id: TaskID) -> WorkerID:
+         if task_id not in self._task_id_to_worker:
+             return WorkerID.invalid_worker_id()
+
+         worker = self._task_id_to_worker.pop(task_id)
+         self._workers_to_task_ids[worker].remove(task_id)
+
+         self._worker_queue.decrease_priority(worker)
+         return worker
+
+     def has_available_worker(self, capabilities: Optional[Dict[str, int]] = None) -> bool:
+         if not len(self._worker_queue):
+             return False
+
+         count, worker = self._worker_queue.max_priority_item()
+         if count == self._workers_to_queue_size[worker]:
+             return False
+
+         return True
+
+     def statistics(self) -> Dict:
+         return {
+             worker: {"free": self._workers_to_queue_size[worker] - len(tasks), "sent": len(tasks)}
+             for worker, tasks in self._workers_to_task_ids.items()
+         }
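Editor's note: for reference (not part of the published wheel), here is the arithmetic of __get_balance_count_by_worker worked through on a small example with plain string worker ids; it shows how the ceil-of-mean threshold decides how many tasks each busy worker is asked to give up.

import math

workers_to_task_ids = {"worker-a": ["t1", "t2", "t3", "t4", "t5"], "worker-b": [], "worker-c": ["t6"]}

# One task per busy worker is treated as running, so only len(tasks) - 1 count as queued.
queued = {worker: max(0, len(tasks) - 1) for worker, tasks in workers_to_task_ids.items()}
# queued == {"worker-a": 4, "worker-b": 0, "worker-c": 0}

number_of_idle_workers = sum(1 for tasks in workers_to_task_ids.values() if len(tasks) == 0)  # 1
mean_queued = math.ceil(sum(queued.values()) / len(queued))  # ceil(4 / 3) == 2

# Each worker is first asked to give up whatever it holds above the mean.
balance_count = {worker: max(0, count - mean_queued) for worker, count in queued.items()}

# sum(balance_count.values()) == 2 already covers the single idle worker, so no extra tasks are pulled.
print(balance_count)  # {'worker-a': 2, 'worker-b': 0, 'worker-c': 0}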
@@ -0,0 +1,55 @@
+ import abc
+ from typing import Dict, List, Optional, Set
+
+ from scaler.protocol.python.message import Task
+ from scaler.utility.identifiers import TaskID, WorkerID
+
+
+ class TaskAllocatePolicy(metaclass=abc.ABCMeta):
+     @abc.abstractmethod
+     def add_worker(self, worker: WorkerID, capabilities: Dict[str, int], queue_size: int) -> bool:
+         """Adds a worker to the worker collection."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def remove_worker(self, worker: WorkerID) -> List[TaskID]:
+         """Removes a worker from the worker collection and returns the task ids assigned to the removed worker."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def get_worker_ids(self) -> Set[WorkerID]:
+         """Returns all worker ids as a set."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
+         """Returns the worker assigned to this task_id, or an invalid worker ID if no worker is assigned to this
+         task id."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def balance(self) -> Dict[WorkerID, List[TaskID]]:
+         """Balances the workers; returns, for each overburdened worker, the list of task ids to move away,
+         represented as a worker identity to list of task ids dictionary."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def assign_task(self, task: Task) -> WorkerID:
+         """Assigns the task in the allocator; returns the worker the task has been assigned to, or an invalid
+         worker ID if no worker is available."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def remove_task(self, task_id: TaskID) -> WorkerID:
+         """Removes the task from the allocator; returns the worker associated with the removed task_id, or an
+         invalid worker ID if no such worker is found."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def has_available_worker(self, capabilities: Optional[Dict[str, int]] = None) -> bool:
+         """Returns whether a worker is available, possibly constrained to the requested task capabilities."""
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def statistics(self) -> Dict:
+         raise NotImplementedError()
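Editor's note: to make the contract above concrete, the following is a hypothetical, minimal policy that satisfies TaskAllocatePolicy (it is not shipped in the wheel). It ignores capabilities and queue sizes and always assigns to the first registered worker, so it is only meant to show which methods an implementation must provide.

from typing import Dict, List, Optional, Set

from scaler.protocol.python.message import Task
from scaler.scheduler.allocate_policy.mixins import TaskAllocatePolicy
from scaler.utility.identifiers import TaskID, WorkerID


class FirstWorkerAllocatePolicy(TaskAllocatePolicy):
    def __init__(self):
        self._workers: Dict[WorkerID, Set[TaskID]] = {}
        self._task_to_worker: Dict[TaskID, WorkerID] = {}

    def add_worker(self, worker: WorkerID, capabilities: Dict[str, int], queue_size: int) -> bool:
        if worker in self._workers:
            return False
        self._workers[worker] = set()
        return True

    def remove_worker(self, worker: WorkerID) -> List[TaskID]:
        task_ids = list(self._workers.pop(worker, set()))
        for task_id in task_ids:
            self._task_to_worker.pop(task_id)
        return task_ids

    def get_worker_ids(self) -> Set[WorkerID]:
        return set(self._workers.keys())

    def get_worker_by_task_id(self, task_id: TaskID) -> WorkerID:
        return self._task_to_worker.get(task_id, WorkerID.invalid_worker_id())

    def balance(self) -> Dict[WorkerID, List[TaskID]]:
        return {}  # never rebalances

    def assign_task(self, task: Task) -> WorkerID:
        if not self._workers:
            return WorkerID.invalid_worker_id()
        worker = next(iter(self._workers))  # always picks the first registered worker
        self._workers[worker].add(task.task_id)
        self._task_to_worker[task.task_id] = worker
        return worker

    def remove_task(self, task_id: TaskID) -> WorkerID:
        worker = self._task_to_worker.pop(task_id, None)
        if worker is None:
            return WorkerID.invalid_worker_id()
        self._workers[worker].discard(task_id)
        return worker

    def has_available_worker(self, capabilities: Optional[Dict[str, int]] = None) -> bool:
        return len(self._workers) > 0

    def statistics(self) -> Dict:
        return {worker: {"sent": len(task_ids)} for worker, task_ids in self._workers.items()}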
File without changes
@@ -0,0 +1,65 @@
+ import logging
+ from typing import Dict, List, Optional
+
+ from scaler.io.mixins import AsyncBinder, AsyncConnector
+ from scaler.protocol.python.message import StateBalanceAdvice
+ from scaler.scheduler.allocate_policy.mixins import TaskAllocatePolicy
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
+ from scaler.scheduler.controllers.mixins import TaskController
+ from scaler.utility.identifiers import TaskID, WorkerID
+ from scaler.utility.mixins import Looper
+
+
+ class VanillaBalanceController(Looper):
+     def __init__(self, config_controller: VanillaConfigController, task_allocate_policy: TaskAllocatePolicy):
+         self._config_controller = config_controller
+
+         self._task_allocate_policy = task_allocate_policy
+
+         self._last_balance_advice: Dict[WorkerID, List[TaskID]] = dict()
+         self._same_load_balance_advice_count = 0
+
+         self._binder: Optional[AsyncBinder] = None
+         self._binder_monitor: Optional[AsyncConnector] = None
+
+         self._task_controller: Optional[TaskController] = None
+
+     def register(self, binder: AsyncBinder, binder_monitor: AsyncConnector, task_controller: TaskController):
+         self._binder = binder
+         self._binder_monitor = binder_monitor
+
+         self._task_controller = task_controller
+
+     async def routine(self):
+         current_advice = self._task_allocate_policy.balance()
+         if not self.__should_balance(current_advice):
+             return
+
+         worker_to_num_tasks = {worker: len(task_ids) for worker, task_ids in current_advice.items()}
+         logging.info(f"balancing task: {worker_to_num_tasks}")
+         for worker, task_ids in current_advice.items():
+             await self._binder_monitor.send(StateBalanceAdvice.new_msg(worker, task_ids))
+
+         self._last_balance_advice = current_advice
+         for worker, task_ids in current_advice.items():
+             for task_id in task_ids:
+                 await self._task_controller.on_task_balance_cancel(task_id)
+
+     def __should_balance(self, current_advice: Dict[WorkerID, List[TaskID]]) -> bool:
+         # 1. if this is the same advice as last time, increment the same-advice count
+         # 2. if a different advice comes in, reset the same-advice count to 0
+         if self._last_balance_advice == current_advice:
+             self._same_load_balance_advice_count += 1
+         else:
+             self._last_balance_advice = current_advice
+             self._same_load_balance_advice_count = 0
+
+         # only start balancing once we have seen the same advice for at least the configured number of trigger times
+         if 0 < self._same_load_balance_advice_count < self._config_controller.get_config("load_balance_trigger_times"):
+             return False
+
+         # if the current advice is empty, skip balancing
+         if not current_advice:
+             return False
+
+         return True
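Editor's note: the advice counter in __should_balance is easiest to see on a short timeline. The sketch below (not part of the published wheel) simulates the same decision rule with a plain function; trigger_times stands in for the load_balance_trigger_times configuration value, and the worker and task names are illustrative.

def simulate(advices, trigger_times):
    # Mirrors the counting logic of __should_balance for a sequence of balance() results.
    last_advice, same_count, decisions = None, 0, []
    for advice in advices:
        if last_advice == advice:
            same_count += 1
        else:
            last_advice, same_count = advice, 0
        should_balance = bool(advice) and not (0 < same_count < trigger_times)
        decisions.append(should_balance)
    return decisions

advice = {"worker-1": ["task-9"]}
# A new advice balances immediately (count 0); repeats are suppressed until the counter reaches trigger_times.
print(simulate([advice, advice, advice, advice], trigger_times=2))  # [True, False, True, True]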
@@ -0,0 +1,131 @@
+ import logging
+ import time
+ from typing import Dict, Optional, Set, Tuple
+
+ from scaler.io.mixins import AsyncBinder, AsyncConnector
+ from scaler.protocol.python.message import (
+     ClientDisconnect,
+     ClientHeartbeat,
+     ClientHeartbeatEcho,
+     ClientShutdownResponse,
+     TaskCancel,
+ )
+ from scaler.protocol.python.status import ClientManagerStatus
+ from scaler.scheduler.controllers.config_controller import VanillaConfigController
+ from scaler.scheduler.controllers.mixins import ClientController, ObjectController, TaskController, WorkerController
+ from scaler.utility.exceptions import ClientShutdownException
+ from scaler.utility.identifiers import ClientID, TaskID
+ from scaler.utility.mixins import Looper, Reporter
+ from scaler.utility.one_to_many_dict import OneToManyDict
+
+
+ class VanillaClientController(ClientController, Looper, Reporter):
+     def __init__(self, config_controller: VanillaConfigController):
+         self._config_controller = config_controller
+
+         self._client_to_task_ids: OneToManyDict[ClientID, TaskID] = OneToManyDict()
+
+         self._binder: Optional[AsyncBinder] = None
+         self._binder_monitor: Optional[AsyncConnector] = None
+         self._object_controller: Optional[ObjectController] = None
+         self._task_controller: Optional[TaskController] = None
+         self._worker_controller: Optional[WorkerController] = None
+
+         self._client_last_seen: Dict[ClientID, Tuple[float, ClientHeartbeat]] = dict()
+
+     def register(
+         self,
+         binder: AsyncBinder,
+         binder_monitor: AsyncConnector,
+         object_controller: ObjectController,
+         task_controller: TaskController,
+         worker_controller: WorkerController,
+     ):
+         self._binder = binder
+         self._binder_monitor = binder_monitor
+         self._object_controller = object_controller
+         self._task_controller = task_controller
+         self._worker_controller = worker_controller
+
+     def get_client_task_ids(self, client_id: ClientID) -> Set[TaskID]:
+         return self._client_to_task_ids.get_values(client_id)
+
+     def has_client_id(self, client_id: ClientID) -> bool:
+         return client_id in self._client_last_seen
+
+     def get_client_id(self, task_id: TaskID) -> Optional[ClientID]:
+         return self._client_to_task_ids.get_key(task_id)
+
+     def on_task_begin(self, client_id: ClientID, task_id: TaskID):
+         self._client_to_task_ids.add(client_id, task_id)
+
+     def on_task_finish(self, task_id: TaskID) -> ClientID:
+         return self._client_to_task_ids.remove_value(task_id)
+
+     async def on_heartbeat(self, client_id: ClientID, info: ClientHeartbeat):
+         await self._binder.send(
+             client_id,
+             ClientHeartbeatEcho.new_msg(
+                 object_storage_address=self._config_controller.get_config("object_storage_address")
+             ),
+         )
+         if client_id not in self._client_last_seen:
+             logging.info(f"{client_id!r} connected")
+
+         self._client_last_seen[client_id] = (time.time(), info)
+
+     async def on_client_disconnect(self, client_id: ClientID, request: ClientDisconnect):
+         if request.disconnect_type == ClientDisconnect.DisconnectType.Disconnect:
+             await self.__on_client_disconnect(client_id)
+             return
+
+         if self._config_controller.get_config("protected"):
+             logging.warning("cannot shutdown clusters as scheduler is running in protected mode")
+             accepted = False
+         else:
+             logging.info(f"shutdown scheduler and all clusters as received signal from {client_id!r}")
+             accepted = True
+
+         await self._binder.send(client_id, ClientShutdownResponse.new_msg(accepted=accepted))
+
+         if self._config_controller.get_config("protected"):
+             return
+
+         await self._worker_controller.on_client_shutdown(client_id)
+
+         raise ClientShutdownException(f"received client shutdown from {client_id!r}, quitting")
+
+     async def routine(self):
+         await self.__routine_cleanup_clients()
+
+     def get_status(self) -> ClientManagerStatus:
+         return ClientManagerStatus.new_msg(
+             {client: len(task_ids) for client, task_ids in self._client_to_task_ids.items()}
+         )
+
+     async def __routine_cleanup_clients(self):
+         now = time.time()
+         dead_clients = {
+             client
+             for client, (last_seen, info) in self._client_last_seen.items()
+             if now - last_seen > self._config_controller.get_config("client_timeout_seconds")
+         }
+
+         for client in dead_clients:
+             await self.__on_client_disconnect(client)
+
+     async def __on_client_disconnect(self, client_id: ClientID):
+         logging.info(f"{client_id!r} disconnected")
+         if client_id in self._client_last_seen:
+             self._client_last_seen.pop(client_id)
+
+         await self.__cancel_client_all_tasks(client_id)
+         self._object_controller.clean_client(client_id)
+
+     async def __cancel_client_all_tasks(self, client_id: ClientID):
+         if client_id not in self._client_to_task_ids.keys():
+             return
+
+         tasks = self._client_to_task_ids.get_values(client_id).copy()
+         for task in tasks:
+             await self._task_controller.on_task_cancel(client_id, TaskCancel.new_msg(task))
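Editor's note: the client cleanup in __routine_cleanup_clients reduces to a staleness check over the last-seen timestamps. This standalone sketch (not part of the published wheel) shows that check with plain string client ids and an explicit timeout in place of the client_timeout_seconds configuration value.

import time

client_timeout_seconds = 60.0
now = time.time()
client_last_seen = {
    "client-a": now - 10.0,   # seen 10 seconds ago: still alive
    "client-b": now - 120.0,  # seen 2 minutes ago: considered dead
}

# Same comprehension shape as __routine_cleanup_clients, minus the heartbeat payload.
dead_clients = {
    client for client, last_seen in client_last_seen.items() if now - last_seen > client_timeout_seconds
}
print(dead_clients)  # {'client-b'}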