skypilot-nightly 1.0.0.dev20250119__py3-none-any.whl → 1.0.0.dev20250121__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/serve/autoscalers.py CHANGED
@@ -42,11 +42,10 @@ class AutoscalerDecision:
42
42
  # TODO(MaoZiming): Add a doc to elaborate on autoscaling policies.
43
43
  def __init__(self, operator: AutoscalerDecisionOperator,
44
44
  target: Union[Optional[Dict[str, Any]], int]):
45
-
46
- assert (operator == AutoscalerDecisionOperator.SCALE_UP and
47
- (target is None or isinstance(target, dict))) or (
48
- operator == AutoscalerDecisionOperator.SCALE_DOWN and
49
- isinstance(target, int))
45
+ if operator == AutoscalerDecisionOperator.SCALE_UP:
46
+ assert (target is None or isinstance(target, dict))
47
+ else:
48
+ assert isinstance(target, int)
50
49
  self.operator = operator
51
50
  self.target = target
52
51
 
@@ -54,9 +53,70 @@ class AutoscalerDecision:
54
53
  return f'AutoscalerDecision({self.operator}, {self.target})'
55
54
 
56
55
 
56
+ def _generate_scale_up_decisions(
57
+ num: int, target: Optional[Dict[str, Any]]) -> List[AutoscalerDecision]:
58
+ return [
59
+ AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, target)
60
+ for _ in range(num)
61
+ ]
62
+
63
+
64
+ def _generate_scale_down_decisions(
65
+ replica_ids: List[int]) -> List[AutoscalerDecision]:
66
+ return [
67
+ AutoscalerDecision(AutoscalerDecisionOperator.SCALE_DOWN, replica_id)
68
+ for replica_id in replica_ids
69
+ ]
70
+
71
+
72
+ def _select_nonterminal_replicas_to_scale_down(
73
+ num_replica_to_scale_down: int,
74
+ replica_infos: Iterable['replica_managers.ReplicaInfo'],
75
+ ) -> List[int]:
76
+ """Select nonterminal replicas to scale down.
77
+
78
+ We sort the replicas based on the following order:
79
+ 1. Based on the `scale_down_decision_order` of the status. We terminate
80
+ the replicas that are in an earlier stage first, as the replicas in a
81
+ later stage may become ready soon.
82
+ 2. Based on the version in ascending order, so we scale down the older
83
+ versions first.
84
+ 3. Based on the replica_id in descending order, i.e. the reverse of the
85
+ order in which the replicas were launched. We scale down the replicas
86
+ that were launched later first, as the replicas that were launched
87
+ earlier may become ready soon.
88
+
89
+ Args:
90
+ num_replica_to_scale_down: The number of replicas to scale down.
91
+ replica_infos: The iterable of replica infos to select from.
92
+
93
+ Returns:
94
+ The list of replica ids to scale down.
95
+ """
96
+ replicas = list(replica_infos)
97
+ status_order = serve_state.ReplicaStatus.scale_down_decision_order()
98
+ assert all(info.status in status_order for info in replicas), (
99
+ 'All replicas to scale down should be in provisioning or launched '
100
+ 'status.', replicas)
101
+ replicas = sorted(
102
+ replicas,
103
+ key=lambda info: (
104
+ status_order.index(info.status),
105
+ # version in ascending order
106
+ info.version,
107
+ # replica_id in descending order, i.e. most recently launched first
108
+ -info.replica_id))
109
+ assert len(replicas) >= num_replica_to_scale_down, (
110
+ 'Not enough replicas to scale down. Available replicas: ',
111
+ f'{replicas}, num_replica_to_scale_down: {num_replica_to_scale_down}.')
112
+ return [info.replica_id for info in replicas][:num_replica_to_scale_down]
113
+
114
+
57
115
  class Autoscaler:
58
116
  """Abstract class for autoscalers."""
59
117
 
118
+ # --------------- APIs to implement for custom autoscaler ---------------
119
+
60
120
  def __init__(self, service_name: str,
61
121
  spec: 'service_spec.SkyServiceSpec') -> None:
62
122
  """Initialize the autoscaler.
@@ -67,6 +127,8 @@ class Autoscaler:
67
127
  number of replicas, i.e. min_replicas == max_replicas.
68
128
  target_num_replicas: Target number of replicas output by autoscaler.
69
129
  latest_version: latest version of the service.
130
+ latest_version_ever_ready: The latest version that has ever been ready.
131
+ update_mode: Update mode for the service.
70
132
  """
71
133
  self._service_name: str = service_name
72
134
  self.min_replicas: int = spec.min_replicas
@@ -81,6 +143,10 @@ class Autoscaler:
81
143
  self.latest_version_ever_ready: int = self.latest_version - 1
82
144
  self.update_mode = serve_utils.DEFAULT_UPDATE_MODE
83
145
 
146
+ def _calculate_target_num_replicas(self) -> int:
147
+ """Calculate target number of replicas."""
148
+ raise NotImplementedError
149
+
84
150
  def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
85
151
  update_mode: serve_utils.UpdateMode) -> None:
86
152
  if version <= self.latest_version:
@@ -91,9 +157,9 @@ class Autoscaler:
91
157
  self.min_replicas = spec.min_replicas
92
158
  self.max_replicas = (spec.max_replicas if spec.max_replicas is not None
93
159
  else spec.min_replicas)
94
- # Reclip self.target_num_replicas with new min and max replicas.
95
- self.target_num_replicas = max(
96
- self.min_replicas, min(self.max_replicas, self.target_num_replicas))
160
+ # Re-clip self.target_num_replicas with new min and max replicas.
161
+ self.target_num_replicas = self._clip_target_num_replicas(
162
+ self.target_num_replicas)
97
163
  self.update_mode = update_mode
98
164
 
99
165
  def collect_request_information(
@@ -101,222 +167,56 @@ class Autoscaler:
101
167
  """Collect request information from aggregator for autoscaling."""
102
168
  raise NotImplementedError
103
169
 
104
- def evaluate_scaling(
170
+ def _generate_scaling_decisions(
105
171
  self,
106
172
  replica_infos: List['replica_managers.ReplicaInfo'],
107
173
  ) -> List[AutoscalerDecision]:
108
- """Evaluate autoscale options based on replica information."""
174
+ """Generate Autoscaling decisions based on replica information."""
109
175
  raise NotImplementedError
110
176
 
111
- @classmethod
112
- def from_spec(cls, service_name: str,
113
- spec: 'service_spec.SkyServiceSpec') -> 'Autoscaler':
114
- # TODO(MaoZiming): use NAME to get the class.
115
- if spec.use_ondemand_fallback:
116
- return FallbackRequestRateAutoscaler(service_name, spec)
117
- else:
118
- return RequestRateAutoscaler(service_name, spec)
119
-
120
177
  def _dump_dynamic_states(self) -> Dict[str, Any]:
121
178
  """Dump dynamic states from autoscaler."""
122
179
  raise NotImplementedError
123
180
 
124
- def dump_dynamic_states(self) -> Dict[str, Any]:
125
- """Dump dynamic states from autoscaler."""
126
- states = {'latest_version_ever_ready': self.latest_version_ever_ready}
127
- states.update(self._dump_dynamic_states())
128
- return states
129
-
130
181
  def _load_dynamic_states(self, dynamic_states: Dict[str, Any]) -> None:
131
182
  """Load dynamic states to autoscaler."""
132
183
  raise NotImplementedError
133
184
 
134
- def get_decision_interval(self) -> int:
135
- """Get the decision interval for the autoscaler."""
136
- raise NotImplementedError
137
-
138
- def load_dynamic_states(self, dynamic_states: Dict[str, Any]) -> None:
139
- """Load dynamic states to autoscaler."""
140
- self.latest_version_ever_ready = dynamic_states.pop(
141
- 'latest_version_ever_ready', constants.INITIAL_VERSION)
142
- self._load_dynamic_states(dynamic_states)
143
-
144
-
145
- class RequestRateAutoscaler(Autoscaler):
146
- """RequestRateAutoscaler: Autoscale according to request rate.
147
-
148
- Scales when the number of requests per replica in the given interval
149
- is above or below the target qps per replica. The instance can be
150
- either spot or on-demand, but not both.
151
- """
152
-
153
- def __init__(self, service_name: str,
154
- spec: 'service_spec.SkyServiceSpec') -> None:
155
- """Initialize the request rate autoscaler.
185
+ # --------------- Utility Functions ---------------
156
186
 
157
- Variables:
158
- target_qps_per_replica: Target qps per replica for autoscaling.
159
- qps_window_size: Window size for qps calculating.
160
- request_timestamps: All request timestamps within the window.
161
- upscale_counter: counter for upscale number of replicas.
162
- downscale_counter: counter for downscale number of replicas.
163
- scale_up_consecutive_periods: period for scaling up.
164
- scale_down_consecutive_periods: period for scaling down.
187
+ def _clip_target_num_replicas(self, target_num_replicas: int) -> int:
188
+ """Clip target number of replicas with current minimal and maximum
189
+ number of replicas.
165
190
  """
166
- super().__init__(service_name, spec)
167
- self.target_qps_per_replica: Optional[
168
- float] = spec.target_qps_per_replica
169
- self.qps_window_size: int = constants.AUTOSCALER_QPS_WINDOW_SIZE_SECONDS
170
- self.request_timestamps: List[float] = []
171
- self.upscale_counter: int = 0
172
- self.downscale_counter: int = 0
173
- upscale_delay_seconds = (
174
- spec.upscale_delay_seconds if spec.upscale_delay_seconds is not None
175
- else constants.AUTOSCALER_DEFAULT_UPSCALE_DELAY_SECONDS)
176
- self.scale_up_consecutive_periods: int = int(
177
- upscale_delay_seconds /
178
- constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
179
- downscale_delay_seconds = (
180
- spec.downscale_delay_seconds
181
- if spec.downscale_delay_seconds is not None else
182
- constants.AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS)
183
- self.scale_down_consecutive_periods: int = int(
184
- downscale_delay_seconds /
185
- constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
186
-
187
- def _cal_target_num_replicas_based_on_qps(self) -> int:
188
- # Recalculate target_num_replicas based on QPS.
189
- # Reclip self.target_num_replicas with new min and max replicas.
190
- if self.target_qps_per_replica is None:
191
- return self.min_replicas
192
- target_num_replicas = math.ceil(
193
- len(self.request_timestamps) / self.qps_window_size /
194
- self.target_qps_per_replica)
195
191
  return max(self.min_replicas, min(self.max_replicas,
196
192
  target_num_replicas))
197
193
 
198
- def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
199
- update_mode: serve_utils.UpdateMode) -> None:
200
- super().update_version(version, spec, update_mode)
201
- self.target_qps_per_replica = spec.target_qps_per_replica
202
- upscale_delay_seconds = (
203
- spec.upscale_delay_seconds if spec.upscale_delay_seconds is not None
204
- else constants.AUTOSCALER_DEFAULT_UPSCALE_DELAY_SECONDS)
205
- self.scale_up_consecutive_periods = int(
206
- upscale_delay_seconds /
207
- constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
208
- downscale_delay_seconds = (
209
- spec.downscale_delay_seconds
210
- if spec.downscale_delay_seconds is not None else
211
- constants.AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS)
212
- self.scale_down_consecutive_periods = int(
213
- downscale_delay_seconds /
214
- constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
215
-
216
- # We directly set the target_num_replicas here instead of
217
- # calling `_set_target_num_replica_with_hysteresis` to have the replicas
218
- # quickly scale after each update.
219
- self.target_num_replicas = self._cal_target_num_replicas_based_on_qps()
220
- # Cleanup hysteretic counters.
221
- self.upscale_counter = 0
222
- self.downscale_counter = 0
223
-
224
- def collect_request_information(
225
- self, request_aggregator_info: Dict[str, Any]) -> None:
226
- """Collect request information from aggregator for autoscaling.
227
-
228
- request_aggregator_info should be a dict with the following format:
229
-
230
- {
231
- 'timestamps': [timestamp1 (float), timestamp2 (float), ...]
232
- }
233
- """
234
- self.request_timestamps.extend(
235
- request_aggregator_info.get('timestamps', []))
236
- current_time = time.time()
237
- index = bisect.bisect_left(self.request_timestamps,
238
- current_time - self.qps_window_size)
239
- self.request_timestamps = self.request_timestamps[index:]
240
- logger.info(f'Num of requests in the last {self.qps_window_size} '
241
- f'seconds: {len(self.request_timestamps)}')
242
-
243
- def _set_target_num_replica_with_hysteresis(self) -> None:
244
- """Set target_num_replicas based on request rate with hysteresis."""
245
- # Keep self.target_num_replicas unchange when autoscaling
246
- # is not enabled, i.e. self.target_qps_per_replica is None.
247
- # In this case, self.target_num_replicas will be min_replicas.
248
- if self.target_qps_per_replica is None:
249
- return
250
-
251
- # Convert to requests per second.
252
- target_num_replicas = self._cal_target_num_replicas_based_on_qps()
253
- old_target_num_replicas = self.target_num_replicas
254
-
255
- # Faster scale up when there is no replica.
256
- if self.target_num_replicas == 0:
257
- self.target_num_replicas = target_num_replicas
258
- elif target_num_replicas > self.target_num_replicas:
259
- self.upscale_counter += 1
260
- self.downscale_counter = 0
261
- if self.upscale_counter >= self.scale_up_consecutive_periods:
262
- self.upscale_counter = 0
263
- self.target_num_replicas = target_num_replicas
264
- elif target_num_replicas < self.target_num_replicas:
265
- self.downscale_counter += 1
266
- self.upscale_counter = 0
267
- if self.downscale_counter >= self.scale_down_consecutive_periods:
268
- self.downscale_counter = 0
269
- self.target_num_replicas = target_num_replicas
270
- else:
271
- self.upscale_counter = self.downscale_counter = 0
272
-
273
- num_requests_per_second = len(
274
- self.request_timestamps) / self.qps_window_size
275
- logger.info(
276
- f'Requests per second: {num_requests_per_second}. '
277
- f'Current target number of replicas: {old_target_num_replicas}. '
278
- f'Final target number of replicas: {self.target_num_replicas}. '
279
- f'Upscale counter: {self.upscale_counter}/'
280
- f'{self.scale_up_consecutive_periods}. '
281
- f'Downscale counter: {self.downscale_counter}/'
282
- f'{self.scale_down_consecutive_periods}')
283
-
284
194
  @classmethod
285
- def _select_nonterminal_replicas_to_scale_down(
286
- cls, num_limit: int,
287
- replica_infos: Iterable['replica_managers.ReplicaInfo']
288
- ) -> List[int]:
289
- status_order = serve_state.ReplicaStatus.scale_down_decision_order()
290
- replicas = list(replica_infos)
291
- assert all(info.status in status_order for info in replicas), (
292
- 'All replicas to scale down should be in provisioning or launched '
293
- 'status.', replicas)
294
- replicas = sorted(
295
- replicas,
296
- key=lambda info: (
297
- status_order.index(info.status),
298
- # Sort by version in ascending order, so we scale down the older
299
- # versions first.
300
- info.version,
301
- # Sort `info.replica_id` in descending order so that the
302
- # replicas in the same version starts to provisioning later are
303
- # scaled down first.
304
- -info.replica_id))
305
- assert len(replicas) >= num_limit, (
306
- 'Not enough replicas to scale down.', replicas, num_limit)
307
- return [info.replica_id for info in replicas][:num_limit]
195
+ def from_spec(cls, service_name: str,
196
+ spec: 'service_spec.SkyServiceSpec') -> 'Autoscaler':
197
+ # TODO(MaoZiming): use NAME to get the class.
198
+ if spec.use_ondemand_fallback:
199
+ return FallbackRequestRateAutoscaler(service_name, spec)
200
+ else:
201
+ return RequestRateAutoscaler(service_name, spec)
308
202
 
309
203
  def get_decision_interval(self) -> int:
310
- # Reduce autoscaler interval when target_num_replicas = 0.
311
- # This will happen when min_replicas = 0 and no traffic.
204
+ """Get the decision interval for the autoscaler.
205
+
206
+ We reduce the decision interval when the desired number of replicas is
207
+ 0, to make the service scale faster when the service is not running.
208
+ This will happen when min_replicas = 0 and no traffic.
209
+ """
312
210
  if self.target_num_replicas == 0:
313
211
  return constants.AUTOSCALER_NO_REPLICA_DECISION_INTERVAL_SECONDS
314
212
  else:
315
213
  return constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS
316
214
 
317
- def select_outdated_replicas_to_scale_down(
318
- self,
319
- replica_infos: List['replica_managers.ReplicaInfo']) -> List[int]:
215
+ def _select_outdated_replicas_to_scale_down(
216
+ self,
217
+ replica_infos: List['replica_managers.ReplicaInfo'],
218
+ active_versions: List[int],
219
+ ) -> List[int]:
320
220
  """Select outdated replicas to scale down."""
321
221
 
322
222
  if self.update_mode == serve_utils.UpdateMode.ROLLING:
@@ -350,19 +250,12 @@ class RequestRateAutoscaler(Autoscaler):
350
250
  # `_select_replicas_to_scale_down` will make sure we scale the
351
251
  # replicas in initializing statuses first before scaling down the
352
252
  # READY old replicas.
353
- return self._select_nonterminal_replicas_to_scale_down(
253
+ return _select_nonterminal_replicas_to_scale_down(
354
254
  max(0,
355
255
  len(old_nonterminal_replicas) - num_old_replicas_to_keep),
356
256
  old_nonterminal_replicas,
357
257
  )
358
258
 
359
- # Use the active versions set by replica manager to make sure we only
360
- # scale down the outdated replicas that are not used by the load
361
- # balancer.
362
- record = serve_state.get_service_from_name(self._service_name)
363
- assert record is not None, (f'No service record found for '
364
- f'{self._service_name}')
365
- active_versions = record['active_versions']
366
259
  if not active_versions:
367
260
  # active_versions can be empty when none of the replicas are ready
368
261
  # when the load balancer sync with the controller.
@@ -376,36 +269,35 @@ class RequestRateAutoscaler(Autoscaler):
376
269
  # number of ready new replicas is greater than or equal to the min
377
270
  # replicas instead of the target, to ensure the service being updated
378
271
  # to the latest version faster.
379
- all_replica_ids_to_scale_down: List[int] = []
380
- for info in replica_infos:
381
- if info.version < latest_version_with_min_replicas:
382
- all_replica_ids_to_scale_down.append(info.replica_id)
272
+ return [
273
+ info.replica_id
274
+ for info in replica_infos
275
+ if info.version < latest_version_with_min_replicas
276
+ ]
383
277
 
384
- return all_replica_ids_to_scale_down
385
-
386
- def evaluate_scaling(
278
+ def generate_scaling_decisions(
387
279
  self,
388
280
  replica_infos: List['replica_managers.ReplicaInfo'],
281
+ active_versions: List[int],
389
282
  ) -> List[AutoscalerDecision]:
390
- """Evaluate Autoscaling decisions based on replica information.
391
- If the number of launched replicas is less than the target,
392
- Trigger a scale up. Else, trigger a scale down.
283
+ """Generate Autoscaling decisions based on replica information.
284
+ If the number of launched replicas is less than the target, trigger a
285
+ scale up. Else, trigger a scale down. This function also handles the
286
+ version control of the replicas.
393
287
 
394
288
  For future compatibility, we return a list of AutoscalerDecision.
395
289
  Scale-up could include both spot and on-demand, each with a resource
396
290
  override dict. Active migration could require returning both SCALE_UP
397
291
  and SCALE_DOWN.
398
292
  """
399
- latest_replicas: List['replica_managers.ReplicaInfo'] = []
400
- latest_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = []
401
293
 
294
+ # Handle latest version unrecoverable failure first.
295
+ latest_replicas: List['replica_managers.ReplicaInfo'] = []
402
296
  for info in replica_infos:
403
297
  if info.version == self.latest_version:
404
298
  latest_replicas.append(info)
405
- if not info.is_terminal:
406
- latest_nonterminal_replicas.append(info)
407
- if info.is_ready:
408
- self.latest_version_ever_ready = self.latest_version
299
+ if info.is_ready:
300
+ self.latest_version_ever_ready = self.latest_version
409
301
  if self.latest_version_ever_ready < self.latest_version:
410
302
  for info in latest_replicas:
411
303
  if info.status_property.unrecoverable_failure():
@@ -415,55 +307,229 @@ class RequestRateAutoscaler(Autoscaler):
415
307
  # and restart.
416
308
  return []
417
309
 
418
- self._set_target_num_replica_with_hysteresis()
310
+ scaling_decisions = []
419
311
 
420
- scaling_options: List[AutoscalerDecision] = []
421
- all_replica_ids_to_scale_down: List[int] = []
312
+ # If rolling update is in progress, we scale down old replicas based on
313
+ # the number of ready new replicas and the traffic is directed to both
314
+ # old and new replicas. Or, for blue_green update, once there is
315
+ # min_replicas number of ready new replicas, we will direct all traffic
316
+ # to them, we can scale down all old replicas.
317
+ # TODO(MaoZiming,zhwu): corner case: We should make sure the fallback
318
+ # replicas are ready before scaling down the old replicas to avoid the
319
+ # situation that all the ready new replicas are preempted together.
320
+ scaling_decisions.extend(
321
+ _generate_scale_down_decisions(
322
+ self._select_outdated_replicas_to_scale_down(
323
+ replica_infos, active_versions)))
324
+
325
+ # If the latest version is ever ready, we can proceed to generate
326
+ # decisions from the implementations in subclasses.
327
+ scaling_decisions.extend(
328
+ self._generate_scaling_decisions(replica_infos))
329
+
330
+ if not scaling_decisions:
331
+ logger.info('No scaling needed.')
332
+
333
+ return scaling_decisions
334
+
335
+ def dump_dynamic_states(self) -> Dict[str, Any]:
336
+ """Dump dynamic states from autoscaler."""
337
+ states = {'latest_version_ever_ready': self.latest_version_ever_ready}
338
+ states.update(self._dump_dynamic_states())
339
+ return states
340
+
341
+ def load_dynamic_states(self, dynamic_states: Dict[str, Any]) -> None:
342
+ """Load dynamic states to autoscaler."""
343
+ self.latest_version_ever_ready = dynamic_states.pop(
344
+ 'latest_version_ever_ready', constants.INITIAL_VERSION)
345
+ self._load_dynamic_states(dynamic_states)
346
+
347
+
348
+ class _AutoscalerWithHysteresis(Autoscaler):
349
+ """_AutoscalerWithHysteresis: Autoscale with hysteresis.
350
+
351
+ This is an internal class for developing autoscalers with hysteresis. It
352
+ only scales when the number of replicas is above or below the target number
353
+ of replicas for a certain number of consecutive periods.
354
+ """
355
+
356
+ def _setup_thresholds(self, spec: 'service_spec.SkyServiceSpec') -> None:
357
+ upscale_delay_seconds = (
358
+ spec.upscale_delay_seconds if spec.upscale_delay_seconds is not None
359
+ else constants.AUTOSCALER_DEFAULT_UPSCALE_DELAY_SECONDS)
360
+ self.scale_up_threshold: int = int(
361
+ upscale_delay_seconds /
362
+ constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
363
+ downscale_delay_seconds = (
364
+ spec.downscale_delay_seconds
365
+ if spec.downscale_delay_seconds is not None else
366
+ constants.AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS)
367
+ self.scale_down_threshold: int = int(
368
+ downscale_delay_seconds /
369
+ constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS)
370
+
371
+ def __init__(self, service_name: str,
372
+ spec: 'service_spec.SkyServiceSpec') -> None:
373
+ """Initialize the hysteresis autoscaler.
374
+
375
+ Variables:
376
+ upscale_counter: Counter for upscale decisions of replicas.
377
+ downscale_counter: Counter for downscale decisions of replicas.
378
+ scale_up_threshold: The threshold to trigger a scale up.
379
+ scale_down_threshold: The threshold to trigger a scale down.
380
+ """
381
+ super().__init__(service_name, spec)
382
+ self.upscale_counter: int = 0
383
+ self.downscale_counter: int = 0
384
+ self._setup_thresholds(spec)
385
+
386
+ def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
387
+ update_mode: serve_utils.UpdateMode) -> None:
388
+ super().update_version(version, spec, update_mode)
389
+ # We directly set the target_num_replicas here instead of calling
390
+ # `_set_target_num_replicas_with_hysteresis` to have the replicas
391
+ # quickly scale after each update.
392
+ self.target_num_replicas = self._calculate_target_num_replicas()
393
+ # Cleanup hysteresis counters.
394
+ self.upscale_counter = 0
395
+ self.downscale_counter = 0
396
+ self._setup_thresholds(spec)
422
397
 
423
- # Case 1. If rolling update is in progress, we scale down old replicas
424
- # based on the number of ready new replicas and the traffic is directed
425
- # to both old and new replicas.
426
- # Or, for blue_green update, once there is min_replicas number of ready
427
- # new replicas, we will direct all traffic to them, we can scale down
428
- # all old replicas.
429
- all_replica_ids_to_scale_down.extend(
430
- self.select_outdated_replicas_to_scale_down(replica_infos))
398
+ def _set_target_num_replicas_with_hysteresis(self) -> None:
399
+ """Set target_num_replicas based on request rate with hysteresis."""
400
+ target_num_replicas = self._calculate_target_num_replicas()
401
+ old_target_num_replicas = self.target_num_replicas
431
402
 
432
- # Case 2. when latest_nonterminal_replicas is less
403
+ # Faster scale up when there is no replica.
404
+ if self.target_num_replicas == 0:
405
+ self.target_num_replicas = target_num_replicas
406
+ elif target_num_replicas > self.target_num_replicas:
407
+ self.upscale_counter += 1
408
+ self.downscale_counter = 0
409
+ if self.upscale_counter >= self.scale_up_threshold:
410
+ self.upscale_counter = 0
411
+ self.target_num_replicas = target_num_replicas
412
+ elif target_num_replicas < self.target_num_replicas:
413
+ self.downscale_counter += 1
414
+ self.upscale_counter = 0
415
+ if self.downscale_counter >= self.scale_down_threshold:
416
+ self.downscale_counter = 0
417
+ self.target_num_replicas = target_num_replicas
418
+ else:
419
+ self.upscale_counter = self.downscale_counter = 0
420
+
421
+ logger.info(
422
+ f'Old target number of replicas: {old_target_num_replicas}. '
423
+ f'Current target number of replicas: {target_num_replicas}. '
424
+ f'Final target number of replicas: {self.target_num_replicas}. '
425
+ f'Upscale counter: {self.upscale_counter}/'
426
+ f'{self.scale_up_threshold}. '
427
+ f'Downscale counter: {self.downscale_counter}/'
428
+ f'{self.scale_down_threshold}. ')
429
+
430
+
431
+ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
432
+ """RequestRateAutoscaler: Autoscale according to request rate.
433
+
434
+ Scales when the number of requests per replica in the given interval
435
+ is above or below the target qps per replica. The instance can be
436
+ either spot or on-demand, but not both.
437
+ """
438
+
439
+ def __init__(self, service_name: str,
440
+ spec: 'service_spec.SkyServiceSpec') -> None:
441
+ """Initialize the request rate autoscaler.
442
+
443
+ Variables:
444
+ target_qps_per_replica: Target qps per replica for autoscaling.
445
+ qps_window_size: Window size for qps calculating.
446
+ request_timestamps: All request timestamps within the window.
447
+ """
448
+ super().__init__(service_name, spec)
449
+ self.target_qps_per_replica: Optional[
450
+ float] = spec.target_qps_per_replica
451
+ self.qps_window_size: int = constants.AUTOSCALER_QPS_WINDOW_SIZE_SECONDS
452
+ self.request_timestamps: List[float] = []
453
+
454
+ def _calculate_target_num_replicas(self) -> int:
455
+ if self.target_qps_per_replica is None:
456
+ return self.min_replicas
457
+ num_requests_per_second = len(
458
+ self.request_timestamps) / self.qps_window_size
459
+ target_num_replicas = math.ceil(num_requests_per_second /
460
+ self.target_qps_per_replica)
461
+ logger.info(f'Requests per second: {num_requests_per_second}. '
462
+ f'Target number of replicas: {target_num_replicas}.')
463
+ return self._clip_target_num_replicas(target_num_replicas)
464
+
465
+ def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
466
+ update_mode: serve_utils.UpdateMode) -> None:
467
+ super().update_version(version, spec, update_mode)
468
+ self.target_qps_per_replica = spec.target_qps_per_replica
469
+
470
+ def collect_request_information(
471
+ self, request_aggregator_info: Dict[str, Any]) -> None:
472
+ """Collect request information from aggregator for autoscaling.
473
+
474
+ request_aggregator_info should be a dict with the following format:
475
+
476
+ {
477
+ 'timestamps': [timestamp1 (float), timestamp2 (float), ...]
478
+ }
479
+ """
480
+ self.request_timestamps.extend(
481
+ request_aggregator_info.get('timestamps', []))
482
+ current_time = time.time()
483
+ index = bisect.bisect_left(self.request_timestamps,
484
+ current_time - self.qps_window_size)
485
+ self.request_timestamps = self.request_timestamps[index:]
486
+ logger.info(f'Num of requests in the last {self.qps_window_size} '
487
+ f'seconds: {len(self.request_timestamps)}')
488
+
489
+ def _generate_scaling_decisions(
490
+ self,
491
+ replica_infos: List['replica_managers.ReplicaInfo'],
492
+ ) -> List[AutoscalerDecision]:
493
+ """Generate Autoscaling decisions based on request rate."""
494
+
495
+ self._set_target_num_replicas_with_hysteresis()
496
+
497
+ latest_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = []
498
+
499
+ for info in replica_infos:
500
+ if info.version == self.latest_version:
501
+ if not info.is_terminal:
502
+ latest_nonterminal_replicas.append(info)
503
+
504
+ scaling_decisions: List[AutoscalerDecision] = []
505
+
506
+ # Case 1. when latest_nonterminal_replicas is less
433
507
  # than self.target_num_replicas, we always scale up new replicas.
434
508
  if len(latest_nonterminal_replicas) < self.target_num_replicas:
435
509
  num_replicas_to_scale_up = (self.target_num_replicas -
436
510
  len(latest_nonterminal_replicas))
437
511
  logger.info('Number of replicas to scale up: '
438
512
  f'{num_replicas_to_scale_up}')
439
- for _ in range(num_replicas_to_scale_up):
440
- scaling_options.append(
441
- AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP,
442
- target=None))
513
+ scaling_decisions.extend(
514
+ _generate_scale_up_decisions(num_replicas_to_scale_up, None))
443
515
 
444
- # Case 3: when latest_nonterminal_replicas is more
516
+ # Case 2: when latest_nonterminal_replicas is more
445
517
  # than self.target_num_replicas, we scale down new replicas.
518
+ replicas_to_scale_down = []
446
519
  if len(latest_nonterminal_replicas) > self.target_num_replicas:
447
520
  num_replicas_to_scale_down = (len(latest_nonterminal_replicas) -
448
521
  self.target_num_replicas)
449
522
  replicas_to_scale_down = (
450
- RequestRateAutoscaler.
451
523
  _select_nonterminal_replicas_to_scale_down(
452
- num_limit=num_replicas_to_scale_down,
453
- replica_infos=latest_nonterminal_replicas))
524
+ num_replicas_to_scale_down, latest_nonterminal_replicas))
454
525
  logger.info(
455
526
  'Number of replicas to scale down: '
456
527
  f'{num_replicas_to_scale_down} {replicas_to_scale_down}')
457
- all_replica_ids_to_scale_down.extend(replicas_to_scale_down)
458
528
 
459
- for replica_id in all_replica_ids_to_scale_down:
460
- scaling_options.append(
461
- AutoscalerDecision(AutoscalerDecisionOperator.SCALE_DOWN,
462
- target=replica_id))
529
+ scaling_decisions.extend(
530
+ _generate_scale_down_decisions(replicas_to_scale_down))
463
531
 
464
- if not scaling_options:
465
- logger.info('No scaling needed.')
466
- return scaling_options
532
+ return scaling_decisions
467
533
 
468
534
  def _dump_dynamic_states(self) -> Dict[str, Any]:
469
535
  return {
@@ -485,16 +551,19 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
485
551
 
486
552
  When spec.base_ondemand_fallback_replicas is set, we make sure
487
553
  there are at least spec.base_ondemand_fallback_replicas on-demands
488
- to be always there to provide basic gurantee for the availability.
554
+ to be always there to provide basic guarantee for the availability.
489
555
 
490
556
  When spec.dynamic_ondemand_fallback is set, on-demand instances
491
557
  will be scheduled to provision for any preempted spot instance, i.e.,
492
558
  on-demand instance are used as dynamic fallback of spot.
493
559
  """
494
560
 
495
- def __init__(self, service_name: str,
496
- spec: 'service_spec.SkyServiceSpec') -> None:
497
- super().__init__(service_name, spec)
561
+ # job_recovery field is checked earlier in core
562
+ SPOT_OVERRIDE = {'use_spot': True}
563
+ ONDEMAND_OVERRIDE = {'use_spot': False}
564
+
565
+ def _setup_fallback_options(self,
566
+ spec: 'service_spec.SkyServiceSpec') -> None:
498
567
  self.base_ondemand_fallback_replicas: int = (
499
568
  spec.base_ondemand_fallback_replicas
500
569
  if spec.base_ondemand_fallback_replicas is not None else 0)
@@ -505,37 +574,42 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
505
574
  spec.dynamic_ondemand_fallback
506
575
  if spec.dynamic_ondemand_fallback is not None else False)
507
576
 
577
+ def __init__(self, service_name: str,
578
+ spec: 'service_spec.SkyServiceSpec') -> None:
579
+ """Initialize the fallback request rate autoscaler.
580
+
581
+ Variables:
582
+ base_ondemand_fallback_replicas: Minimum number of on-demand
583
+ replicas to be always there.
584
+ dynamic_ondemand_fallback: Whether to dynamically provision
585
+ on-demand instances for preempted spot instances.
586
+ """
587
+ super().__init__(service_name, spec)
588
+ self._setup_fallback_options(spec)
589
+
508
590
  def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
509
591
  update_mode: serve_utils.UpdateMode) -> None:
510
592
  super().update_version(version, spec, update_mode=update_mode)
511
- self.base_ondemand_fallback_replicas = (
512
- spec.base_ondemand_fallback_replicas
513
- if spec.base_ondemand_fallback_replicas is not None else 0)
514
- # Assert: Either dynamic_ondemand_fallback is set
515
- # or base_ondemand_fallback_replicas is greater than 0.
516
- assert spec.use_ondemand_fallback
517
- self.dynamic_ondemand_fallback = (spec.dynamic_ondemand_fallback
518
- if spec.dynamic_ondemand_fallback
519
- is not None else False)
593
+ self._setup_fallback_options(spec)
520
594
 
521
- # job_recovery field is checked earlier in core
522
- def _get_spot_resources_override_dict(self) -> Dict[str, Any]:
523
- return {'use_spot': True}
524
-
525
- def _get_ondemand_resources_override_dict(self) -> Dict[str, Any]:
526
- return {'use_spot': False}
527
-
528
- def evaluate_scaling(
595
+ def _generate_scaling_decisions(
529
596
  self,
530
597
  replica_infos: List['replica_managers.ReplicaInfo'],
531
598
  ) -> List[AutoscalerDecision]:
599
+ """Generate Autoscaling decisions based on request rate, with on-demand
600
+ fallback.
601
+
602
+ The autoscaler will make sure there are at least
603
+ `base_ondemand_fallback_replicas` on-demand replicas to be always there,
604
+ so the service can provide basic guarantee for the availability.
605
+ """
606
+
607
+ self._set_target_num_replicas_with_hysteresis()
532
608
 
533
609
  latest_nonterminal_replicas = list(
534
610
  filter(
535
611
  lambda info: not info.is_terminal and info.version == self.
536
612
  latest_version, replica_infos))
537
-
538
- self._set_target_num_replica_with_hysteresis()
539
613
  num_nonterminal_spot, num_ready_spot = 0, 0
540
614
  num_nonterminal_ondemand, num_ready_ondemand = 0, 0
541
615
 
@@ -550,22 +624,14 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
550
624
  num_nonterminal_ondemand += 1
551
625
 
552
626
  logger.info(
553
- 'Number of alive spot instances: '
554
- f'{num_nonterminal_spot}, '
627
+ f'Number of alive spot instances: {num_nonterminal_spot}, '
555
628
  f'Number of ready spot instances: {num_ready_spot}, '
556
- 'Number of alive on-demand instances: '
557
- f' {num_nonterminal_ondemand}, '
629
+ f'Number of alive on-demand instances: {num_nonterminal_ondemand}, '
558
630
  f'Number of ready on-demand instances: {num_ready_ondemand}')
559
631
 
560
- scaling_options: List[AutoscalerDecision] = []
632
+ scaling_decisions: List[AutoscalerDecision] = []
561
633
  all_replica_ids_to_scale_down: List[int] = []
562
634
 
563
- # TODO(MaoZiming,zhwu): coner case: We should make sure the fallback
564
- # replicas are ready before scaling down the old replicas to avoid the
565
- # situation that all the ready new replicas are preempted together.
566
- all_replica_ids_to_scale_down.extend(
567
- self.select_outdated_replicas_to_scale_down(replica_infos))
568
-
569
635
  # Decide how many spot instances to launch.
570
636
  num_spot_to_provision = (self.target_num_replicas -
571
637
  self.base_ondemand_fallback_replicas)
@@ -575,18 +641,15 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
575
641
  num_nonterminal_spot)
576
642
  logger.info('Number of spot instances to scale up: '
577
643
  f'{num_spot_to_scale_up}')
578
- for _ in range(num_spot_to_scale_up):
579
- scaling_options.append(
580
- AutoscalerDecision(
581
- AutoscalerDecisionOperator.SCALE_UP,
582
- target=self._get_spot_resources_override_dict()))
644
+ scaling_decisions.extend(
645
+ _generate_scale_up_decisions(num_spot_to_scale_up,
646
+ self.SPOT_OVERRIDE))
583
647
  elif num_nonterminal_spot > num_spot_to_provision:
584
648
  # Too many spot instances, scale down.
585
649
  # Get the replica to scale down with _select_replicas_to_scale_down
586
650
  num_spot_to_scale_down = (num_nonterminal_spot -
587
651
  num_spot_to_provision)
588
652
  replicas_to_scale_down = (
589
- RequestRateAutoscaler.
590
653
  _select_nonterminal_replicas_to_scale_down(
591
654
  num_spot_to_scale_down,
592
655
  filter(lambda info: info.is_spot,
@@ -610,16 +673,13 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
610
673
  num_nonterminal_ondemand)
611
674
  logger.info('Number of on-demand instances to scale up: '
612
675
  f'{num_ondemand_to_scale_up}')
613
- for _ in range(num_ondemand_to_scale_up):
614
- scaling_options.append(
615
- AutoscalerDecision(
616
- AutoscalerDecisionOperator.SCALE_UP,
617
- target=self._get_ondemand_resources_override_dict()))
676
+ scaling_decisions.extend(
677
+ _generate_scale_up_decisions(num_ondemand_to_scale_up,
678
+ self.ONDEMAND_OVERRIDE))
618
679
  else:
619
680
  num_ondemand_to_scale_down = (num_nonterminal_ondemand -
620
681
  num_ondemand_to_provision)
621
682
  replicas_to_scale_down = (
622
- RequestRateAutoscaler.
623
683
  _select_nonterminal_replicas_to_scale_down(
624
684
  num_ondemand_to_scale_down,
625
685
  filter(lambda info: not info.is_spot,
@@ -630,9 +690,7 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
630
690
 
631
691
  all_replica_ids_to_scale_down.extend(replicas_to_scale_down)
632
692
 
633
- for replica_id in all_replica_ids_to_scale_down:
634
- scaling_options.append(
635
- AutoscalerDecision(AutoscalerDecisionOperator.SCALE_DOWN,
636
- target=replica_id))
693
+ scaling_decisions.extend(
694
+ _generate_scale_down_decisions(all_replica_ids_to_scale_down))
637
695
 
638
- return scaling_options
696
+ return scaling_decisions