skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py CHANGED
@@ -1,7 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
-import collections
 import dataclasses
-import enum
 import functools
 import multiprocessing
 from multiprocessing import pool as mp_pool
@@ -13,16 +11,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import psutil
+import filelock
 import requests
 
-import sky
 from sky import backends
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
@@ -41,7 +39,6 @@ from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -51,10 +48,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
-# Since sky.launch is very resource demanding, we limit the number of
-# concurrent sky.launch process to avoid overloading the machine.
-_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
-
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +74,7 @@ def launch_cluster(replica_id: int,
     try:
         config = common_utils.read_yaml(
             os.path.expanduser(service_task_yaml_path))
-        task = sky.Task.from_yaml_config(config)
+        task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
             resources = task.resources
             overrided_resources = [
@@ -177,7 +170,7 @@ def terminate_cluster(cluster_name: str,
 
 def _get_resources_ports(service_task_yaml_path: str) -> str:
     """Get the resources ports used by the task."""
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -195,7 +188,7 @@ def _should_use_spot(service_task_yaml_path: str,
     if use_spot_override is not None:
         assert isinstance(use_spot_override, bool)
         return use_spot_override
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     spot_use_resources = [
         resources for resources in task.resources if resources.use_spot
     ]
@@ -204,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
     return len(spot_use_resources) == len(task.resources)
 
 
+# Every function that calls serve_state.add_or_update_replica should acquire
+# this lock. It is to prevent race condition when the replica status is updated
+# by multiple threads at the same time. The modification of replica info is
+# 2 database calls: read the whole replica info object, unpickle it, and modify
+# corresponding fields. Then it is write back to the database. We need to ensure
+# the read-modify-write operation is atomic.
 def with_lock(func):
 
     @functools.wraps(func)
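
The body of `with_lock` is elided by the diff context above. Given the new `filelock` import and the `filelock.FileLock(controller_utils.get_resources_lock_path())` call that appears later in this file, the guard described in the new comment amounts to a file-lock decorator; a minimal sketch (the decorator body, import path, and lock scope here are assumptions, not the shipped implementation):

    import functools

    import filelock

    from sky.utils import controller_utils


    def with_lock(func):
        """Serialize read-modify-write access to the replica table."""

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Hold a cross-process file lock so that reading, unpickling,
            # mutating, and re-writing a ReplicaInfo row cannot interleave.
            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                return func(*args, **kwargs)

        return wrapper
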
@@ -214,22 +213,6 @@ def with_lock(func):
         return wrapper
 
 
-class ProcessStatus(enum.Enum):
-    """Process status."""
-
-    # The process is running
-    RUNNING = 'RUNNING'
-
-    # The process is finished and succeeded
-    SUCCEEDED = 'SUCCEEDED'
-
-    # The process is interrupted
-    INTERRUPTED = 'INTERRUPTED'
-
-    # The process failed
-    FAILED = 'FAILED'
-
-
 @dataclasses.dataclass
 class ReplicaStatusProperty:
     """Some properties that determine replica status.
@@ -241,15 +224,16 @@ class ReplicaStatusProperty:
         first_ready_time: The first time the service is ready.
         sky_down_status: Process status of sky.down.
     """
-    # None means sky.launch is not called yet.
-    sky_launch_status: Optional[ProcessStatus] = None
+    # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+    sky_launch_status: common_utils.ProcessStatus = (
+        common_utils.ProcessStatus.SCHEDULED)
     user_app_failed: bool = False
     service_ready_now: bool = False
    # None means readiness probe is not succeeded yet;
    # -1 means the initial delay seconds is exceeded.
     first_ready_time: Optional[float] = None
     # None means sky.down is not called yet.
-    sky_down_status: Optional[ProcessStatus] = None
+    sky_down_status: Optional[common_utils.ProcessStatus] = None
     # Whether the termination is caused by autoscaler's decision
     is_scale_down: bool = False
     # The replica's spot instance was preempted.
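
For reference, the module-local `ProcessStatus` enum removed above now lives in `common_utils` (the file list shows `sky/utils/common_utils.py +20 -0`). Based on the removed members plus the new `SCHEDULED` state referenced throughout this diff, the shared enum presumably looks like this sketch (the docstring and exact member order are assumptions):

    import enum


    class ProcessStatus(enum.Enum):
        """Status of a sky.launch / sky.down subprocess."""
        # The process is created but not started yet (new in this release).
        SCHEDULED = 'SCHEDULED'
        # The process is running.
        RUNNING = 'RUNNING'
        # The process is finished and succeeded.
        SUCCEEDED = 'SUCCEEDED'
        # The process is interrupted.
        INTERRUPTED = 'INTERRUPTED'
        # The process failed.
        FAILED = 'FAILED'
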
@@ -304,7 +288,7 @@ class ReplicaStatusProperty:
         (1) Job status;
         (2) Readiness probe.
         """
-        if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+        if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
             return False
         if self.sky_down_status is not None:
             return False
@@ -318,37 +302,43 @@ class ReplicaStatusProperty:
 
     def to_replica_status(self) -> serve_state.ReplicaStatus:
         """Convert status property to human-readable replica status."""
-        if self.sky_launch_status is None:
+        # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+        # we use None to represent sky.launch is not called yet.
+        if (self.sky_launch_status is None or
+                self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
             # Pending to launch
             return serve_state.ReplicaStatus.PENDING
-        if self.sky_launch_status == ProcessStatus.RUNNING:
-            if self.sky_down_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                 return serve_state.ReplicaStatus.FAILED_CLEANUP
-            if self.sky_down_status == ProcessStatus.SUCCEEDED:
+            if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                 # This indicate it is a scale_down with correct teardown.
                 # Should have been cleaned from the replica table.
                 return serve_state.ReplicaStatus.UNKNOWN
             # Still launching
             return serve_state.ReplicaStatus.PROVISIONING
-        if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+        if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
             # sky.down is running and a scale down interrupted sky.launch
             return serve_state.ReplicaStatus.SHUTTING_DOWN
         if self.sky_down_status is not None:
             if self.preempted:
                 # Replica (spot) is preempted
                 return serve_state.ReplicaStatus.PREEMPTED
-            if self.sky_down_status == ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                # sky.down is scheduled to run, but not started yet.
+                return serve_state.ReplicaStatus.SHUTTING_DOWN
+            if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                 # sky.down is running
                 return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+            if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                 return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_down_status == ProcessStatus.FAILED:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                 # sky.down failed
                 return serve_state.ReplicaStatus.FAILED_CLEANUP
             if self.user_app_failed:
                 # Failed on user setup/run
                 return serve_state.ReplicaStatus.FAILED
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
             # sky.launch failed
             return serve_state.ReplicaStatus.FAILED_PROVISION
         if self.first_ready_time is None:
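
To make the new mapping concrete, a small worked example (a sketch; it assumes the defaults shown above, the enum sketch earlier, and these import paths):

    from sky.serve import serve_state
    from sky.serve.replica_managers import ReplicaStatusProperty
    from sky.utils import common_utils

    # A freshly created replica: launch is SCHEDULED, down not called yet.
    prop = ReplicaStatusProperty()
    assert prop.to_replica_status() == serve_state.ReplicaStatus.PENDING

    # A replica whose down process is queued but has not started: the new
    # SCHEDULED down state reads as SHUTTING_DOWN, same as RUNNING.
    prop.sky_launch_status = common_utils.ProcessStatus.SUCCEEDED
    prop.sky_down_status = common_utils.ProcessStatus.SCHEDULED
    assert prop.to_replica_status() == serve_state.ReplicaStatus.SHUTTING_DOWN
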
@@ -364,7 +354,7 @@ class ReplicaStatusProperty:
             # This indicate it is a scale_down with correct teardown.
             # Should have been cleaned from the replica table.
             return serve_state.ReplicaStatus.UNKNOWN
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
             # sky.launch failed
             # The down process has not been started if it reaches here,
             # due to the `if self.sky_down_status is not None`` check above.
@@ -688,7 +678,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  service_task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self.service_task_yaml_path = service_task_yaml_path
-        task = sky.Task.from_yaml(service_task_yaml_path)
+        task = task_lib.Task.from_yaml(service_task_yaml_path)
         self._spot_placer: Optional[spot_placer.SpotPlacer] = (
             spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -708,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         self._recover_replica_operations()
 
+    @with_lock
     def _recover_replica_operations(self):
         """Let's see are there something to do for ReplicaManager in a
         recovery run"""
@@ -748,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
     # Replica management functions #
     ################################
 
-    # Adding lock here to make sure spot placer's current locations are
-    # consistent with the replicas' status.
-    @with_lock
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _launch_replica(
         self,
         replica_id: int,
@@ -806,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
         # to avoid too many sky.launch running at the same time.
         self._launch_process_pool[replica_id] = p
 
+    @with_lock
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         self._launch_replica(self._next_replica_id, resources_override)
         self._next_replica_id += 1
 
+    def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+        if exitcode != 0:
+            logger.error(f'Down process for replica {info.replica_id} '
+                         f'exited abnormally with code {exitcode}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+        serve_state.add_or_update_replica(self._service_name,
                                           info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _terminate_replica(self,
                            replica_id: int,
                            sync_down_logs: bool,
@@ -828,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
             assert info is not None
-            info.status_property.sky_launch_status = ProcessStatus.INTERRUPTED
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
             serve_state.add_or_update_replica(self._service_name, replica_id,
                                               info)
             launch_process = self._launch_process_pool[replica_id]
@@ -872,8 +913,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             assert isinstance(handle, backends.CloudVmRayResourceHandle)
             replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                                 'replica_jobs')
-            job_log_file_name = (controller_utils.download_and_stream_job_log(
-                backend, handle, replica_job_logs_dir))
+            job_ids = ['1'] if self._is_pool else None
+            job_log_file_name = controller_utils.download_and_stream_job_log(
+                backend, handle, replica_job_logs_dir, job_ids)
             if job_log_file_name is not None:
                 logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
                 with open(log_file_name, 'a',
@@ -899,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         logger.info(f'preempted: {info.status_property.preempted}, '
                     f'replica_id: {replica_id}')
+        info.status_property.is_scale_down = is_scale_down
+        info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down process to save
+        # controller resources.
+        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+            self._handle_sky_down_finish(info, exitcode=0)
+            return
+
+        # Otherwise, start the process to terminate the cluster.
         p = multiprocessing.Process(
             target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                      log_file_name, 'a').run,
            args=(info.cluster_name, replica_drain_delay_seconds),
         )
-        info.status_property.sky_down_status = ProcessStatus.RUNNING
-        info.status_property.is_scale_down = is_scale_down
-        info.status_property.purged = purge
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
         self._down_process_pool[replica_id] = p
 
+    @with_lock
     def scale_down(self, replica_id: int, purge: bool = False) -> None:
         self._terminate_replica(
             replica_id,
@@ -919,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             is_scale_down=True,
             purge=purge)
 
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _handle_preemption(self, info: ReplicaInfo) -> bool:
         """Handle preemption of the replica if any error happened.
 
@@ -981,7 +1037,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         # To avoid `dictionary changed size during iteration` error.
         launch_process_pool_snapshot = list(self._launch_process_pool.items())
         for replica_id, p in launch_process_pool_snapshot:
-            if not p.is_alive():
+            if p.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
@@ -989,11 +1047,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if (serve_state.total_number_provisioning_replicas() <
-                            _MAX_NUM_LAUNCH):
+                    if controller_utils.can_provision():
                         p.start()
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.RUNNING)
+                            common_utils.ProcessStatus.RUNNING)
                 else:
                     # sky.launch finished
                     # TODO(tian): Try-catch in process, and have an enum return
@@ -1010,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                             f'exited abnormally with code {p.exitcode}.'
                             ' Terminating...')
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.FAILED)
+                            common_utils.ProcessStatus.FAILED)
                         error_in_sky_launch = True
                     else:
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.SUCCEEDED)
+                            common_utils.ProcessStatus.SUCCEEDED)
                         schedule_next_jobs = True
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
@@ -1036,69 +1093,36 @@ class SkyPilotReplicaManager(ReplicaManager):
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
                 if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs(
-                        pool=self._service_name)
+                    jobs_scheduler.maybe_schedule_next_jobs()
                 if error_in_sky_launch:
                     # Teardown after update replica info since
                     # _terminate_replica will update the replica info too.
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
+            # Try schedule next job after acquiring the lock.
+            jobs_scheduler.maybe_schedule_next_jobs()
 
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
-            if not p.is_alive():
-                logger.info(
-                    f'Terminate process for replica {replica_id} finished.')
-                del self._down_process_pool[replica_id]
-                info = serve_state.get_replica_info_from_id(
-                    self._service_name, replica_id)
-                assert info is not None, replica_id
-                if p.exitcode != 0:
-                    logger.error(f'Down process for replica {replica_id} '
-                                 f'exited abnormally with code {p.exitcode}.')
-                    info.status_property.sky_down_status = (
-                        ProcessStatus.FAILED)
-                else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                     info.status_property.sky_down_status = (
-                        ProcessStatus.SUCCEEDED)
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-                if removal_reason is not None:
-                    serve_state.remove_replica(self._service_name, replica_id)
-                    logger.info(f'Replica {replica_id} removed from the '
-                                f'replica table {removal_reason}.')
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)
 
         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1394,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             old_config_any_of = old_config.get('resources',
                                                {}).pop('any_of', [])
 
-            def normalize_dict_list(lst):
-                return collections.Counter(
-                    frozenset(d.items()) for d in lst)
-
-            if (normalize_dict_list(old_config_any_of) !=
-                    normalize_dict_list(new_config_any_of)):
+            if (resources_utils.normalize_any_of_resources_config(
+                    old_config_any_of) != resources_utils.
+                    normalize_any_of_resources_config(new_config_any_of)):
                 logger.info('Replica config changed (any_of), skipping. '
                             f'old: {old_config_any_of}, '
                             f'new: {new_config_any_of}')
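
The inlined `normalize_dict_list` helper removed above moves to `sky/utils/resources_utils.py` (+25 -1 in the file list) as `normalize_any_of_resources_config`. Judging from the removed code, it performs an order-insensitive, multiplicity-aware comparison of `any_of` resource dicts, presumably along these lines (the signature and any handling of non-hashable values are assumptions):

    import collections
    from typing import Any, Dict, List


    def normalize_any_of_resources_config(
            configs: List[Dict[str, Any]]) -> 'collections.Counter':
        """Normalize an `any_of` resources list for comparison.

        Each dict becomes a frozenset of its items, and a Counter tracks
        duplicates, so two lists compare equal iff they contain the same
        dicts with the same multiplicities, regardless of order.
        """
        return collections.Counter(frozenset(d.items()) for d in configs)
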
sky/serve/serve_state.py CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
     return records
 
 
+@init_db
+def get_num_services() -> int:
+    """Get the number of services."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        return session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(services_table)).fetchone()[0]
+
+
 @init_db
 def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     """Get all existing service records."""
@@ -660,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count
 
 
+@init_db
+def total_number_terminating_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.RUNNING):
+                terminating_count += 1
+        return terminating_count
+
+
+@init_db
+def total_number_scheduled_to_terminate_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                terminating_count += 1
+        return terminating_count
+
+
 def get_replicas_at_status(
     service_name: str,
     status: ReplicaStatus,
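
Elsewhere in this release, `sky/utils/controller_utils.py` gains 86 lines, including the `can_provision()` / `can_terminate()` gates called from replica_managers.py above. Their implementation is not shown in this diff; given that they replace the removed `_MAX_NUM_LAUNCH = psutil.cpu_count() * 2` cap and that serve_state now exposes the counting helpers above, a plausible sketch follows (the module layout, limits, and exact formulas are all assumptions):

    import psutil

    from sky.serve import serve_state

    # Assumed cap, mirroring the removed module-level constant in
    # replica_managers.py; the real limit may differ.
    _MAX_CONCURRENT_LAUNCHES = (psutil.cpu_count() or 1) * 2


    def can_provision() -> bool:
        """Whether another sky.launch process may be started now."""
        return (serve_state.total_number_provisioning_replicas() <
                _MAX_CONCURRENT_LAUNCHES)


    def can_terminate() -> bool:
        """Whether another sky.down process may be started now."""
        # Terminations are throttled analogously, using the new counter for
        # replicas whose down process is already running.
        return (serve_state.total_number_terminating_replicas() <
                _MAX_CONCURRENT_LAUNCHES)
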