skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (136)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend.py +5 -3
  4. sky/backends/backend_utils.py +22 -7
  5. sky/backends/cloud_vm_ray_backend.py +50 -18
  6. sky/backends/local_docker_backend.py +8 -3
  7. sky/client/cli/command.py +25 -10
  8. sky/client/sdk.py +51 -1
  9. sky/clouds/kubernetes.py +2 -6
  10. sky/clouds/nebius.py +3 -1
  11. sky/core.py +9 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  19. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  20. sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  23. sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  27. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
  28. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  29. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  32. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
  47. sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
  48. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  49. sky/dashboard/out/clusters/[cluster].html +1 -1
  50. sky/dashboard/out/clusters.html +1 -1
  51. sky/dashboard/out/config.html +1 -1
  52. sky/dashboard/out/index.html +1 -1
  53. sky/dashboard/out/infra/[context].html +1 -1
  54. sky/dashboard/out/infra.html +1 -1
  55. sky/dashboard/out/jobs/[job].html +1 -1
  56. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/volumes.html +1 -1
  60. sky/dashboard/out/workspace/new.html +1 -1
  61. sky/dashboard/out/workspaces/[name].html +1 -1
  62. sky/dashboard/out/workspaces.html +1 -1
  63. sky/data/storage_utils.py +29 -9
  64. sky/execution.py +13 -10
  65. sky/global_user_state.py +131 -2
  66. sky/jobs/constants.py +1 -1
  67. sky/jobs/recovery_strategy.py +0 -3
  68. sky/jobs/scheduler.py +14 -21
  69. sky/jobs/server/core.py +64 -10
  70. sky/jobs/server/utils.py +1 -1
  71. sky/jobs/state.py +1 -3
  72. sky/jobs/utils.py +159 -11
  73. sky/provision/aws/config.py +19 -3
  74. sky/provision/aws/instance.py +2 -1
  75. sky/provision/kubernetes/instance.py +2 -1
  76. sky/provision/nebius/utils.py +101 -86
  77. sky/provision/provisioner.py +13 -8
  78. sky/resources.py +5 -5
  79. sky/schemas/api/responses.py +50 -1
  80. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  81. sky/serve/replica_managers.py +123 -101
  82. sky/serve/serve_state.py +32 -0
  83. sky/serve/serve_utils.py +37 -16
  84. sky/serve/service.py +51 -17
  85. sky/server/common.py +2 -3
  86. sky/server/constants.py +1 -1
  87. sky/server/requests/payloads.py +6 -0
  88. sky/server/requests/serializers/decoders.py +20 -5
  89. sky/server/requests/serializers/encoders.py +21 -8
  90. sky/server/server.py +57 -11
  91. sky/templates/kubernetes-ray.yml.j2 +1 -0
  92. sky/utils/cli_utils/status_utils.py +2 -1
  93. sky/utils/common_utils.py +20 -0
  94. sky/utils/controller_utils.py +17 -4
  95. sky/utils/db/migration_utils.py +1 -1
  96. sky/utils/log_utils.py +14 -5
  97. sky/utils/resources_utils.py +25 -1
  98. sky/utils/schemas.py +3 -0
  99. sky/utils/ux_utils.py +36 -5
  100. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
  102. sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  104. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  105. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  106. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  109. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  110. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  111. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  114. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  117. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  119. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
  121. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
  131. /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
  132. /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
  133. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/schemas/api/responses.py CHANGED
@@ -1,11 +1,12 @@
  """Responses for the API server."""

- from typing import Optional
+ from typing import Any, Dict, List, Optional

  import pydantic

  from sky import models
  from sky.server import common
+ from sky.utils import status_lib


  class ResponseBaseModel(pydantic.BaseModel):
@@ -46,6 +47,9 @@ class ResponseBaseModel(pydantic.BaseModel):
      def __setitem__(self, key, value):
          setattr(self, key, value)

+     def get(self, key, default=None):
+         return getattr(self, key, default)
+
      def __contains__(self, key):
          return hasattr(self, key)

@@ -58,6 +62,9 @@ class ResponseBaseModel(pydantic.BaseModel):
      def items(self):
          return self.model_dump().items()

+     def __repr__(self):
+         return self.__dict__.__repr__()
+

  class APIHealthResponse(ResponseBaseModel):
      """Response for the API health endpoint."""
@@ -68,3 +75,45 @@ class APIHealthResponse(ResponseBaseModel):
      commit: str = ''
      basic_auth_enabled: bool = False
      user: Optional[models.User] = None
+
+
+ class StatusResponse(ResponseBaseModel):
+     """Response for the status endpoint."""
+     name: str
+     launched_at: int
+     # pydantic cannot generate the pydantic-core schema for
+     # backends.ResourceHandle, so we use Any here.
+     # This is an internally facing field anyway, so it's less
+     # of a problem that it's not typed.
+     handle: Any
+     last_use: str
+     status: status_lib.ClusterStatus
+     autostop: int
+     to_down: bool
+     owner: Optional[List[str]] = None
+     # metadata is a JSON, so we use Any here.
+     metadata: Optional[Dict[str, Any]] = None
+     cluster_hash: str
+     # pydantic cannot generate the pydantic-core schema for
+     # storage_mounts_metadata, so we use Any here.
+     storage_mounts_metadata: Optional[Dict[str, Any]] = None
+     cluster_ever_up: bool
+     status_updated_at: int
+     user_hash: str
+     user_name: str
+     config_hash: Optional[str] = None
+     workspace: str
+     last_creation_yaml: Optional[str] = None
+     last_creation_command: Optional[str] = None
+     is_managed: bool
+     last_event: Optional[str] = None
+     resources_str: Optional[str] = None
+     resources_str_full: Optional[str] = None
+     # credentials is a JSON, so we use Any here.
+     credentials: Optional[Dict[str, Any]] = None
+     nodes: int
+     cloud: Optional[str] = None
+     region: Optional[str] = None
+     cpus: Optional[str] = None
+     memory: Optional[str] = None
+     accelerators: Optional[str] = None
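Editor's note: because StatusResponse inherits ResponseBaseModel's dict-style helpers (the get() and __repr__ added above, plus the existing __setitem__, __contains__ and items()), callers that used to receive plain dict cluster records can keep their access patterns. The snippet below is an illustrative sketch only, not part of the diff; every field value in it is invented.

    record = StatusResponse(
        name='my-cluster',
        launched_at=1723600000,
        handle=None,
        last_use='sky launch -c my-cluster task.yaml',
        status=status_lib.ClusterStatus.UP,
        autostop=-1,
        to_down=False,
        cluster_hash='abcd1234',
        cluster_ever_up=True,
        status_updated_at=1723600100,
        user_hash='u-0000',
        user_name='alice',
        workspace='default',
        is_managed=False,
        nodes=1,
    )
    assert 'name' in record                     # __contains__ -> hasattr
    assert record.get('zone', 'n/a') == 'n/a'   # get() falls back to default
    record['last_use'] = 'sky status'           # __setitem__ -> setattr
    as_dict = dict(record.items())              # items() via model_dump()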
sky/schemas/db/global_user_state/006_provision_log.py ADDED
@@ -0,0 +1,41 @@
+ """Add provision_log_path to clusters and cluster_history.
+
+ Revision ID: 006
+ Revises: 005
+ Create Date: 2025-08-12
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '006'
+ down_revision: Union[str, Sequence[str], None] = '005'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+     """Add provision_log_path columns."""
+     with op.get_context().autocommit_block():
+         # clusters.provision_log_path
+         db_utils.add_column_to_table_alembic('clusters',
+                                              'provision_log_path',
+                                              sa.Text(),
+                                              server_default=None)
+
+         # cluster_history.provision_log_path
+         db_utils.add_column_to_table_alembic('cluster_history',
+                                              'provision_log_path',
+                                              sa.Text(),
+                                              server_default=None)
+
+
+ def downgrade():
+     """No-op for backward compatibility."""
+     pass
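Editor's note: for readers who have not seen the db_utils wrapper, the upgrade above amounts to adding one nullable TEXT column to each of the two tables. A rough plain-Alembic equivalent is sketched below; the real helper presumably also handles the column already existing, which this sketch does not attempt.

    from alembic import op
    import sqlalchemy as sa

    def upgrade_sketch():
        # Roughly what each add_column_to_table_alembic call expands to.
        op.add_column(
            'clusters',
            sa.Column('provision_log_path', sa.Text(), server_default=None))
        op.add_column(
            'cluster_history',
            sa.Column('provision_log_path', sa.Text(), server_default=None))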
sky/serve/replica_managers.py CHANGED
@@ -1,7 +1,5 @@
  """ReplicaManager: handles the creation and deletion of endpoint replicas."""
- import collections
  import dataclasses
- import enum
  import functools
  import multiprocessing
  from multiprocessing import pool as mp_pool
@@ -199,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
      return len(spot_use_resources) == len(task.resources)


+ # Every function that calls serve_state.add_or_update_replica should acquire
+ # this lock. It is to prevent race condition when the replica status is updated
+ # by multiple threads at the same time. The modification of replica info is
+ # 2 database calls: read the whole replica info object, unpickle it, and modify
+ # corresponding fields. Then it is write back to the database. We need to ensure
+ # the read-modify-write operation is atomic.
  def with_lock(func):

      @functools.wraps(func)
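Editor's note: the body of with_lock is elided by this hunk, so the snippet below is only a sketch of the decorator shape that the comment above describes; the module-level lock name is invented for illustration.

    import functools
    import threading

    _replica_update_lock = threading.Lock()  # hypothetical lock object

    def with_lock(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Serialize the read-unpickle-modify-write cycle on replica info
            # so concurrent callers cannot interleave their database updates.
            with _replica_update_lock:
                return func(*args, **kwargs)

        return wrapper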
@@ -209,22 +213,6 @@ def with_lock(func):
      return wrapper


- class ProcessStatus(enum.Enum):
-     """Process status."""
-
-     # The process is running
-     RUNNING = 'RUNNING'
-
-     # The process is finished and succeeded
-     SUCCEEDED = 'SUCCEEDED'
-
-     # The process is interrupted
-     INTERRUPTED = 'INTERRUPTED'
-
-     # The process failed
-     FAILED = 'FAILED'
-
-
  @dataclasses.dataclass
  class ReplicaStatusProperty:
      """Some properties that determine replica status.
@@ -236,15 +224,16 @@ class ReplicaStatusProperty:
          first_ready_time: The first time the service is ready.
          sky_down_status: Process status of sky.down.
      """
-     # None means sky.launch is not called yet.
-     sky_launch_status: Optional[ProcessStatus] = None
+     # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+     sky_launch_status: common_utils.ProcessStatus = (
+         common_utils.ProcessStatus.SCHEDULED)
      user_app_failed: bool = False
      service_ready_now: bool = False
      # None means readiness probe is not succeeded yet;
      # -1 means the initial delay seconds is exceeded.
      first_ready_time: Optional[float] = None
      # None means sky.down is not called yet.
-     sky_down_status: Optional[ProcessStatus] = None
+     sky_down_status: Optional[common_utils.ProcessStatus] = None
      # Whether the termination is caused by autoscaler's decision
      is_scale_down: bool = False
      # The replica's spot instance was preempted.
@@ -299,7 +288,7 @@ class ReplicaStatusProperty:
          (1) Job status;
          (2) Readiness probe.
          """
-         if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+         if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
              return False
          if self.sky_down_status is not None:
              return False
@@ -313,37 +302,43 @@ class ReplicaStatusProperty:

      def to_replica_status(self) -> serve_state.ReplicaStatus:
          """Convert status property to human-readable replica status."""
-         if self.sky_launch_status is None:
+         # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+         # we use None to represent sky.launch is not called yet.
+         if (self.sky_launch_status is None or
+                 self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
              # Pending to launch
              return serve_state.ReplicaStatus.PENDING
-         if self.sky_launch_status == ProcessStatus.RUNNING:
-             if self.sky_down_status == ProcessStatus.FAILED:
+         if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+             if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                  return serve_state.ReplicaStatus.FAILED_CLEANUP
-             if self.sky_down_status == ProcessStatus.SUCCEEDED:
+             if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                  # This indicate it is a scale_down with correct teardown.
                  # Should have been cleaned from the replica table.
                  return serve_state.ReplicaStatus.UNKNOWN
              # Still launching
              return serve_state.ReplicaStatus.PROVISIONING
-         if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+         if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
              # sky.down is running and a scale down interrupted sky.launch
              return serve_state.ReplicaStatus.SHUTTING_DOWN
          if self.sky_down_status is not None:
              if self.preempted:
                  # Replica (spot) is preempted
                  return serve_state.ReplicaStatus.PREEMPTED
-             if self.sky_down_status == ProcessStatus.RUNNING:
+             if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                 # sky.down is scheduled to run, but not started yet.
+                 return serve_state.ReplicaStatus.SHUTTING_DOWN
+             if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                  # sky.down is running
                  return serve_state.ReplicaStatus.SHUTTING_DOWN
-             if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+             if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                  return serve_state.ReplicaStatus.SHUTTING_DOWN
-             if self.sky_down_status == ProcessStatus.FAILED:
+             if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                  # sky.down failed
                  return serve_state.ReplicaStatus.FAILED_CLEANUP
              if self.user_app_failed:
                  # Failed on user setup/run
                  return serve_state.ReplicaStatus.FAILED
-             if self.sky_launch_status == ProcessStatus.FAILED:
+             if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
                  # sky.launch failed
                  return serve_state.ReplicaStatus.FAILED_PROVISION
              if self.first_ready_time is None:
@@ -359,7 +354,7 @@ class ReplicaStatusProperty:
              # This indicate it is a scale_down with correct teardown.
              # Should have been cleaned from the replica table.
              return serve_state.ReplicaStatus.UNKNOWN
-         if self.sky_launch_status == ProcessStatus.FAILED:
+         if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
              # sky.launch failed
              # The down process has not been started if it reaches here,
              # due to the `if self.sky_down_status is not None`` check above.
@@ -703,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):

          self._recover_replica_operations()

+     @with_lock
      def _recover_replica_operations(self):
          """Let's see are there something to do for ReplicaManager in a
          recovery run"""
@@ -743,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
      # Replica management functions #
      ################################

-     # Adding lock here to make sure spot placer's current locations are
-     # consistent with the replicas' status.
-     @with_lock
+     # We don't need to add lock here since every caller of this function
+     # will acquire the lock.
      def _launch_replica(
          self,
          replica_id: int,
@@ -801,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
          # to avoid too many sky.launch running at the same time.
          self._launch_process_pool[replica_id] = p

+     @with_lock
      def scale_up(self,
                   resources_override: Optional[Dict[str, Any]] = None) -> None:
          self._launch_replica(self._next_replica_id, resources_override)
          self._next_replica_id += 1

+     def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+         if exitcode != 0:
+             logger.error(f'Down process for replica {info.replica_id} '
+                          f'exited abnormally with code {exitcode}.')
+             info.status_property.sky_down_status = (
+                 common_utils.ProcessStatus.FAILED)
+         else:
+             info.status_property.sky_down_status = (
+                 common_utils.ProcessStatus.SUCCEEDED)
+         # Failed replica still count as a replica. In our current design, we
+         # want to fail early if user code have any error. This will prevent
+         # infinite loop of teardown and re-provision. However, there is a
+         # special case that if the replica is UP for longer than
+         # initial_delay_seconds, we assume it is just some random failure and
+         # we should restart the replica. Please refer to the implementation of
+         # `is_scale_down_succeeded` for more details.
+         # TODO(tian): Currently, restart replicas that failed within
+         # initial_delay_seconds is not supported. We should add it
+         # later when we support `sky serve update`.
+         removal_reason = None
+         if info.status_property.is_scale_down:
+             # This means the cluster is deleted due to an autoscaler
+             # decision or the cluster is recovering from preemption.
+             # Delete the replica info so it won't count as a replica.
+             if info.status_property.preempted:
+                 removal_reason = 'for preemption recovery'
+             else:
+                 removal_reason = 'normally'
+         # Don't keep failed record for version mismatch replicas,
+         # since user should fixed the error before update.
+         elif info.version != self.latest_version:
+             removal_reason = 'for version outdated'
+         elif info.status_property.purged:
+             removal_reason = 'for purge'
+         elif info.status_property.failed_spot_availability:
+             removal_reason = 'for spot availability failure'
+         else:
+             logger.info(f'Termination of replica {info.replica_id} '
+                         'finished. Replica info is kept since some '
+                         'failure detected.')
+         serve_state.add_or_update_replica(self._service_name,
+                                           info.replica_id, info)
+         if removal_reason is not None:
+             serve_state.remove_replica(self._service_name, info.replica_id)
+             logger.info(f'Replica {info.replica_id} removed from the '
+                         f'replica table {removal_reason}.')
+
+     # We don't need to add lock here since every caller of this function
+     # will acquire the lock.
      def _terminate_replica(self,
                             replica_id: int,
                             sync_down_logs: bool,
@@ -823,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
              info = serve_state.get_replica_info_from_id(self._service_name,
                                                          replica_id)
              assert info is not None
-             info.status_property.sky_launch_status = ProcessStatus.INTERRUPTED
+             info.status_property.sky_launch_status = (
+                 common_utils.ProcessStatus.INTERRUPTED)
              serve_state.add_or_update_replica(self._service_name, replica_id,
                                                info)
              launch_process = self._launch_process_pool[replica_id]
@@ -895,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):

          logger.info(f'preempted: {info.status_property.preempted}, '
                      f'replica_id: {replica_id}')
+         info.status_property.is_scale_down = is_scale_down
+         info.status_property.purged = purge
+
+         # If the cluster does not exist, it means either the cluster never
+         # exists (e.g., the cluster is scaled down before it gets a chance to
+         # provision) or the cluster is preempted and cleaned up by the status
+         # refresh. In this case, we skip spawning a new down process to save
+         # controller resources.
+         if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+             self._handle_sky_down_finish(info, exitcode=0)
+             return
+
+         # Otherwise, start the process to terminate the cluster.
          p = multiprocessing.Process(
              target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                       log_file_name, 'a').run,
              args=(info.cluster_name, replica_drain_delay_seconds),
          )
-         info.status_property.sky_down_status = ProcessStatus.RUNNING
-         info.status_property.is_scale_down = is_scale_down
-         info.status_property.purged = purge
+         info.status_property.sky_down_status = (
+             common_utils.ProcessStatus.SCHEDULED)
          serve_state.add_or_update_replica(self._service_name, replica_id, info)
-         p.start()
          self._down_process_pool[replica_id] = p

+     @with_lock
      def scale_down(self, replica_id: int, purge: bool = False) -> None:
          self._terminate_replica(
              replica_id,
@@ -915,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
              is_scale_down=True,
              purge=purge)

+     # We don't need to add lock here since every caller of this function
+     # will acquire the lock.
      def _handle_preemption(self, info: ReplicaInfo) -> bool:
          """Handle preemption of the replica if any error happened.

@@ -990,7 +1050,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  if controller_utils.can_provision():
                      p.start()
                      info.status_property.sky_launch_status = (
-                         ProcessStatus.RUNNING)
+                         common_utils.ProcessStatus.RUNNING)
              else:
                  # sky.launch finished
                  # TODO(tian): Try-catch in process, and have an enum return
@@ -1007,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                                   f'exited abnormally with code {p.exitcode}.'
                                   ' Terminating...')
                      info.status_property.sky_launch_status = (
-                         ProcessStatus.FAILED)
+                         common_utils.ProcessStatus.FAILED)
                      error_in_sky_launch = True
                  else:
                      info.status_property.sky_launch_status = (
-                         ProcessStatus.SUCCEEDED)
+                         common_utils.ProcessStatus.SUCCEEDED)
                      schedule_next_jobs = True
                  if self._spot_placer is not None and info.is_spot:
                      # TODO(tian): Currently, we set the location to
@@ -1033,8 +1093,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  serve_state.add_or_update_replica(self._service_name,
                                                    replica_id, info)
                  if schedule_next_jobs and self._is_pool:
-                     jobs_scheduler.maybe_schedule_next_jobs(
-                         pool=self._service_name)
+                     jobs_scheduler.maybe_schedule_next_jobs()
                  if error_in_sky_launch:
                      # Teardown after update replica info since
                      # _terminate_replica will update the replica info too.
@@ -1045,59 +1104,25 @@ class SkyPilotReplicaManager(ReplicaManager):
              jobs_scheduler.maybe_schedule_next_jobs()
          down_process_pool_snapshot = list(self._down_process_pool.items())
          for replica_id, p in down_process_pool_snapshot:
-             if not p.is_alive():
-                 logger.info(
-                     f'Terminate process for replica {replica_id} finished.')
-                 del self._down_process_pool[replica_id]
-                 info = serve_state.get_replica_info_from_id(
-                     self._service_name, replica_id)
-                 assert info is not None, replica_id
-                 if p.exitcode != 0:
-                     logger.error(f'Down process for replica {replica_id} '
-                                  f'exited abnormally with code {p.exitcode}.')
-                     info.status_property.sky_down_status = (
-                         ProcessStatus.FAILED)
-                 else:
+             if p.is_alive():
+                 continue
+             info = serve_state.get_replica_info_from_id(self._service_name,
+                                                         replica_id)
+             assert info is not None, replica_id
+             if (info.status_property.sky_down_status ==
+                     common_utils.ProcessStatus.SCHEDULED):
+                 # sky.down not started yet
+                 if controller_utils.can_terminate():
+                     p.start()
                      info.status_property.sky_down_status = (
-                         ProcessStatus.SUCCEEDED)
-                 # Failed replica still count as a replica. In our current
-                 # design, we want to fail early if user code have any error.
-                 # This will prevent infinite loop of teardown and
-                 # re-provision. However, there is a special case that if the
-                 # replica is UP for longer than initial_delay_seconds, we
-                 # assume it is just some random failure and we should restart
-                 # the replica. Please refer to the implementation of
-                 # `is_scale_down_succeeded` for more details.
-                 # TODO(tian): Currently, restart replicas that failed within
-                 # initial_delay_seconds is not supported. We should add it
-                 # later when we support `sky serve update`.
-                 removal_reason = None
-                 if info.status_property.is_scale_down:
-                     # This means the cluster is deleted due to an autoscaler
-                     # decision or the cluster is recovering from preemption.
-                     # Delete the replica info so it won't count as a replica.
-                     if info.status_property.preempted:
-                         removal_reason = 'for preemption recovery'
-                     else:
-                         removal_reason = 'normally'
-                 # Don't keep failed record for version mismatch replicas,
-                 # since user should fixed the error before update.
-                 elif info.version != self.latest_version:
-                     removal_reason = 'for version outdated'
-                 elif info.status_property.purged:
-                     removal_reason = 'for purge'
-                 elif info.status_property.failed_spot_availability:
-                     removal_reason = 'for spot availability failure'
-                 else:
-                     logger.info(f'Termination of replica {replica_id} '
-                                 'finished. Replica info is kept since some '
-                                 'failure detected.')
+                         common_utils.ProcessStatus.RUNNING)
                  serve_state.add_or_update_replica(self._service_name,
                                                    replica_id, info)
-                 if removal_reason is not None:
-                     serve_state.remove_replica(self._service_name, replica_id)
-                     logger.info(f'Replica {replica_id} removed from the '
-                                 f'replica table {removal_reason}.')
+             else:
+                 logger.info(
+                     f'Terminate process for replica {replica_id} finished.')
+                 del self._down_process_pool[replica_id]
+                 self._handle_sky_down_finish(info, exitcode=p.exitcode)

          # Clean old version
          replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1393,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
              old_config_any_of = old_config.get('resources',
                                                 {}).pop('any_of', [])

-             def normalize_dict_list(lst):
-                 return collections.Counter(
-                     frozenset(d.items()) for d in lst)
-
-             if (normalize_dict_list(old_config_any_of) !=
-                     normalize_dict_list(new_config_any_of)):
+             if (resources_utils.normalize_any_of_resources_config(
+                     old_config_any_of) != resources_utils.
+                     normalize_any_of_resources_config(new_config_any_of)):
                  logger.info('Replica config changed (any_of), skipping. '
                              f'old: {old_config_any_of}, '
                              f'new: {new_config_any_of}')
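Editor's note: the removed local helper above compared the two any_of lists order-insensitively by turning each list into a multiset of frozensets. Presumably resources_utils.normalize_any_of_resources_config (sky/utils/resources_utils.py gains 25 lines in the file list) now plays that role in a shared location; that is an inference, not something this hunk shows. The standalone sketch below just restates the removed logic with invented sample configs.

    import collections

    def _normalize_dict_list(lst):
        # Order-insensitive comparison: count each dict as a frozenset of items.
        return collections.Counter(frozenset(d.items()) for d in lst)

    old_any_of = [{'cloud': 'aws', 'cpus': '4'}, {'cloud': 'gcp'}]
    new_any_of = [{'cloud': 'gcp'}, {'cloud': 'aws', 'cpus': '4'}]
    assert _normalize_dict_list(old_any_of) == _normalize_dict_list(new_any_of)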
sky/serve/serve_state.py CHANGED
@@ -670,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
          return provisioning_count


+ @init_db
+ def total_number_terminating_replicas() -> int:
+     """Returns the total number of terminating replicas."""
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         rows = session.execute(sqlalchemy.select(
+             replicas_table.c.replica_info)).fetchall()
+         terminating_count = 0
+         for row in rows:
+             replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+             if (replica_info.status_property.sky_down_status ==
+                     common_utils.ProcessStatus.RUNNING):
+                 terminating_count += 1
+         return terminating_count
+
+
+ @init_db
+ def total_number_scheduled_to_terminate_replicas() -> int:
+     """Returns the total number of terminating replicas."""
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         rows = session.execute(sqlalchemy.select(
+             replicas_table.c.replica_info)).fetchall()
+         terminating_count = 0
+         for row in rows:
+             replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+             if (replica_info.status_property.sky_down_status ==
+                     common_utils.ProcessStatus.SCHEDULED):
+                 terminating_count += 1
+         return terminating_count
+
+
  def get_replicas_at_status(
          service_name: str,
          status: ReplicaStatus,
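Editor's note: these counters pair with the SCHEDULED state introduced in replica_managers.py. A gate such as controller_utils.can_terminate (referenced there but not shown in this diff) can presumably compare the number of down processes already RUNNING against some limit before starting more. A rough sketch of that kind of gate, with an invented limit:

    _MAX_CONCURRENT_TERMINATIONS = 16  # invented value for illustration

    def can_terminate_sketch() -> bool:
        running = serve_state.total_number_terminating_replicas()
        return running < _MAX_CONCURRENT_TERMINATIONS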
sky/serve/serve_utils.py CHANGED
@@ -63,7 +63,10 @@ _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
  # when changing UX as this assumption is used to expand some log files while
  # ignoring others.
  _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
- _SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+ _SKYPILOT_PROVISION_API_LOG_PATTERN = (
+     fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+ # New hint pattern for provision logs
+ _SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
  _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

  # TODO(tian): Find all existing replica id and print here.
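Editor's note: a quick illustration of the two hint patterns above; the sample log lines are invented but follow the shapes the regexes expect.

    import re

    _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
    _SKYPILOT_PROVISION_API_LOG_PATTERN = (
        fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
    _SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'

    api_line = ('To stream provisioning details: sky api logs -l '
                'sky-2025-08-15-000000/provision.log')
    cmd_line = 'To stream provisioning details: sky logs --provision my-cluster'

    m_api = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, api_line)
    m_cmd = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, cmd_line)
    assert m_api.group(1) == 'sky-2025-08-15-000000/provision.log'
    assert m_cmd.group(1) == 'my-cluster'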
@@ -1114,31 +1117,49 @@ def _process_line(line: str,
              return False
          return cluster_record['status'] == status_lib.ClusterStatus.UP

-     provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
+     provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                         line)
+     provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                         line)
      log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)

-     if provision_log_prompt is not None:
-         log_path = provision_log_prompt.group(1)
-         nested_log_path = pathlib.Path(
-             skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
-                 log_path).resolve()
-
+     def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
          try:
-             with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
-                 # We still exit if more than 10 seconds without new content
-                 # to avoid any internal bug that causes the launch to fail
-                 # while cluster status remains INIT.
+             with open(p, 'r', newline='', encoding='utf-8') as f:
+                 # Exit if >10s without new content to avoid hanging when INIT
                  yield from log_utils.follow_logs(f,
                                                   should_stop=cluster_is_up,
                                                   stop_on_eof=stop_on_eof,
                                                   idle_timeout_seconds=10)
          except FileNotFoundError:
+             # Fall back cleanly if the hinted path doesn't exist
              yield line
-
              yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                    f'Try to expand log file {nested_log_path} but not '
-                    f'found. Skipping...{colorama.Style.RESET_ALL}')
-             pass
+                    f'Try to expand log file {p} but not found. Skipping...'
+                    f'{colorama.Style.RESET_ALL}')
+         return
+
+     if provision_api_log_prompt is not None:
+         rel_path = provision_api_log_prompt.group(1)
+         nested_log_path = pathlib.Path(
+             skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                 rel_path).resolve()
+         yield from _stream_provision_path(nested_log_path)
+         return
+
+     if provision_log_cmd_prompt is not None:
+         # Resolve provision log via cluster table first, then history.
+         log_path_str = global_user_state.get_cluster_provision_log_path(
+             cluster_name)
+         if not log_path_str:
+             log_path_str = (
+                 global_user_state.get_cluster_history_provision_log_path(
+                     cluster_name))
+         if not log_path_str:
+             yield line
+             return
+         yield from _stream_provision_path(
+             pathlib.Path(log_path_str).expanduser().resolve())
          return

      if log_prompt is not None: