skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py
CHANGED
@@ -1,7 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
-import collections
 import dataclasses
-import enum
 import functools
 import multiprocessing
 from multiprocessing import pool as mp_pool
@@ -13,16 +11,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import
+import filelock
 import requests
 
-import sky
 from sky import backends
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
@@ -41,7 +39,6 @@ from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -51,10 +48,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
-# Since sky.launch is very resource demanding, we limit the number of
-# concurrent sky.launch process to avoid overloading the machine.
-_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
-
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +74,7 @@ def launch_cluster(replica_id: int,
     try:
         config = common_utils.read_yaml(
            os.path.expanduser(service_task_yaml_path))
-        task =
+        task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
            resources = task.resources
            overrided_resources = [
@@ -177,7 +170,7 @@ def terminate_cluster(cluster_name: str,
 
 def _get_resources_ports(service_task_yaml_path: str) -> str:
     """Get the resources ports used by the task."""
-    task =
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -195,7 +188,7 @@ def _should_use_spot(service_task_yaml_path: str,
     if use_spot_override is not None:
         assert isinstance(use_spot_override, bool)
         return use_spot_override
-    task =
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     spot_use_resources = [
         resources for resources in task.resources if resources.use_spot
     ]
@@ -204,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
     return len(spot_use_resources) == len(task.resources)
 
 
+# Every function that calls serve_state.add_or_update_replica should acquire
+# this lock. It is to prevent race condition when the replica status is updated
+# by multiple threads at the same time. The modification of replica info is
+# 2 database calls: read the whole replica info object, unpickle it, and modify
+# corresponding fields. Then it is write back to the database. We need to ensure
+# the read-modify-write operation is atomic.
 def with_lock(func):
 
     @functools.wraps(func)
@@ -214,22 +213,6 @@ def with_lock(func):
     return wrapper
 
 
-class ProcessStatus(enum.Enum):
-    """Process status."""
-
-    # The process is running
-    RUNNING = 'RUNNING'
-
-    # The process is finished and succeeded
-    SUCCEEDED = 'SUCCEEDED'
-
-    # The process is interrupted
-    INTERRUPTED = 'INTERRUPTED'
-
-    # The process failed
-    FAILED = 'FAILED'
-
-
 @dataclasses.dataclass
 class ReplicaStatusProperty:
     """Some properties that determine replica status.
@@ -241,15 +224,16 @@ class ReplicaStatusProperty:
         first_ready_time: The first time the service is ready.
         sky_down_status: Process status of sky.down.
     """
-    #
-    sky_launch_status:
+    # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+    sky_launch_status: common_utils.ProcessStatus = (
+        common_utils.ProcessStatus.SCHEDULED)
     user_app_failed: bool = False
     service_ready_now: bool = False
     # None means readiness probe is not succeeded yet;
     # -1 means the initial delay seconds is exceeded.
     first_ready_time: Optional[float] = None
     # None means sky.down is not called yet.
-    sky_down_status: Optional[ProcessStatus] = None
+    sky_down_status: Optional[common_utils.ProcessStatus] = None
    # Whether the termination is caused by autoscaler's decision
     is_scale_down: bool = False
     # The replica's spot instance was preempted.
@@ -304,7 +288,7 @@ class ReplicaStatusProperty:
         (1) Job status;
         (2) Readiness probe.
         """
-        if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+        if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
            return False
         if self.sky_down_status is not None:
            return False
@@ -318,37 +302,43 @@ class ReplicaStatusProperty:
 
    def to_replica_status(self) -> serve_state.ReplicaStatus:
        """Convert status property to human-readable replica status."""
-
+        # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+        # we use None to represent sky.launch is not called yet.
+        if (self.sky_launch_status is None or
+                self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
            # Pending to launch
            return serve_state.ReplicaStatus.PENDING
-        if self.sky_launch_status == ProcessStatus.RUNNING:
-            if self.sky_down_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                return serve_state.ReplicaStatus.FAILED_CLEANUP
-            if self.sky_down_status == ProcessStatus.SUCCEEDED:
+            if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                # This indicate it is a scale_down with correct teardown.
                # Should have been cleaned from the replica table.
                return serve_state.ReplicaStatus.UNKNOWN
            # Still launching
            return serve_state.ReplicaStatus.PROVISIONING
-        if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+        if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
            # sky.down is running and a scale down interrupted sky.launch
            return serve_state.ReplicaStatus.SHUTTING_DOWN
        if self.sky_down_status is not None:
            if self.preempted:
                # Replica (spot) is preempted
                return serve_state.ReplicaStatus.PREEMPTED
-            if self.sky_down_status == ProcessStatus.
+            if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                # sky.down is scheduled to run, but not started yet.
+                return serve_state.ReplicaStatus.SHUTTING_DOWN
+            if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                # sky.down is running
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+            if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_down_status == ProcessStatus.FAILED:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                # sky.down failed
                return serve_state.ReplicaStatus.FAILED_CLEANUP
            if self.user_app_failed:
                # Failed on user setup/run
                return serve_state.ReplicaStatus.FAILED
-            if self.sky_launch_status == ProcessStatus.FAILED:
+            if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
                # sky.launch failed
                return serve_state.ReplicaStatus.FAILED_PROVISION
            if self.first_ready_time is None:
@@ -364,7 +354,7 @@ class ReplicaStatusProperty:
            # This indicate it is a scale_down with correct teardown.
            # Should have been cleaned from the replica table.
            return serve_state.ReplicaStatus.UNKNOWN
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
            # sky.launch failed
            # The down process has not been started if it reaches here,
            # due to the `if self.sky_down_status is not None`` check above.
@@ -688,7 +678,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 service_task_yaml_path: str) -> None:
        super().__init__(service_name, spec)
        self.service_task_yaml_path = service_task_yaml_path
-        task =
+        task = task_lib.Task.from_yaml(service_task_yaml_path)
        self._spot_placer: Optional[spot_placer.SpotPlacer] = (
            spot_placer.SpotPlacer.from_task(spec, task))
        # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -708,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):
 
        self._recover_replica_operations()
 
+    @with_lock
    def _recover_replica_operations(self):
        """Let's see are there something to do for ReplicaManager in a
        recovery run"""
@@ -748,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
    # Replica management functions #
    ################################
 
-    #
-    #
-    @with_lock
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _launch_replica(
        self,
        replica_id: int,
@@ -806,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
        # to avoid too many sky.launch running at the same time.
        self._launch_process_pool[replica_id] = p
 
+    @with_lock
    def scale_up(self,
                 resources_override: Optional[Dict[str, Any]] = None) -> None:
        self._launch_replica(self._next_replica_id, resources_override)
        self._next_replica_id += 1
 
+    def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+        if exitcode != 0:
+            logger.error(f'Down process for replica {info.replica_id} '
+                         f'exited abnormally with code {exitcode}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+            serve_state.add_or_update_replica(self._service_name,
+                                              info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _terminate_replica(self,
                           replica_id: int,
                           sync_down_logs: bool,
@@ -828,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            info = serve_state.get_replica_info_from_id(self._service_name,
                                                        replica_id)
            assert info is not None
-            info.status_property.sky_launch_status =
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
            serve_state.add_or_update_replica(self._service_name, replica_id,
                                              info)
            launch_process = self._launch_process_pool[replica_id]
@@ -872,8 +913,9 @@ class SkyPilotReplicaManager(ReplicaManager):
            assert isinstance(handle, backends.CloudVmRayResourceHandle)
            replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                                'replica_jobs')
-
-
+            job_ids = ['1'] if self._is_pool else None
+            job_log_file_name = controller_utils.download_and_stream_job_log(
+                backend, handle, replica_job_logs_dir, job_ids)
            if job_log_file_name is not None:
                logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
                with open(log_file_name, 'a',
@@ -899,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):
 
        logger.info(f'preempted: {info.status_property.preempted}, '
                    f'replica_id: {replica_id}')
+        info.status_property.is_scale_down = is_scale_down
+        info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down process to save
+        # controller resources.
+        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+            self._handle_sky_down_finish(info, exitcode=0)
+            return
+
+        # Otherwise, start the process to terminate the cluster.
        p = multiprocessing.Process(
            target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                     log_file_name, 'a').run,
            args=(info.cluster_name, replica_drain_delay_seconds),
        )
-        info.status_property.sky_down_status =
-
-        info.status_property.purged = purge
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
        serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
        self._down_process_pool[replica_id] = p
 
+    @with_lock
    def scale_down(self, replica_id: int, purge: bool = False) -> None:
        self._terminate_replica(
            replica_id,
@@ -919,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            is_scale_down=True,
            purge=purge)
 
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _handle_preemption(self, info: ReplicaInfo) -> bool:
        """Handle preemption of the replica if any error happened.
 
@@ -981,7 +1037,9 @@ class SkyPilotReplicaManager(ReplicaManager):
        # To avoid `dictionary changed size during iteration` error.
        launch_process_pool_snapshot = list(self._launch_process_pool.items())
        for replica_id, p in launch_process_pool_snapshot:
-            if
+            if p.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                info = serve_state.get_replica_info_from_id(
                    self._service_name, replica_id)
                assert info is not None, replica_id
@@ -989,11 +1047,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                schedule_next_jobs = False
                if info.status == serve_state.ReplicaStatus.PENDING:
                    # sky.launch not started yet
-                    if
-                            _MAX_NUM_LAUNCH):
+                    if controller_utils.can_provision():
                        p.start()
                        info.status_property.sky_launch_status = (
-                            ProcessStatus.RUNNING)
+                            common_utils.ProcessStatus.RUNNING)
                else:
                    # sky.launch finished
                    # TODO(tian): Try-catch in process, and have an enum return
@@ -1010,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                            f'exited abnormally with code {p.exitcode}.'
                            ' Terminating...')
                        info.status_property.sky_launch_status = (
-                            ProcessStatus.FAILED)
+                            common_utils.ProcessStatus.FAILED)
                        error_in_sky_launch = True
                    else:
                        info.status_property.sky_launch_status = (
-                            ProcessStatus.SUCCEEDED)
+                            common_utils.ProcessStatus.SUCCEEDED)
                        schedule_next_jobs = True
                        if self._spot_placer is not None and info.is_spot:
                            # TODO(tian): Currently, we set the location to
@@ -1036,69 +1093,36 @@ class SkyPilotReplicaManager(ReplicaManager):
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
                if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs(
-                        pool=self._service_name)
+                    jobs_scheduler.maybe_schedule_next_jobs()
                if error_in_sky_launch:
                    # Teardown after update replica info since
                    # _terminate_replica will update the replica info too.
                    self._terminate_replica(replica_id,
                                            sync_down_logs=True,
                                            replica_drain_delay_seconds=0)
+            # Try schedule next job after acquiring the lock.
+            jobs_scheduler.maybe_schedule_next_jobs()
        down_process_pool_snapshot = list(self._down_process_pool.items())
        for replica_id, p in down_process_pool_snapshot:
-            if
-
-
-
-
-
-
-
-
-
-                    info.status_property.sky_down_status = (
-                        ProcessStatus.FAILED)
-                else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                    info.status_property.sky_down_status = (
-                        ProcessStatus.
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
-
-
-
-
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)
 
        # Clean old version
        replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1394,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
        old_config_any_of = old_config.get('resources',
                                           {}).pop('any_of', [])
 
-
-
-
-
-        if (normalize_dict_list(old_config_any_of) !=
-                normalize_dict_list(new_config_any_of)):
+        if (resources_utils.normalize_any_of_resources_config(
+                old_config_any_of) != resources_utils.
+                normalize_any_of_resources_config(new_config_any_of)):
            logger.info('Replica config changed (any_of), skipping. '
                        f'old: {old_config_any_of}, '
                        f'new: {new_config_any_of}')
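The new locking comments in this diff describe why replica-info updates must be serialized: each update is a read, unpickle, modify, write-back cycle against the replica table, so concurrent updaters could otherwise lose writes. Below is a minimal illustrative sketch of that read-modify-write pattern under a file lock; it is not SkyPilot's actual implementation, and the lock path and the read_info/write_info parameters are placeholders standing in for the serve_state calls.

    import filelock

    REPLICA_LOCK_PATH = '/tmp/replica_info.lock'  # placeholder path, for illustration only


    def update_sky_down_status(read_info, write_info, service_name, replica_id, new_status):
        """read_info/write_info stand in for serve_state's get/add_or_update calls."""
        # Holding the lock makes the read-unpickle-modify-write cycle atomic
        # with respect to any other updater that honors the same lock.
        with filelock.FileLock(REPLICA_LOCK_PATH):
            info = read_info(service_name, replica_id)          # read + unpickle
            info.status_property.sky_down_status = new_status   # modify in memory
            write_info(service_name, replica_id, info)          # write back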
sky/serve/serve_state.py
CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
     return records
 
 
+@init_db
+def get_num_services() -> int:
+    """Get the number of services."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        return session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(services_table)).fetchone()[0]
+
+
 @init_db
 def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     """Get all existing service records."""
@@ -660,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count
 
 
+@init_db
+def total_number_terminating_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.RUNNING):
+                terminating_count += 1
+        return terminating_count
+
+
+@init_db
+def total_number_scheduled_to_terminate_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                terminating_count += 1
+        return terminating_count
+
+
 def get_replicas_at_status(
     service_name: str,
     status: ReplicaStatus,
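
The new serve_state helpers above count replicas whose sky.down process is currently running or is scheduled but not yet started. A minimal usage sketch follows, assuming a controller environment where sky.serve.serve_state is importable; the threshold is a hypothetical value chosen for illustration, not a SkyPilot constant, and the function is not part of SkyPilot itself.

    from sky.serve import serve_state

    MAX_IN_FLIGHT_TERMINATIONS = 8  # hypothetical limit, not a SkyPilot constant


    def can_schedule_more_terminations() -> bool:
        # Count sky.down processes that are running plus those scheduled but
        # not yet started, using the helpers added in this release.
        in_flight = (serve_state.total_number_terminating_replicas() +
                     serve_state.total_number_scheduled_to_terminate_replicas())
        return in_flight < MAX_IN_FLIGHT_TERMINATIONS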
|