skypilot-nightly 1.0.0.dev20241019__py3-none-any.whl → 1.0.0.dev20241021__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +46 -12
- sky/clouds/azure.py +0 -1
- sky/clouds/gcp.py +3 -0
- sky/clouds/lambda_cloud.py +9 -5
- sky/clouds/oci.py +36 -37
- sky/clouds/service_catalog/oci_catalog.py +22 -0
- sky/clouds/utils/oci_utils.py +11 -1
- sky/provision/docker_utils.py +12 -7
- sky/provision/paperspace/utils.py +2 -0
- sky/resources.py +11 -12
- sky/serve/__init__.py +2 -0
- sky/serve/constants.py +8 -1
- sky/serve/controller.py +70 -0
- sky/serve/core.py +47 -0
- sky/serve/replica_managers.py +13 -4
- sky/serve/serve_utils.py +65 -13
- sky/templates/lambda-ray.yml.j2 +20 -0
- sky/templates/oci-ray.yml.j2 +12 -2
- sky/utils/command_runner.py +3 -1
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/RECORD +26 -26
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '3c3bcee5cfe720a96ab67f4049a557a79e7f077f'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241021'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -4380,9 +4380,14 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
|
|
4380
4380
|
default=False,
|
4381
4381
|
required=False,
|
4382
4382
|
help='Skip confirmation prompt.')
|
4383
|
+
@click.option('--replica-id',
|
4384
|
+
default=None,
|
4385
|
+
type=int,
|
4386
|
+
help='Tear down a given replica')
|
4383
4387
|
# pylint: disable=redefined-builtin
|
4384
|
-
def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool
|
4385
|
-
|
4388
|
+
def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
|
4389
|
+
replica_id: Optional[int]):
|
4390
|
+
"""Teardown service(s) or a replica.
|
4386
4391
|
|
4387
4392
|
SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
|
4388
4393
|
both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence.
|
@@ -4408,6 +4413,12 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
|
|
4408
4413
|
\b
|
4409
4414
|
# Forcefully tear down a service in failed status.
|
4410
4415
|
sky serve down failed-service --purge
|
4416
|
+
\b
|
4417
|
+
# Tear down a specific replica
|
4418
|
+
sky serve down my-service --replica-id 1
|
4419
|
+
\b
|
4420
|
+
# Forcefully tear down a specific replica, even in failed status.
|
4421
|
+
sky serve down my-service --replica-id 1 --purge
|
4411
4422
|
"""
|
4412
4423
|
if sum([len(service_names) > 0, all]) != 1:
|
4413
4424
|
argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len(
|
@@ -4417,22 +4428,45 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
|
|
4417
4428
|
'Can only specify one of SERVICE_NAMES or --all. '
|
4418
4429
|
f'Provided {argument_str!r}.')
|
4419
4430
|
|
4431
|
+
replica_id_is_defined = replica_id is not None
|
4432
|
+
if replica_id_is_defined:
|
4433
|
+
if len(service_names) != 1:
|
4434
|
+
service_names_str = ', '.join(service_names)
|
4435
|
+
raise click.UsageError(f'The --replica-id option can only be used '
|
4436
|
+
f'with a single service name. Got: '
|
4437
|
+
f'{service_names_str}.')
|
4438
|
+
if all:
|
4439
|
+
raise click.UsageError('The --replica-id option cannot be used '
|
4440
|
+
'with the --all option.')
|
4441
|
+
|
4420
4442
|
backend_utils.is_controller_accessible(
|
4421
4443
|
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
4422
4444
|
stopped_message='All services should have been terminated.',
|
4423
4445
|
exit_if_not_accessible=True)
|
4424
4446
|
|
4425
4447
|
if not yes:
|
4426
|
-
|
4427
|
-
|
4428
|
-
|
4429
|
-
|
4430
|
-
|
4431
|
-
|
4432
|
-
|
4433
|
-
|
4434
|
-
|
4435
|
-
|
4448
|
+
if replica_id_is_defined:
|
4449
|
+
click.confirm(
|
4450
|
+
f'Terminating replica ID {replica_id} in '
|
4451
|
+
f'{service_names[0]!r}. Proceed?',
|
4452
|
+
default=True,
|
4453
|
+
abort=True,
|
4454
|
+
show_default=True)
|
4455
|
+
else:
|
4456
|
+
quoted_service_names = [f'{name!r}' for name in service_names]
|
4457
|
+
service_identity_str = (f'service(s) '
|
4458
|
+
f'{", ".join(quoted_service_names)}')
|
4459
|
+
if all:
|
4460
|
+
service_identity_str = 'all services'
|
4461
|
+
click.confirm(f'Terminating {service_identity_str}. Proceed?',
|
4462
|
+
default=True,
|
4463
|
+
abort=True,
|
4464
|
+
show_default=True)
|
4465
|
+
|
4466
|
+
if replica_id_is_defined:
|
4467
|
+
serve_lib.terminate_replica(service_names[0], replica_id, purge)
|
4468
|
+
else:
|
4469
|
+
serve_lib.down(service_names=service_names, all=all, purge=purge)
|
4436
4470
|
|
4437
4471
|
|
4438
4472
|
@serve.command('logs', cls=_DocumentedCodeCommand)
|
sky/clouds/azure.py
CHANGED
@@ -329,7 +329,6 @@ class Azure(clouds.Cloud):
|
|
329
329
|
runcmd:
|
330
330
|
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
|
331
331
|
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
|
332
|
-
- usermod -aG docker skypilot:ssh_user
|
333
332
|
write_files:
|
334
333
|
- path: /etc/apt/apt.conf.d/20auto-upgrades
|
335
334
|
content: |
|
sky/clouds/gcp.py
CHANGED
@@ -477,6 +477,9 @@ class GCP(clouds.Cloud):
|
|
477
477
|
'runtime_version']
|
478
478
|
resources_vars['tpu_node_name'] = r.accelerator_args.get(
|
479
479
|
'tpu_name')
|
480
|
+
# TPU VMs require privileged mode for docker containers to
|
481
|
+
# access TPU devices.
|
482
|
+
resources_vars['docker_run_options'] = ['--privileged']
|
480
483
|
else:
|
481
484
|
# Convert to GCP names:
|
482
485
|
# https://cloud.google.com/compute/docs/gpus
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
|
|
37
37
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
38
38
|
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
|
39
39
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
|
40
|
-
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
|
41
|
-
f'Docker image is currently not supported on {_REPR}. '
|
42
|
-
'You can try running docker command inside the `run` section in task.yaml.'
|
43
|
-
),
|
44
40
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
|
45
41
|
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
|
46
42
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
|
@@ -173,12 +169,20 @@ class Lambda(clouds.Cloud):
|
|
173
169
|
else:
|
174
170
|
custom_resources = None
|
175
171
|
|
176
|
-
|
172
|
+
resources_vars = {
|
177
173
|
'instance_type': resources.instance_type,
|
178
174
|
'custom_resources': custom_resources,
|
179
175
|
'region': region.name,
|
180
176
|
}
|
181
177
|
|
178
|
+
if acc_dict is not None:
|
179
|
+
# Lambda cloud's docker runtime information does not contain
|
180
|
+
# 'nvidia-container-runtime', causing no GPU option is added to
|
181
|
+
# the docker run command. We patch this by adding it here.
|
182
|
+
resources_vars['docker_run_options'] = ['--gpus all']
|
183
|
+
|
184
|
+
return resources_vars
|
185
|
+
|
182
186
|
def _get_feasible_launchable_resources(
|
183
187
|
self, resources: 'resources_lib.Resources'
|
184
188
|
) -> 'resources_utils.FeasibleResources':
|
sky/clouds/oci.py
CHANGED
@@ -17,6 +17,8 @@ History:
|
|
17
17
|
make_deploy_resources_variables(): Bug fix for specify the image_id as
|
18
18
|
the ocid of the image in the task.yaml file, in this case the image_id
|
19
19
|
for the node config should be set to the ocid instead of a dict.
|
20
|
+
- Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
|
21
|
+
Support more OS types additional to ubuntu for OCI resources.
|
20
22
|
"""
|
21
23
|
import json
|
22
24
|
import logging
|
@@ -295,10 +297,21 @@ class OCI(clouds.Cloud):
|
|
295
297
|
cpus=None if cpus is None else float(cpus),
|
296
298
|
disk_tier=resources.disk_tier)
|
297
299
|
|
300
|
+
image_str = self._get_image_str(image_id=resources.image_id,
|
301
|
+
instance_type=resources.instance_type,
|
302
|
+
region=region.name)
|
303
|
+
|
304
|
+
# pylint: disable=import-outside-toplevel
|
305
|
+
from sky.clouds.service_catalog import oci_catalog
|
306
|
+
os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
|
307
|
+
region=region.name)
|
308
|
+
logger.debug(f'OS type for the image {image_str} is {os_type}')
|
309
|
+
|
298
310
|
return {
|
299
311
|
'instance_type': instance_type,
|
300
312
|
'custom_resources': custom_resources,
|
301
313
|
'region': region.name,
|
314
|
+
'os_type': os_type,
|
302
315
|
'cpus': str(cpus),
|
303
316
|
'memory': resources.memory,
|
304
317
|
'disk_size': resources.disk_size,
|
@@ -501,59 +514,45 @@ class OCI(clouds.Cloud):
|
|
501
514
|
region_name: str,
|
502
515
|
instance_type: str,
|
503
516
|
) -> str:
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
image_id_str = image_id[None]
|
509
|
-
else:
|
510
|
-
assert region_name in image_id, image_id
|
511
|
-
image_id_str = image_id[region_name]
|
517
|
+
image_id_str = self._get_image_str(image_id=image_id,
|
518
|
+
instance_type=instance_type,
|
519
|
+
region=region_name)
|
520
|
+
|
512
521
|
if image_id_str.startswith('skypilot:'):
|
513
522
|
image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
|
514
523
|
region_name,
|
515
524
|
clouds='oci')
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
# implementor, we need to find a better way to handle this.
|
523
|
-
raise exceptions.ResourcesUnavailableError(
|
524
|
-
'! ERR: No image found in catalog for region '
|
525
|
-
f'{region_name}. Try setting a valid image_id.')
|
525
|
+
|
526
|
+
# Image_id should be impossible be None, except for the case when
|
527
|
+
# user specify an image tag which does not exist in the image.csv
|
528
|
+
# catalog file which only possible in "test" / "evaluation" phase.
|
529
|
+
# Therefore, we use assert here.
|
530
|
+
assert image_id_str is not None
|
526
531
|
|
527
532
|
logger.debug(f'Got real image_id {image_id_str}')
|
528
533
|
return image_id_str
|
529
534
|
|
530
|
-
def
|
535
|
+
def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]],
|
536
|
+
instance_type: str, region: str):
|
537
|
+
if image_id is None:
|
538
|
+
image_str = self._get_default_image_tag(instance_type)
|
539
|
+
elif None in image_id:
|
540
|
+
image_str = image_id[None]
|
541
|
+
else:
|
542
|
+
assert region in image_id, image_id
|
543
|
+
image_str = image_id[region]
|
544
|
+
return image_str
|
545
|
+
|
546
|
+
def _get_default_image_tag(self, instance_type: str) -> str:
|
531
547
|
acc = self.get_accelerators_from_instance_type(instance_type)
|
532
548
|
|
533
549
|
if acc is None:
|
534
550
|
image_tag = oci_utils.oci_config.get_default_image_tag()
|
535
|
-
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
|
536
|
-
region_name,
|
537
|
-
clouds='oci')
|
538
551
|
else:
|
539
552
|
assert len(acc) == 1, acc
|
540
553
|
image_tag = oci_utils.oci_config.get_default_gpu_image_tag()
|
541
|
-
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
|
542
|
-
region_name,
|
543
|
-
clouds='oci')
|
544
554
|
|
545
|
-
|
546
|
-
logger.debug(
|
547
|
-
f'Got default image_id {image_id_str} from tag {image_tag}')
|
548
|
-
return image_id_str
|
549
|
-
|
550
|
-
# Raise ResourcesUnavailableError to make sure the failover in
|
551
|
-
# CloudVMRayBackend will be correctly triggered.
|
552
|
-
# TODO(zhwu): This is a information leakage to the cloud implementor,
|
553
|
-
# we need to find a better way to handle this.
|
554
|
-
raise exceptions.ResourcesUnavailableError(
|
555
|
-
'ERR: No image found in catalog for region '
|
556
|
-
f'{region_name}. Try update your default image_id settings.')
|
555
|
+
return image_tag
|
557
556
|
|
558
557
|
def get_vpu_from_disktier(
|
559
558
|
self, cpus: Optional[float],
|
@@ -7,6 +7,8 @@ History:
|
|
7
7
|
- Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
|
8
8
|
- Hysun He (hysun.he@oracle.com) @ Jun, 2023: Reduce retry times by
|
9
9
|
excluding those unsubscribed regions.
|
10
|
+
- Hysun He (hysun.he@oracle.com) @ Oct 14, 2024: Bug fix for validation
|
11
|
+
of the Marketplace images
|
10
12
|
"""
|
11
13
|
|
12
14
|
import logging
|
@@ -206,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
|
|
206
208
|
|
207
209
|
def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
|
208
210
|
"""Returns whether the image tag is valid."""
|
211
|
+
# Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't
|
212
|
+
# check with region for the Marketplace images.
|
213
|
+
df = _image_df[_image_df['Tag'].str.fullmatch(tag)]
|
214
|
+
if df.empty:
|
215
|
+
return False
|
216
|
+
app_catalog_listing_id = df['AppCatalogListingId'].iloc[0]
|
217
|
+
if app_catalog_listing_id:
|
218
|
+
return True
|
209
219
|
return common.is_image_tag_valid_impl(_image_df, tag, region)
|
220
|
+
|
221
|
+
|
222
|
+
def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
|
223
|
+
del region
|
224
|
+
df = _image_df[_image_df['Tag'].str.fullmatch(tag)]
|
225
|
+
if df.empty:
|
226
|
+
os_type = oci_utils.oci_config.get_default_image_os()
|
227
|
+
else:
|
228
|
+
os_type = df['OS'].iloc[0]
|
229
|
+
|
230
|
+
logger.debug(f'Operation system for the image {tag} is {os_type}')
|
231
|
+
return os_type
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
"""OCI Configuration.
|
2
2
|
History:
|
3
|
-
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
|
4
3
|
- Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
|
4
|
+
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
|
5
|
+
- Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
|
6
|
+
configuration.
|
5
7
|
"""
|
6
8
|
import logging
|
7
9
|
import os
|
@@ -121,5 +123,13 @@ class OCIConfig:
|
|
121
123
|
return skypilot_config.get_nested(
|
122
124
|
('oci', 'default', 'oci_config_profile'), 'DEFAULT')
|
123
125
|
|
126
|
+
@classmethod
|
127
|
+
def get_default_image_os(cls) -> str:
|
128
|
+
# Get the default image OS. Instead of hardcoding, we give a choice to
|
129
|
+
# set the default image OS type in the sky's user-config file. (if not
|
130
|
+
# specified, use the hardcode one at last)
|
131
|
+
return skypilot_config.get_nested(('oci', 'default', 'image_os_type'),
|
132
|
+
'ubuntu')
|
133
|
+
|
124
134
|
|
125
135
|
oci_config = OCIConfig()
|
sky/provision/docker_utils.py
CHANGED
@@ -253,12 +253,13 @@ class DockerInitializer:
|
|
253
253
|
# issue with nvidia container toolkit:
|
254
254
|
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
|
255
255
|
self._run(
|
256
|
-
'
|
256
|
+
'{ which jq || sudo apt update && sudo apt install -y jq; } && '
|
257
|
+
'{ [ -f /etc/docker/daemon.json ] || '
|
257
258
|
'echo "{}" | sudo tee /etc/docker/daemon.json;'
|
258
259
|
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
|
259
260
|
'/etc/docker/daemon.json > /tmp/daemon.json;'
|
260
261
|
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
|
261
|
-
'sudo systemctl restart docker')
|
262
|
+
'sudo systemctl restart docker; } || true')
|
262
263
|
user_docker_run_options = self.docker_config.get('run_options', [])
|
263
264
|
start_command = docker_start_cmds(
|
264
265
|
specific_image,
|
@@ -335,7 +336,11 @@ class DockerInitializer:
|
|
335
336
|
|
336
337
|
def _check_docker_installed(self):
|
337
338
|
no_exist = 'NoExist'
|
339
|
+
# SkyPilot: Add the current user to the docker group first (if needed),
|
340
|
+
# before checking if docker is installed to avoid permission issues.
|
338
341
|
cleaned_output = self._run(
|
342
|
+
'id -nG $USER | grep -qw docker || '
|
343
|
+
'sudo usermod -aG docker $USER > /dev/null 2>&1;'
|
339
344
|
f'command -v {self.docker_cmd} || echo {no_exist!r}')
|
340
345
|
if no_exist in cleaned_output or 'docker' not in cleaned_output:
|
341
346
|
logger.error(
|
@@ -424,8 +429,8 @@ class DockerInitializer:
|
|
424
429
|
def _check_container_exited(self) -> bool:
|
425
430
|
if self.initialized:
|
426
431
|
return True
|
427
|
-
output =
|
428
|
-
|
429
|
-
|
430
|
-
return 'false' in output.lower(
|
431
|
-
|
432
|
+
output = self._run(check_docker_running_cmd(self.container_name,
|
433
|
+
self.docker_cmd),
|
434
|
+
wait_for_docker_daemon=True)
|
435
|
+
return ('false' in output.lower() and
|
436
|
+
'no such object' not in output.lower())
|
@@ -132,6 +132,8 @@ class PaperspaceCloudClient:
|
|
132
132
|
'apt-get update \n'
|
133
133
|
'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n' # pylint: disable=line-too-long
|
134
134
|
'fi \n'
|
135
|
+
# TODO(tian): Maybe remove this as well since we are now adding
|
136
|
+
# users to docker group in the DockerInitializer. Need to test.
|
135
137
|
'usermod -aG docker paperspace \n'
|
136
138
|
f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
|
137
139
|
try:
|
sky/resources.py
CHANGED
@@ -842,12 +842,6 @@ class Resources:
|
|
842
842
|
|
843
843
|
if self.extract_docker_image() is not None:
|
844
844
|
# TODO(tian): validate the docker image exists / of reasonable size
|
845
|
-
if self.accelerators is not None:
|
846
|
-
for acc in self.accelerators.keys():
|
847
|
-
if acc.lower().startswith('tpu'):
|
848
|
-
with ux_utils.print_exception_no_traceback():
|
849
|
-
raise ValueError(
|
850
|
-
'Docker image is not supported for TPU VM.')
|
851
845
|
if self.cloud is not None:
|
852
846
|
self.cloud.check_features_are_supported(
|
853
847
|
self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE})
|
@@ -1032,6 +1026,12 @@ class Resources:
|
|
1032
1026
|
self.accelerators is not None):
|
1033
1027
|
initial_setup_commands = [constants.DISABLE_GPU_ECC_COMMAND]
|
1034
1028
|
|
1029
|
+
docker_image = self.extract_docker_image()
|
1030
|
+
|
1031
|
+
# Cloud specific variables
|
1032
|
+
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1033
|
+
self, cluster_name, region, zones, dryrun)
|
1034
|
+
|
1035
1035
|
# Docker run options
|
1036
1036
|
docker_run_options = skypilot_config.get_nested(
|
1037
1037
|
('docker', 'run_options'),
|
@@ -1039,18 +1039,17 @@ class Resources:
|
|
1039
1039
|
override_configs=self.cluster_config_overrides)
|
1040
1040
|
if isinstance(docker_run_options, str):
|
1041
1041
|
docker_run_options = [docker_run_options]
|
1042
|
+
# Special accelerator runtime might require additional docker run
|
1043
|
+
# options. e.g., for TPU, we need --privileged.
|
1044
|
+
if 'docker_run_options' in cloud_specific_variables:
|
1045
|
+
docker_run_options.extend(
|
1046
|
+
cloud_specific_variables['docker_run_options'])
|
1042
1047
|
if docker_run_options and isinstance(self.cloud, clouds.Kubernetes):
|
1043
1048
|
logger.warning(
|
1044
1049
|
f'{colorama.Style.DIM}Docker run options are specified, '
|
1045
1050
|
'but ignored for Kubernetes: '
|
1046
1051
|
f'{" ".join(docker_run_options)}'
|
1047
1052
|
f'{colorama.Style.RESET_ALL}')
|
1048
|
-
|
1049
|
-
docker_image = self.extract_docker_image()
|
1050
|
-
|
1051
|
-
# Cloud specific variables
|
1052
|
-
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1053
|
-
self, cluster_name, region, zones, dryrun)
|
1054
1053
|
return dict(
|
1055
1054
|
cloud_specific_variables,
|
1056
1055
|
**{
|
sky/serve/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from sky.serve.constants import SKYSERVE_METADATA_DIR
|
|
8
8
|
from sky.serve.core import down
|
9
9
|
from sky.serve.core import status
|
10
10
|
from sky.serve.core import tail_logs
|
11
|
+
from sky.serve.core import terminate_replica
|
11
12
|
from sky.serve.core import up
|
12
13
|
from sky.serve.core import update
|
13
14
|
from sky.serve.serve_state import ReplicaStatus
|
@@ -42,6 +43,7 @@ __all__ = [
|
|
42
43
|
'SKY_SERVE_CONTROLLER_NAME',
|
43
44
|
'SKYSERVE_METADATA_DIR',
|
44
45
|
'status',
|
46
|
+
'terminate_replica',
|
45
47
|
'tail_logs',
|
46
48
|
'up',
|
47
49
|
'update',
|
sky/serve/constants.py
CHANGED
@@ -92,4 +92,11 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
|
|
92
92
|
# change for the serve_utils.ServeCodeGen, we need to bump this version, so that
|
93
93
|
# the user can be notified to update their SkyPilot serve version on the remote
|
94
94
|
# cluster.
|
95
|
-
|
95
|
+
# Changelog:
|
96
|
+
# v1.0 - Introduce rolling update.
|
97
|
+
# v2.0 - Added template-replica feature.
|
98
|
+
SERVE_VERSION = 2
|
99
|
+
|
100
|
+
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
101
|
+
'The version of service is outdated and does not support manually '
|
102
|
+
'terminating replicas. Please terminate the service and spin up again.')
|
sky/serve/controller.py
CHANGED
@@ -9,6 +9,7 @@ import time
|
|
9
9
|
import traceback
|
10
10
|
from typing import Any, Dict, List
|
11
11
|
|
12
|
+
import colorama
|
12
13
|
import fastapi
|
13
14
|
from fastapi import responses
|
14
15
|
import uvicorn
|
@@ -157,6 +158,75 @@ class SkyServeController:
|
|
157
158
|
return responses.JSONResponse(content={'message': 'Error'},
|
158
159
|
status_code=500)
|
159
160
|
|
161
|
+
@self._app.post('/controller/terminate_replica')
|
162
|
+
async def terminate_replica(
|
163
|
+
request: fastapi.Request) -> fastapi.Response:
|
164
|
+
request_data = await request.json()
|
165
|
+
replica_id = request_data['replica_id']
|
166
|
+
assert isinstance(replica_id,
|
167
|
+
int), 'Error: replica ID must be an integer.'
|
168
|
+
purge = request_data['purge']
|
169
|
+
assert isinstance(purge, bool), 'Error: purge must be a boolean.'
|
170
|
+
replica_info = serve_state.get_replica_info_from_id(
|
171
|
+
self._service_name, replica_id)
|
172
|
+
assert replica_info is not None, (f'Error: replica '
|
173
|
+
f'{replica_id} does not exist.')
|
174
|
+
replica_status = replica_info.status
|
175
|
+
|
176
|
+
if replica_status == serve_state.ReplicaStatus.SHUTTING_DOWN:
|
177
|
+
return responses.JSONResponse(
|
178
|
+
status_code=409,
|
179
|
+
content={
|
180
|
+
'message':
|
181
|
+
f'Replica {replica_id} of service '
|
182
|
+
f'{self._service_name!r} is already in the process '
|
183
|
+
f'of terminating. Skip terminating now.'
|
184
|
+
})
|
185
|
+
|
186
|
+
if (replica_status in serve_state.ReplicaStatus.failed_statuses()
|
187
|
+
and not purge):
|
188
|
+
return responses.JSONResponse(
|
189
|
+
status_code=409,
|
190
|
+
content={
|
191
|
+
'message': f'{colorama.Fore.YELLOW}Replica '
|
192
|
+
f'{replica_id} of service '
|
193
|
+
f'{self._service_name!r} is in failed '
|
194
|
+
f'status ({replica_info.status}). '
|
195
|
+
f'Skipping its termination as it could '
|
196
|
+
f'lead to a resource leak. '
|
197
|
+
f'(Use `sky serve down '
|
198
|
+
f'{self._service_name!r} --replica-id '
|
199
|
+
f'{replica_id} --purge` to '
|
200
|
+
'forcefully terminate the replica.)'
|
201
|
+
f'{colorama.Style.RESET_ALL}'
|
202
|
+
})
|
203
|
+
|
204
|
+
self._replica_manager.scale_down(replica_id, purge=purge)
|
205
|
+
|
206
|
+
action = 'terminated' if not purge else 'purged'
|
207
|
+
message = (f'{colorama.Fore.GREEN}Replica {replica_id} of service '
|
208
|
+
f'{self._service_name!r} is scheduled to be '
|
209
|
+
f'{action}.{colorama.Style.RESET_ALL}\n'
|
210
|
+
f'Please use {ux_utils.BOLD}sky serve status '
|
211
|
+
f'{self._service_name}{ux_utils.RESET_BOLD} '
|
212
|
+
f'to check the latest status.')
|
213
|
+
return responses.JSONResponse(status_code=200,
|
214
|
+
content={'message': message})
|
215
|
+
|
216
|
+
@self._app.exception_handler(Exception)
|
217
|
+
async def validation_exception_handler(
|
218
|
+
request: fastapi.Request, exc: Exception) -> fastapi.Response:
|
219
|
+
with ux_utils.enable_traceback():
|
220
|
+
logger.error(f'Error in controller: {exc!r}')
|
221
|
+
return responses.JSONResponse(
|
222
|
+
status_code=500,
|
223
|
+
content={
|
224
|
+
'message':
|
225
|
+
(f'Failed method {request.method} at URL {request.url}.'
|
226
|
+
f' Exception message is {exc!r}.')
|
227
|
+
},
|
228
|
+
)
|
229
|
+
|
160
230
|
threading.Thread(target=self._run_autoscaler).start()
|
161
231
|
|
162
232
|
logger.info('SkyServe Controller started on '
|
sky/serve/core.py
CHANGED
@@ -503,6 +503,53 @@ def down(
|
|
503
503
|
sky_logging.print(stdout)
|
504
504
|
|
505
505
|
|
506
|
+
@usage_lib.entrypoint
|
507
|
+
def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
|
508
|
+
"""Tear down a specific replica for the given service.
|
509
|
+
|
510
|
+
Args:
|
511
|
+
service_name: Name of the service.
|
512
|
+
replica_id: ID of replica to terminate.
|
513
|
+
purge: Whether to terminate replicas in a failed status. These replicas
|
514
|
+
may lead to resource leaks, so we require the user to explicitly
|
515
|
+
specify this flag to make sure they are aware of this potential
|
516
|
+
resource leak.
|
517
|
+
|
518
|
+
Raises:
|
519
|
+
sky.exceptions.ClusterNotUpError: if the sky sere controller is not up.
|
520
|
+
RuntimeError: if failed to terminate the replica.
|
521
|
+
"""
|
522
|
+
handle = backend_utils.is_controller_accessible(
|
523
|
+
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
524
|
+
stopped_message=
|
525
|
+
'No service is running now. Please spin up a service first.',
|
526
|
+
non_existent_message='No service is running now. '
|
527
|
+
'Please spin up a service first.',
|
528
|
+
)
|
529
|
+
|
530
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
531
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
532
|
+
|
533
|
+
code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
|
534
|
+
purge)
|
535
|
+
returncode, stdout, stderr = backend.run_on_head(handle,
|
536
|
+
code,
|
537
|
+
require_outputs=True,
|
538
|
+
stream_logs=False,
|
539
|
+
separate_stderr=True)
|
540
|
+
|
541
|
+
try:
|
542
|
+
subprocess_utils.handle_returncode(returncode,
|
543
|
+
code,
|
544
|
+
'Failed to terminate the replica',
|
545
|
+
stderr,
|
546
|
+
stream_logs=True)
|
547
|
+
except exceptions.CommandError as e:
|
548
|
+
raise RuntimeError(e.error_msg) from e
|
549
|
+
|
550
|
+
sky_logging.print(stdout)
|
551
|
+
|
552
|
+
|
506
553
|
@usage_lib.entrypoint
|
507
554
|
def status(
|
508
555
|
service_names: Optional[Union[str,
|
sky/serve/replica_managers.py
CHANGED
@@ -247,6 +247,8 @@ class ReplicaStatusProperty:
|
|
247
247
|
is_scale_down: bool = False
|
248
248
|
# The replica's spot instance was preempted.
|
249
249
|
preempted: bool = False
|
250
|
+
# Whether the replica is purged.
|
251
|
+
purged: bool = False
|
250
252
|
|
251
253
|
def remove_terminated_replica(self) -> bool:
|
252
254
|
"""Whether to remove the replica record from the replica table.
|
@@ -307,6 +309,8 @@ class ReplicaStatusProperty:
|
|
307
309
|
return False
|
308
310
|
if self.preempted:
|
309
311
|
return False
|
312
|
+
if self.purged:
|
313
|
+
return False
|
310
314
|
return True
|
311
315
|
|
312
316
|
def to_replica_status(self) -> serve_state.ReplicaStatus:
|
@@ -590,7 +594,7 @@ class ReplicaManager:
|
|
590
594
|
"""
|
591
595
|
raise NotImplementedError
|
592
596
|
|
593
|
-
def scale_down(self, replica_id: int) -> None:
|
597
|
+
def scale_down(self, replica_id: int, purge: bool = False) -> None:
|
594
598
|
"""Scale down replica with replica_id."""
|
595
599
|
raise NotImplementedError
|
596
600
|
|
@@ -679,7 +683,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
679
683
|
replica_id: int,
|
680
684
|
sync_down_logs: bool,
|
681
685
|
replica_drain_delay_seconds: int,
|
682
|
-
is_scale_down: bool = False
|
686
|
+
is_scale_down: bool = False,
|
687
|
+
purge: bool = False) -> None:
|
683
688
|
|
684
689
|
if replica_id in self._launch_process_pool:
|
685
690
|
info = serve_state.get_replica_info_from_id(self._service_name,
|
@@ -763,16 +768,18 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
763
768
|
)
|
764
769
|
info.status_property.sky_down_status = ProcessStatus.RUNNING
|
765
770
|
info.status_property.is_scale_down = is_scale_down
|
771
|
+
info.status_property.purged = purge
|
766
772
|
serve_state.add_or_update_replica(self._service_name, replica_id, info)
|
767
773
|
p.start()
|
768
774
|
self._down_process_pool[replica_id] = p
|
769
775
|
|
770
|
-
def scale_down(self, replica_id: int) -> None:
|
776
|
+
def scale_down(self, replica_id: int, purge: bool = False) -> None:
|
771
777
|
self._terminate_replica(
|
772
778
|
replica_id,
|
773
779
|
sync_down_logs=False,
|
774
780
|
replica_drain_delay_seconds=_DEFAULT_DRAIN_SECONDS,
|
775
|
-
is_scale_down=True
|
781
|
+
is_scale_down=True,
|
782
|
+
purge=purge)
|
776
783
|
|
777
784
|
def _handle_preemption(self, info: ReplicaInfo) -> bool:
|
778
785
|
"""Handle preemption of the replica if any error happened.
|
@@ -911,6 +918,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
911
918
|
# since user should fixed the error before update.
|
912
919
|
elif info.version != self.latest_version:
|
913
920
|
removal_reason = 'for version outdated'
|
921
|
+
elif info.status_property.purged:
|
922
|
+
removal_reason = 'for purge'
|
914
923
|
else:
|
915
924
|
logger.info(f'Termination of replica {replica_id} '
|
916
925
|
'finished. Replica info is kept since some '
|
sky/serve/serve_utils.py
CHANGED
@@ -246,9 +246,11 @@ def set_service_status_and_active_versions_from_replica(
|
|
246
246
|
update_mode: UpdateMode) -> None:
|
247
247
|
record = serve_state.get_service_from_name(service_name)
|
248
248
|
if record is None:
|
249
|
-
|
250
|
-
|
251
|
-
|
249
|
+
with ux_utils.print_exception_no_traceback():
|
250
|
+
raise ValueError(
|
251
|
+
'The service is up-ed in an old version and does not '
|
252
|
+
'support update. Please `sky serve down` '
|
253
|
+
'it first and relaunch the service.')
|
252
254
|
if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
|
253
255
|
# When the service is shutting down, there is a period of time which the
|
254
256
|
# controller still responds to the request, and the replica is not
|
@@ -289,7 +291,8 @@ def update_service_status() -> None:
|
|
289
291
|
def update_service_encoded(service_name: str, version: int, mode: str) -> str:
|
290
292
|
service_status = _get_service_status(service_name)
|
291
293
|
if service_status is None:
|
292
|
-
|
294
|
+
with ux_utils.print_exception_no_traceback():
|
295
|
+
raise ValueError(f'Service {service_name!r} does not exist.')
|
293
296
|
controller_port = service_status['controller_port']
|
294
297
|
resp = requests.post(
|
295
298
|
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
@@ -299,20 +302,56 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
|
|
299
302
|
'mode': mode,
|
300
303
|
})
|
301
304
|
if resp.status_code == 404:
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
+
with ux_utils.print_exception_no_traceback():
|
306
|
+
raise ValueError(
|
307
|
+
'The service is up-ed in an old version and does not '
|
308
|
+
'support update. Please `sky serve down` '
|
309
|
+
'it first and relaunch the service. ')
|
305
310
|
elif resp.status_code == 400:
|
306
|
-
|
311
|
+
with ux_utils.print_exception_no_traceback():
|
312
|
+
raise ValueError(f'Client error during service update: {resp.text}')
|
307
313
|
elif resp.status_code == 500:
|
308
|
-
|
314
|
+
with ux_utils.print_exception_no_traceback():
|
315
|
+
raise RuntimeError(
|
316
|
+
f'Server error during service update: {resp.text}')
|
309
317
|
elif resp.status_code != 200:
|
310
|
-
|
318
|
+
with ux_utils.print_exception_no_traceback():
|
319
|
+
raise ValueError(f'Failed to update service: {resp.text}')
|
311
320
|
|
312
321
|
service_msg = resp.json()['message']
|
313
322
|
return common_utils.encode_payload(service_msg)
|
314
323
|
|
315
324
|
|
325
|
+
def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
|
326
|
+
service_status = _get_service_status(service_name)
|
327
|
+
if service_status is None:
|
328
|
+
with ux_utils.print_exception_no_traceback():
|
329
|
+
raise ValueError(f'Service {service_name!r} does not exist.')
|
330
|
+
replica_info = serve_state.get_replica_info_from_id(service_name,
|
331
|
+
replica_id)
|
332
|
+
if replica_info is None:
|
333
|
+
with ux_utils.print_exception_no_traceback():
|
334
|
+
raise ValueError(
|
335
|
+
f'Replica {replica_id} for service {service_name} does not '
|
336
|
+
'exist.')
|
337
|
+
|
338
|
+
controller_port = service_status['controller_port']
|
339
|
+
resp = requests.post(
|
340
|
+
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
341
|
+
'/controller/terminate_replica',
|
342
|
+
json={
|
343
|
+
'replica_id': replica_id,
|
344
|
+
'purge': purge,
|
345
|
+
})
|
346
|
+
|
347
|
+
message: str = resp.json()['message']
|
348
|
+
if resp.status_code != 200:
|
349
|
+
with ux_utils.print_exception_no_traceback():
|
350
|
+
raise ValueError(f'Failed to terminate replica {replica_id} '
|
351
|
+
f'in {service_name}. Reason:\n{message}')
|
352
|
+
return message
|
353
|
+
|
354
|
+
|
316
355
|
def _get_service_status(
|
317
356
|
service_name: str,
|
318
357
|
with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
|
@@ -644,8 +683,9 @@ def stream_replica_logs(service_name: str, replica_id: int,
|
|
644
683
|
for info in replica_info:
|
645
684
|
if info.replica_id == replica_id:
|
646
685
|
return info.status
|
647
|
-
|
648
|
-
|
686
|
+
with ux_utils.print_exception_no_traceback():
|
687
|
+
raise ValueError(
|
688
|
+
_FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
|
649
689
|
|
650
690
|
finish_stream = (
|
651
691
|
lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
|
@@ -735,7 +775,7 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
|
|
735
775
|
|
736
776
|
|
737
777
|
def get_endpoint(service_record: Dict[str, Any]) -> str:
|
738
|
-
# Don't use backend_utils.
|
778
|
+
# Don't use backend_utils.is_controller_accessible since it is too slow.
|
739
779
|
handle = global_user_state.get_handle_from_cluster_name(
|
740
780
|
SKY_SERVE_CONTROLLER_NAME)
|
741
781
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
@@ -915,6 +955,18 @@ class ServeCodeGen:
|
|
915
955
|
]
|
916
956
|
return cls._build(code)
|
917
957
|
|
958
|
+
@classmethod
|
959
|
+
def terminate_replica(cls, service_name: str, replica_id: int,
|
960
|
+
purge: bool) -> str:
|
961
|
+
code = [
|
962
|
+
f'(lambda: print(serve_utils.terminate_replica({service_name!r}, '
|
963
|
+
f'{replica_id}, {purge}), end="", flush=True) '
|
964
|
+
'if getattr(constants, "SERVE_VERSION", 0) >= 2 else '
|
965
|
+
f'exec("raise RuntimeError('
|
966
|
+
f'{constants.TERMINATE_REPLICA_VERSION_MISMATCH_ERROR!r})"))()'
|
967
|
+
]
|
968
|
+
return cls._build(code)
|
969
|
+
|
918
970
|
@classmethod
|
919
971
|
def wait_service_registration(cls, service_name: str, job_id: int) -> str:
|
920
972
|
code = [
|
sky/templates/lambda-ray.yml.j2
CHANGED
@@ -5,6 +5,26 @@ max_workers: {{num_nodes - 1}}
|
|
5
5
|
upscaling_speed: {{num_nodes - 1}}
|
6
6
|
idle_timeout_minutes: 60
|
7
7
|
|
8
|
+
{%- if docker_image is not none %}
|
9
|
+
docker:
|
10
|
+
image: {{docker_image}}
|
11
|
+
container_name: {{docker_container_name}}
|
12
|
+
run_options:
|
13
|
+
- --ulimit nofile=1048576:1048576
|
14
|
+
{%- for run_option in docker_run_options %}
|
15
|
+
- {{run_option}}
|
16
|
+
{%- endfor %}
|
17
|
+
{%- if docker_login_config is not none %}
|
18
|
+
docker_login_config:
|
19
|
+
username: |-
|
20
|
+
{{docker_login_config.username}}
|
21
|
+
password: |-
|
22
|
+
{{docker_login_config.password}}
|
23
|
+
server: |-
|
24
|
+
{{docker_login_config.server}}
|
25
|
+
{%- endif %}
|
26
|
+
{%- endif %}
|
27
|
+
|
8
28
|
provider:
|
9
29
|
type: external
|
10
30
|
module: sky.provision.lambda
|
sky/templates/oci-ray.yml.j2
CHANGED
@@ -16,7 +16,11 @@ provider:
|
|
16
16
|
disable_launch_config_check: true
|
17
17
|
|
18
18
|
auth:
|
19
|
+
{% if os_type == "ubuntu" %}
|
19
20
|
ssh_user: ubuntu
|
21
|
+
{% else %}
|
22
|
+
ssh_user: opc
|
23
|
+
{% endif %}
|
20
24
|
ssh_private_key: {{ssh_private_key}}
|
21
25
|
|
22
26
|
available_node_types:
|
@@ -85,14 +89,20 @@ setup_commands:
|
|
85
89
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
86
90
|
# Line 'mkdir -p ..': disable host key check
|
87
91
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
88
|
-
-
|
92
|
+
- echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true;
|
93
|
+
{%- if os_type == "ubuntu" %}
|
94
|
+
sudo systemctl stop unattended-upgrades || true;
|
89
95
|
sudo systemctl disable unattended-upgrades || true;
|
90
96
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
91
97
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
92
98
|
sudo pkill -9 apt-get;
|
93
99
|
sudo pkill -9 dpkg;
|
94
100
|
sudo dpkg --configure -a;
|
95
|
-
|
101
|
+
{%- else %}
|
102
|
+
sudo /usr/libexec/oci-growfs -y || true;
|
103
|
+
sudo systemctl stop firewalld || true;
|
104
|
+
sudo systemctl disable firewalld || true;
|
105
|
+
{%- endif %}
|
96
106
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
97
107
|
{{ conda_installation_commands }}
|
98
108
|
{{ ray_skypilot_installation_commands }}
|
sky/utils/command_runner.py
CHANGED
@@ -502,8 +502,10 @@ class SSHCommandRunner(CommandRunner):
|
|
502
502
|
if self.ssh_control_name is not None:
|
503
503
|
control_path = _ssh_control_path(self.ssh_control_name)
|
504
504
|
if control_path is not None:
|
505
|
+
# Suppress the `Exit request sent.` output for this command
|
506
|
+
# which would interrupt the CLI spinner.
|
505
507
|
cmd = (f'ssh -O exit -S {control_path}/%C '
|
506
|
-
f'{self.ssh_user}@{self.ip}')
|
508
|
+
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|
507
509
|
logger.debug(f'Closing cached connection {control_path!r} with '
|
508
510
|
f'cmd: {cmd}')
|
509
511
|
log_lib.run_with_log(cmd,
|
{skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=3RKD64rxAs9PlurlTvQY9SSAILIPEznrfVw4n_oyctk,5854
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=XcQeVtH5J7xcARGagYicmTUfd2145jN2nvnZaZXtZlI,209981
|
6
6
|
sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
7
7
|
sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
|
8
8
|
sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
|
@@ -10,7 +10,7 @@ sky/exceptions.py,sha256=D7WARzYRt4dGjXo6gI-gzkoodZbKF1D-qncm_DbHB28,8846
|
|
10
10
|
sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
|
11
11
|
sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
|
12
12
|
sky/optimizer.py,sha256=OzxWiA6ZC0tyJ1eNMy4e72vitjfLKfbOLF9ywZOccXU,59343
|
13
|
-
sky/resources.py,sha256=
|
13
|
+
sky/resources.py,sha256=Qk_CYvLO8OFsnRLqXu-nG6qXfJEZ2aBMzxFJHYaXTvE,67398
|
14
14
|
sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
|
15
15
|
sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
|
16
16
|
sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
|
@@ -41,16 +41,16 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
|
|
41
41
|
sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
|
42
42
|
sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
|
43
43
|
sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
|
44
|
-
sky/clouds/azure.py,sha256=
|
44
|
+
sky/clouds/azure.py,sha256=jTgynKU5tuOyBe97n2I7_k9P0Sw0QFU-6wLDLFwQhfM,28634
|
45
45
|
sky/clouds/cloud.py,sha256=PPk-Cbf1YbJT8bswcQLtPBtko02OWrRGJKkLzDpytTI,34858
|
46
46
|
sky/clouds/cloud_registry.py,sha256=4yQMv-iBSgyN5aNL4Qxbn0JVE-dkVoEUIgj7S1z9S_Q,955
|
47
47
|
sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
|
48
48
|
sky/clouds/fluidstack.py,sha256=iOmoOx52yTrHKMzwBDaxFJCfNo79M61d5tj-Np24Lyc,12436
|
49
|
-
sky/clouds/gcp.py,sha256=
|
49
|
+
sky/clouds/gcp.py,sha256=m_dH04HqgU-DdW4R9wrSr66IpPt9JMKHEvHEGGFpeRo,54655
|
50
50
|
sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
|
51
51
|
sky/clouds/kubernetes.py,sha256=aWoXWR-S4puZHzuUHroLKxLdTpkqU7j75dQlXECnsmE,28679
|
52
|
-
sky/clouds/lambda_cloud.py,sha256=
|
53
|
-
sky/clouds/oci.py,sha256=
|
52
|
+
sky/clouds/lambda_cloud.py,sha256=11dKUSunHUgaPZ1t8O85X29_NJ-o26sCt5DjwAPFgl4,12697
|
53
|
+
sky/clouds/oci.py,sha256=ecVgcbCVJwDLtaYXs-yGDzwPYRr23KvjnzFOXwaY2O0,26914
|
54
54
|
sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
|
55
55
|
sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
|
56
56
|
sky/clouds/scp.py,sha256=2KLTuNSMdBzK8CLwSesv7efOuiLidIMoyNG4AOt5Sqw,15870
|
@@ -67,7 +67,7 @@ sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOi
|
|
67
67
|
sky/clouds/service_catalog/ibm_catalog.py,sha256=0dzjmXABFECzaAuIa0E6pVINhVK6-G6U52Mj-L45gK8,4472
|
68
68
|
sky/clouds/service_catalog/kubernetes_catalog.py,sha256=6OocEUkgyJtBgHwzu4RPsvru6pj6RwGU-4uSFNQmsSM,8254
|
69
69
|
sky/clouds/service_catalog/lambda_catalog.py,sha256=BAhUGqHj8aVe1zUhEQNO7bQUhcd9jAespGvPyQubTJY,5281
|
70
|
-
sky/clouds/service_catalog/oci_catalog.py,sha256=
|
70
|
+
sky/clouds/service_catalog/oci_catalog.py,sha256=AG1mOgc-iWaX4zapONWMZPNd2RCKCsaNOyFc0eq_LFU,8551
|
71
71
|
sky/clouds/service_catalog/paperspace_catalog.py,sha256=W8GgGlPbbWViELQ8EZfmIkxSbeQcCmMRUX4ecIIYDsk,3768
|
72
72
|
sky/clouds/service_catalog/runpod_catalog.py,sha256=NwZlolzihZeRxQKYIDhoXeUkJ3BSH1S6B_DszNDXT1g,4184
|
73
73
|
sky/clouds/service_catalog/scp_catalog.py,sha256=4XnaZE5Q4XrrNnDnVhsHkH6jxmWXBeQqa9QqKqHKjSI,5174
|
@@ -84,7 +84,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DS
|
|
84
84
|
sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
85
85
|
sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
|
86
86
|
sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
|
87
|
-
sky/clouds/utils/oci_utils.py,sha256=
|
87
|
+
sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
|
88
88
|
sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
|
89
89
|
sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
|
90
90
|
sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
|
@@ -105,7 +105,7 @@ sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx
|
|
105
105
|
sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
|
106
106
|
sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
|
107
107
|
sky/provision/constants.py,sha256=DvHj3wpqdpaSBHMOGIfVWLLWGJoz0eOQAx73DwYMNEk,531
|
108
|
-
sky/provision/docker_utils.py,sha256=
|
108
|
+
sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
|
109
109
|
sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
|
110
110
|
sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
|
111
111
|
sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
|
@@ -151,7 +151,7 @@ sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByx
|
|
151
151
|
sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
|
152
152
|
sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
|
153
153
|
sky/provision/paperspace/instance.py,sha256=q_V01DZSMXLfy63Zwt6AQotq02JuXQZb5CHS_JttlwE,12046
|
154
|
-
sky/provision/paperspace/utils.py,sha256=
|
154
|
+
sky/provision/paperspace/utils.py,sha256=uOmxbDKjV6skFizC4gYXSxRuEqso5ck2kF7MbtNmhEs,9580
|
155
155
|
sky/provision/runpod/__init__.py,sha256=6HYvHI27EaLrX1SS0vWVhdLu5HDBeZCdvAeDJuwM5pk,556
|
156
156
|
sky/provision/runpod/config.py,sha256=9ulZJVL7nHuxhTdoj8D7lNn7SdicJ5zc6FIcHIG9tcg,321
|
157
157
|
sky/provision/runpod/instance.py,sha256=ucmFQEzapbxylsl6K9EUo7bHTZYzvfECo6tpJc-MFrw,9577
|
@@ -171,16 +171,16 @@ sky/provision/vsphere/common/service_manager_factory.py,sha256=YkvfHiRXFK_Nb406z
|
|
171
171
|
sky/provision/vsphere/common/ssl_helper.py,sha256=TYzN9K0i_Mk_17PKGyGPgvOGfoizysuuIeYapcy_tWE,795
|
172
172
|
sky/provision/vsphere/common/vapiconnect.py,sha256=R2I1ZWBA19d11fZ_FrIzQT8E1aLl1HU4Rdcj8Z5r3NE,2932
|
173
173
|
sky/provision/vsphere/common/vim_utils.py,sha256=EMWLS8ILpdx6XwUZ9I53y0B_1yFrRrlr4jjIMT84hAc,17877
|
174
|
-
sky/serve/__init__.py,sha256=
|
174
|
+
sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
|
175
175
|
sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
|
176
|
-
sky/serve/constants.py,sha256=
|
177
|
-
sky/serve/controller.py,sha256=
|
178
|
-
sky/serve/core.py,sha256=
|
176
|
+
sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
|
177
|
+
sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
|
178
|
+
sky/serve/core.py,sha256=pz62ERWyHcg2p-rtzVjBZaWmKrK6Hx213YPoa_J5Tlo,31097
|
179
179
|
sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
|
180
180
|
sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
|
181
|
-
sky/serve/replica_managers.py,sha256=
|
181
|
+
sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
|
182
182
|
sky/serve/serve_state.py,sha256=5BZSKKKxQRk-0mku17Ch4Veu4qOhaFvaOJY3zrZCkLw,19315
|
183
|
-
sky/serve/serve_utils.py,sha256=
|
183
|
+
sky/serve/serve_utils.py,sha256=9tqh7i-99Kll-24sKhfjEzjTOnGXWJQdeqIyNkFVoMo,40180
|
184
184
|
sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
|
185
185
|
sky/serve/service_spec.py,sha256=iRhW95SERvb4NWtV10uCuhgvW31HuSAmZZ55OX0WK8s,15309
|
186
186
|
sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
|
@@ -230,9 +230,9 @@ sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8w
|
|
230
230
|
sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
|
231
231
|
sky/templates/kubernetes-ray.yml.j2,sha256=Wq9luXc6-t141uyHbtOy1IDmLMM0PBbePTZfZEtAKw0,18160
|
232
232
|
sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
|
233
|
-
sky/templates/lambda-ray.yml.j2,sha256=
|
233
|
+
sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
|
234
234
|
sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
|
235
|
-
sky/templates/oci-ray.yml.j2,sha256=
|
235
|
+
sky/templates/oci-ray.yml.j2,sha256=E-xnadts-x88vYRI1QGFzgfGGKFospmo2N9d_0cPN5I,7144
|
236
236
|
sky/templates/paperspace-ray.yml.j2,sha256=HQjZNamrB_a4fOMCxQXSVdV5JIHtbGtAE0JzEO8uuVQ,4021
|
237
237
|
sky/templates/runpod-ray.yml.j2,sha256=p3BtYBHzROtNJqnjEo1xCmGSJQfCZYdarWszhDYyl0Q,3697
|
238
238
|
sky/templates/scp-ray.yml.j2,sha256=I9u8Ax-lit-d6UrCC9BVU8avst8w1cwK6TrzZBcz_JM,5608
|
@@ -245,7 +245,7 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
245
245
|
sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
|
246
246
|
sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
|
247
247
|
sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
|
248
|
-
sky/utils/command_runner.py,sha256=
|
248
|
+
sky/utils/command_runner.py,sha256=TEFJlmIGzlZxZppcBdwDK4AscM0-08L2XRFwQIRK9OA,34784
|
249
249
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
250
250
|
sky/utils/common_utils.py,sha256=MwFhIcvCEMBo7kbENUjN3qRNO5SoMV0fzAORc65c5x0,24525
|
251
251
|
sky/utils/controller_utils.py,sha256=V05hiLJIjqqXssYzs_Gchk4-tijgpMgLJsRW8ymhS-E,40625
|
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
|
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
275
275
|
sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
|
276
276
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
277
|
-
skypilot_nightly-1.0.0.
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
277
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/METADATA,sha256=mv3zgaXoDB_-9jX9Sk5tEP5oiCfv5xZ76gMbFothu4g,19540
|
279
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
280
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
+
skypilot_nightly-1.0.0.dev20241021.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241019.dist-info → skypilot_nightly-1.0.0.dev20241021.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|