skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +9 -11
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vast/utils.py +2 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +55 -40
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +441 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/backends/backend_utils.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
from datetime import datetime
|
3
3
|
import enum
|
4
4
|
import fnmatch
|
5
|
-
import functools
|
6
5
|
import hashlib
|
7
6
|
import os
|
8
7
|
import pathlib
|
@@ -12,7 +11,6 @@ import shlex
|
|
12
11
|
import subprocess
|
13
12
|
import sys
|
14
13
|
import tempfile
|
15
|
-
import textwrap
|
16
14
|
import time
|
17
15
|
import typing
|
18
16
|
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
@@ -38,20 +36,21 @@ from sky import global_user_state
|
|
38
36
|
from sky import provision as provision_lib
|
39
37
|
from sky import sky_logging
|
40
38
|
from sky import skypilot_config
|
41
|
-
from sky import status_lib
|
42
|
-
from sky.clouds import cloud_registry
|
43
39
|
from sky.provision import instance_setup
|
44
40
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
45
41
|
from sky.skylet import constants
|
46
42
|
from sky.usage import usage_lib
|
47
|
-
from sky.utils import
|
43
|
+
from sky.utils import cluster_utils
|
48
44
|
from sky.utils import command_runner
|
45
|
+
from sky.utils import common
|
49
46
|
from sky.utils import common_utils
|
50
47
|
from sky.utils import controller_utils
|
51
48
|
from sky.utils import env_options
|
49
|
+
from sky.utils import registry
|
52
50
|
from sky.utils import resources_utils
|
53
51
|
from sky.utils import rich_utils
|
54
52
|
from sky.utils import schemas
|
53
|
+
from sky.utils import status_lib
|
55
54
|
from sky.utils import subprocess_utils
|
56
55
|
from sky.utils import timeline
|
57
56
|
from sky.utils import ux_utils
|
@@ -69,7 +68,6 @@ SKY_REMOTE_APP_DIR = '~/.sky/sky_app'
|
|
69
68
|
# Exclude subnet mask from IP address regex.
|
70
69
|
IP_ADDR_REGEX = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!/\d{1,2})\b'
|
71
70
|
SKY_REMOTE_PATH = '~/.sky/wheels'
|
72
|
-
SKY_USER_FILE_PATH = '~/.sky/generated'
|
73
71
|
|
74
72
|
# Do not use /tmp because it gets cleared on VM restart.
|
75
73
|
_SKY_REMOTE_FILE_MOUNTS_DIR = '~/.sky/file_mounts/'
|
@@ -191,7 +189,8 @@ def is_ip(s: str) -> bool:
|
|
191
189
|
|
192
190
|
|
193
191
|
def _get_yaml_path_from_cluster_name(cluster_name: str,
|
194
|
-
prefix: str = SKY_USER_FILE_PATH
|
192
|
+
prefix: str = constants.SKY_USER_FILE_PATH
|
193
|
+
) -> str:
|
195
194
|
output_path = pathlib.Path(
|
196
195
|
prefix).expanduser().resolve() / f'{cluster_name}.yml'
|
197
196
|
os.makedirs(output_path.parents[0], exist_ok=True)
|
@@ -421,182 +420,6 @@ class FileMountHelper(object):
|
|
421
420
|
return ' && '.join(commands)
|
422
421
|
|
423
422
|
|
424
|
-
class SSHConfigHelper(object):
|
425
|
-
"""Helper for handling local SSH configuration."""
|
426
|
-
|
427
|
-
ssh_conf_path = '~/.ssh/config'
|
428
|
-
ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
|
429
|
-
ssh_conf_per_cluster_lock_path = os.path.expanduser(
|
430
|
-
'~/.sky/ssh_config_{}.lock')
|
431
|
-
ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
|
432
|
-
|
433
|
-
@classmethod
|
434
|
-
def _get_generated_config(cls, autogen_comment: str, host_name: str,
|
435
|
-
ip: str, username: str, ssh_key_path: str,
|
436
|
-
proxy_command: Optional[str], port: int,
|
437
|
-
docker_proxy_command: Optional[str]):
|
438
|
-
if proxy_command is not None:
|
439
|
-
# Already checked in resources
|
440
|
-
assert docker_proxy_command is None, (
|
441
|
-
'Cannot specify both proxy_command and docker_proxy_command.')
|
442
|
-
proxy = f'ProxyCommand {proxy_command}'
|
443
|
-
elif docker_proxy_command is not None:
|
444
|
-
proxy = f'ProxyCommand {docker_proxy_command}'
|
445
|
-
else:
|
446
|
-
proxy = ''
|
447
|
-
# StrictHostKeyChecking=no skips the host key check for the first
|
448
|
-
# time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile/dev/null
|
449
|
-
# prevent the host key from being added to the known_hosts file and
|
450
|
-
# always return an empty file for known hosts, making the ssh think
|
451
|
-
# this is a first-time connection, and thus skipping the host key
|
452
|
-
# check.
|
453
|
-
codegen = textwrap.dedent(f"""\
|
454
|
-
{autogen_comment}
|
455
|
-
Host {host_name}
|
456
|
-
HostName {ip}
|
457
|
-
User {username}
|
458
|
-
IdentityFile {ssh_key_path}
|
459
|
-
AddKeysToAgent yes
|
460
|
-
IdentitiesOnly yes
|
461
|
-
ForwardAgent yes
|
462
|
-
StrictHostKeyChecking no
|
463
|
-
UserKnownHostsFile=/dev/null
|
464
|
-
GlobalKnownHostsFile=/dev/null
|
465
|
-
Port {port}
|
466
|
-
{proxy}
|
467
|
-
""".rstrip())
|
468
|
-
codegen = codegen + '\n'
|
469
|
-
return codegen
|
470
|
-
|
471
|
-
@classmethod
|
472
|
-
@timeline.FileLockEvent(ssh_conf_lock_path)
|
473
|
-
def add_cluster(
|
474
|
-
cls,
|
475
|
-
cluster_name: str,
|
476
|
-
ips: List[str],
|
477
|
-
auth_config: Dict[str, str],
|
478
|
-
ports: List[int],
|
479
|
-
docker_user: Optional[str] = None,
|
480
|
-
ssh_user: Optional[str] = None,
|
481
|
-
):
|
482
|
-
"""Add authentication information for cluster to local SSH config file.
|
483
|
-
|
484
|
-
If a host with `cluster_name` already exists and the configuration was
|
485
|
-
not added by sky, then `ip` is used to identify the host instead in the
|
486
|
-
file.
|
487
|
-
|
488
|
-
If a host with `cluster_name` already exists and the configuration was
|
489
|
-
added by sky (e.g. a spot instance), then the configuration is
|
490
|
-
overwritten.
|
491
|
-
|
492
|
-
Args:
|
493
|
-
cluster_name: Cluster name (see `sky status`)
|
494
|
-
ips: List of public IP addresses in the cluster. First IP is head
|
495
|
-
node.
|
496
|
-
auth_config: read_yaml(handle.cluster_yaml)['auth']
|
497
|
-
ports: List of port numbers for SSH corresponding to ips
|
498
|
-
docker_user: If not None, use this user to ssh into the docker
|
499
|
-
ssh_user: Override the ssh_user in auth_config
|
500
|
-
"""
|
501
|
-
if ssh_user is None:
|
502
|
-
username = auth_config['ssh_user']
|
503
|
-
else:
|
504
|
-
username = ssh_user
|
505
|
-
if docker_user is not None:
|
506
|
-
username = docker_user
|
507
|
-
key_path = os.path.expanduser(auth_config['ssh_private_key'])
|
508
|
-
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
509
|
-
f'{cluster_name}` to remove)')
|
510
|
-
ip = ips[0]
|
511
|
-
if docker_user is not None:
|
512
|
-
ip = 'localhost'
|
513
|
-
|
514
|
-
config_path = os.path.expanduser(cls.ssh_conf_path)
|
515
|
-
|
516
|
-
if not os.path.exists(config_path):
|
517
|
-
config = ['\n']
|
518
|
-
with open(config_path,
|
519
|
-
'w',
|
520
|
-
encoding='utf-8',
|
521
|
-
opener=functools.partial(os.open, mode=0o644)) as f:
|
522
|
-
f.writelines(config)
|
523
|
-
|
524
|
-
with open(config_path, 'r', encoding='utf-8') as f:
|
525
|
-
config = f.readlines()
|
526
|
-
|
527
|
-
ssh_dir = cls.ssh_cluster_path.format('')
|
528
|
-
os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
|
529
|
-
|
530
|
-
# Handle Include on top of Config file
|
531
|
-
include_str = f'Include {cls.ssh_cluster_path.format("*")}'
|
532
|
-
found = False
|
533
|
-
for i, line in enumerate(config):
|
534
|
-
config_str = line.strip()
|
535
|
-
if config_str == include_str:
|
536
|
-
found = True
|
537
|
-
break
|
538
|
-
if 'Host' in config_str:
|
539
|
-
break
|
540
|
-
if not found:
|
541
|
-
# Did not find Include string. Insert `Include` lines.
|
542
|
-
with open(config_path, 'w', encoding='utf-8') as f:
|
543
|
-
config.insert(
|
544
|
-
0,
|
545
|
-
f'# Added by SkyPilot for ssh config of all clusters\n{include_str}\n'
|
546
|
-
)
|
547
|
-
f.write(''.join(config).strip())
|
548
|
-
f.write('\n' * 2)
|
549
|
-
|
550
|
-
proxy_command = auth_config.get('ssh_proxy_command', None)
|
551
|
-
|
552
|
-
docker_proxy_command_generator = None
|
553
|
-
if docker_user is not None:
|
554
|
-
docker_proxy_command_generator = lambda ip, port: ' '.join(
|
555
|
-
['ssh'] + command_runner.ssh_options_list(
|
556
|
-
key_path, ssh_control_name=None, port=port) +
|
557
|
-
['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
|
558
|
-
|
559
|
-
codegen = ''
|
560
|
-
# Add the nodes to the codegen
|
561
|
-
for i, ip in enumerate(ips):
|
562
|
-
docker_proxy_command = None
|
563
|
-
port = ports[i]
|
564
|
-
if docker_proxy_command_generator is not None:
|
565
|
-
docker_proxy_command = docker_proxy_command_generator(ip, port)
|
566
|
-
ip = 'localhost'
|
567
|
-
port = constants.DEFAULT_DOCKER_PORT
|
568
|
-
node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
|
569
|
-
# TODO(romilb): Update port number when k8s supports multinode
|
570
|
-
codegen += cls._get_generated_config(
|
571
|
-
sky_autogen_comment, node_name, ip, username, key_path,
|
572
|
-
proxy_command, port, docker_proxy_command) + '\n'
|
573
|
-
|
574
|
-
cluster_config_path = os.path.expanduser(
|
575
|
-
cls.ssh_cluster_path.format(cluster_name))
|
576
|
-
|
577
|
-
with open(cluster_config_path,
|
578
|
-
'w',
|
579
|
-
encoding='utf-8',
|
580
|
-
opener=functools.partial(os.open, mode=0o644)) as f:
|
581
|
-
f.write(codegen)
|
582
|
-
|
583
|
-
@classmethod
|
584
|
-
def remove_cluster(cls, cluster_name: str):
|
585
|
-
"""Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
|
586
|
-
|
587
|
-
If no existing host matching the provided specification is found, then
|
588
|
-
nothing is removed.
|
589
|
-
|
590
|
-
Args:
|
591
|
-
cluster_name: Cluster name.
|
592
|
-
"""
|
593
|
-
with timeline.FileLockEvent(
|
594
|
-
cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
|
595
|
-
cluster_config_path = os.path.expanduser(
|
596
|
-
cls.ssh_cluster_path.format(cluster_name))
|
597
|
-
common_utils.remove_file_if_exists(cluster_config_path)
|
598
|
-
|
599
|
-
|
600
423
|
def _replace_yaml_dicts(
|
601
424
|
new_yaml: str, old_yaml: str, restore_key_names: Set[str],
|
602
425
|
restore_key_names_exceptions: Sequence[Tuple[str, ...]]) -> str:
|
@@ -800,7 +623,7 @@ def write_cluster_config(
|
|
800
623
|
else:
|
801
624
|
excluded_clouds.add(cloud)
|
802
625
|
|
803
|
-
for cloud_str, cloud_obj in
|
626
|
+
for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
|
804
627
|
remote_identity_config = skypilot_config.get_nested(
|
805
628
|
(cloud_str.lower(), 'remote_identity'), None)
|
806
629
|
if remote_identity_config:
|
@@ -810,7 +633,8 @@ def write_cluster_config(
|
|
810
633
|
|
811
634
|
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
|
812
635
|
|
813
|
-
|
636
|
+
private_key_path, _ = auth.get_or_generate_keys()
|
637
|
+
auth_config = {'ssh_private_key': private_key_path}
|
814
638
|
region_name = resources_vars.get('region')
|
815
639
|
|
816
640
|
yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
|
@@ -940,7 +764,7 @@ def write_cluster_config(
|
|
940
764
|
'sky_local_path': str(local_wheel_path),
|
941
765
|
# Add yaml file path to the template variables.
|
942
766
|
'sky_ray_yaml_remote_path':
|
943
|
-
|
767
|
+
cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
|
944
768
|
'sky_ray_yaml_local_path': tmp_yaml_path,
|
945
769
|
'sky_version': str(version.parse(sky.__version__)),
|
946
770
|
'sky_wheel_hash': wheel_hash,
|
@@ -1369,7 +1193,7 @@ def wait_until_ray_cluster_ready(
|
|
1369
1193
|
|
1370
1194
|
|
1371
1195
|
def ssh_credential_from_yaml(
|
1372
|
-
cluster_yaml: str,
|
1196
|
+
cluster_yaml: Optional[str],
|
1373
1197
|
docker_user: Optional[str] = None,
|
1374
1198
|
ssh_user: Optional[str] = None,
|
1375
1199
|
) -> Dict[str, Any]:
|
@@ -1381,6 +1205,8 @@ def ssh_credential_from_yaml(
|
|
1381
1205
|
the docker container.
|
1382
1206
|
ssh_user: override the ssh_user in the cluster yaml.
|
1383
1207
|
"""
|
1208
|
+
if cluster_yaml is None:
|
1209
|
+
return dict()
|
1384
1210
|
config = common_utils.read_yaml(cluster_yaml)
|
1385
1211
|
auth_section = config['auth']
|
1386
1212
|
if ssh_user is None:
|
@@ -1501,12 +1327,6 @@ def check_local_gpus() -> bool:
|
|
1501
1327
|
return is_functional
|
1502
1328
|
|
1503
1329
|
|
1504
|
-
def generate_cluster_name():
|
1505
|
-
# TODO: change this ID formatting to something more pleasant.
|
1506
|
-
# User name is helpful in non-isolated accounts, e.g., GCP, Azure.
|
1507
|
-
return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'
|
1508
|
-
|
1509
|
-
|
1510
1330
|
def _query_head_ip_with_retries(cluster_yaml: str,
|
1511
1331
|
max_attempts: int = 1) -> str:
|
1512
1332
|
"""Returns the IP of the head node by querying the cloud.
|
@@ -1572,8 +1392,8 @@ def get_node_ips(cluster_yaml: str,
|
|
1572
1392
|
"""
|
1573
1393
|
ray_config = common_utils.read_yaml(cluster_yaml)
|
1574
1394
|
# Use the new provisioner for AWS.
|
1575
|
-
provider_name =
|
1576
|
-
cloud =
|
1395
|
+
provider_name = cluster_utils.get_provider_name(ray_config)
|
1396
|
+
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
1577
1397
|
assert cloud is not None, provider_name
|
1578
1398
|
|
1579
1399
|
if cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
|
@@ -1841,7 +1661,7 @@ def check_can_clone_disk_and_override_task(
|
|
1841
1661
|
with ux_utils.print_exception_no_traceback():
|
1842
1662
|
raise exceptions.NotSupportedError(
|
1843
1663
|
f'Cannot clone disk from cluster {cluster_name!r} '
|
1844
|
-
f'({source_cluster_status!r}). Please stop the '
|
1664
|
+
f'({source_cluster_status.value!r}). Please stop the '
|
1845
1665
|
f'cluster first: sky stop {cluster_name}')
|
1846
1666
|
|
1847
1667
|
if target_cluster_name is not None:
|
@@ -1921,8 +1741,7 @@ def check_can_clone_disk_and_override_task(
|
|
1921
1741
|
return task, handle
|
1922
1742
|
|
1923
1743
|
|
1924
|
-
def
|
1925
|
-
cluster_name: str) -> Optional[Dict[str, Any]]:
|
1744
|
+
def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
1926
1745
|
"""Update the cluster status.
|
1927
1746
|
|
1928
1747
|
The cluster status is updated by checking ray cluster and real status from
|
@@ -1932,6 +1751,10 @@ def _update_cluster_status_no_lock(
|
|
1932
1751
|
the design of the cluster status and transition, please refer to the
|
1933
1752
|
sky/design_docs/cluster_status.md
|
1934
1753
|
|
1754
|
+
Note: this function is only safe to be called when the caller process is
|
1755
|
+
holding the cluster lock, which means no other processes are modifying the
|
1756
|
+
cluster.
|
1757
|
+
|
1935
1758
|
Returns:
|
1936
1759
|
If the cluster is terminated or does not exist, return None. Otherwise
|
1937
1760
|
returns the input record with status and handle potentially updated.
|
@@ -1949,6 +1772,13 @@ def _update_cluster_status_no_lock(
|
|
1949
1772
|
if record is None:
|
1950
1773
|
return None
|
1951
1774
|
handle = record['handle']
|
1775
|
+
if handle.cluster_yaml is None:
|
1776
|
+
# Remove cluster from db since this cluster does not have a config file
|
1777
|
+
# or any other ongoing requests
|
1778
|
+
global_user_state.remove_cluster(cluster_name, terminate=True)
|
1779
|
+
logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
|
1780
|
+
'Removing the cluster from cache.')
|
1781
|
+
return None
|
1952
1782
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
1953
1783
|
return record
|
1954
1784
|
cluster_name = handle.cluster_name
|
@@ -2032,7 +1862,7 @@ def _update_cluster_status_no_lock(
|
|
2032
1862
|
requested_resources=None,
|
2033
1863
|
ready=True,
|
2034
1864
|
is_launch=False)
|
2035
|
-
return
|
1865
|
+
return global_user_state.get_cluster_from_name(cluster_name)
|
2036
1866
|
|
2037
1867
|
# All cases below are transitioning the cluster to non-UP states.
|
2038
1868
|
|
@@ -2212,9 +2042,9 @@ def refresh_cluster_record(
|
|
2212
2042
|
) -> Optional[Dict[str, Any]]:
|
2213
2043
|
"""Refresh the cluster, and return the possibly updated record.
|
2214
2044
|
|
2215
|
-
|
2216
|
-
|
2217
|
-
|
2045
|
+
The function will update the cached cluster status in the global state. For
|
2046
|
+
the design of the cluster status and transition, please refer to the
|
2047
|
+
sky/design_docs/cluster_status.md
|
2218
2048
|
|
2219
2049
|
Args:
|
2220
2050
|
cluster_name: The name of the cluster.
|
@@ -2270,22 +2100,19 @@ def refresh_cluster_record(
|
|
2270
2100
|
return record
|
2271
2101
|
|
2272
2102
|
if not acquire_per_cluster_status_lock:
|
2273
|
-
return
|
2103
|
+
return _update_cluster_status(cluster_name)
|
2274
2104
|
|
2275
2105
|
# Try to acquire the lock so we can fetch the status.
|
2276
2106
|
try:
|
2277
2107
|
with lock.acquire(blocking=False):
|
2278
|
-
# Lock acquired.
|
2279
|
-
|
2280
2108
|
# Check the cluster status again, since it could have been
|
2281
2109
|
# updated between our last check and acquiring the lock.
|
2282
2110
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
2283
2111
|
if record is None or not _must_refresh_cluster_status(
|
2284
2112
|
record, force_refresh_statuses):
|
2285
2113
|
return record
|
2286
|
-
|
2287
2114
|
# Update and return the cluster status.
|
2288
|
-
return
|
2115
|
+
return _update_cluster_status(cluster_name)
|
2289
2116
|
except filelock.Timeout:
|
2290
2117
|
# lock.acquire() will throw a Timeout exception if the lock is not
|
2291
2118
|
# available and we have blocking=False.
|
@@ -2608,10 +2435,21 @@ class CloudFilter(enum.Enum):
|
|
2608
2435
|
LOCAL = 'local'
|
2609
2436
|
|
2610
2437
|
|
2438
|
+
def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
|
2439
|
+
"""Returns a list of clusters that match the glob pattern."""
|
2440
|
+
glob_clusters = []
|
2441
|
+
for cluster in clusters:
|
2442
|
+
glob_cluster = global_user_state.get_glob_cluster_names(cluster)
|
2443
|
+
if len(glob_cluster) == 0 and not silent:
|
2444
|
+
logger.info(f'Cluster {cluster} not found.')
|
2445
|
+
glob_clusters.extend(glob_cluster)
|
2446
|
+
return list(set(glob_clusters))
|
2447
|
+
|
2448
|
+
|
2611
2449
|
def get_clusters(
|
2612
|
-
|
2613
|
-
refresh: bool,
|
2450
|
+
refresh: common.StatusRefreshMode,
|
2614
2451
|
cluster_names: Optional[Union[str, List[str]]] = None,
|
2452
|
+
all_users: bool = True,
|
2615
2453
|
) -> List[Dict[str, Any]]:
|
2616
2454
|
"""Returns a list of cached or optionally refreshed cluster records.
|
2617
2455
|
|
@@ -2636,20 +2474,55 @@ def get_clusters(
|
|
2636
2474
|
terminated, the record will be omitted from the returned list.
|
2637
2475
|
"""
|
2638
2476
|
records = global_user_state.get_clusters()
|
2639
|
-
|
2640
|
-
|
2477
|
+
if not all_users:
|
2478
|
+
current_user_hash = common_utils.get_user_hash()
|
2641
2479
|
records = [
|
2642
2480
|
record for record in records
|
2643
|
-
if
|
2481
|
+
if record['user_hash'] == current_user_hash
|
2644
2482
|
]
|
2645
2483
|
|
2646
2484
|
yellow = colorama.Fore.YELLOW
|
2647
2485
|
bright = colorama.Style.BRIGHT
|
2648
2486
|
reset = colorama.Style.RESET_ALL
|
2649
2487
|
|
2488
|
+
def _update_record_with_credentials_and_resources_str(
|
2489
|
+
record: Optional[Dict[str, Any]]) -> None:
|
2490
|
+
"""Add the credentials to the record.
|
2491
|
+
|
2492
|
+
This is useful for the client side to setup the ssh config of the
|
2493
|
+
cluster.
|
2494
|
+
"""
|
2495
|
+
if record is None:
|
2496
|
+
return
|
2497
|
+
handle = record['handle']
|
2498
|
+
if handle is None:
|
2499
|
+
return
|
2500
|
+
record['resources_str'] = resources_utils.get_readable_resources_repr(
|
2501
|
+
handle)
|
2502
|
+
credentials = ssh_credential_from_yaml(handle.cluster_yaml,
|
2503
|
+
handle.docker_user,
|
2504
|
+
handle.ssh_user)
|
2505
|
+
|
2506
|
+
if not credentials:
|
2507
|
+
return
|
2508
|
+
ssh_private_key_path = credentials.get('ssh_private_key', None)
|
2509
|
+
if ssh_private_key_path is not None:
|
2510
|
+
with open(os.path.expanduser(ssh_private_key_path),
|
2511
|
+
'r',
|
2512
|
+
encoding='utf-8') as f:
|
2513
|
+
credentials['ssh_private_key_content'] = f.read()
|
2514
|
+
else:
|
2515
|
+
private_key_path, _ = auth.get_or_generate_keys()
|
2516
|
+
with open(os.path.expanduser(private_key_path),
|
2517
|
+
'r',
|
2518
|
+
encoding='utf-8') as f:
|
2519
|
+
credentials['ssh_private_key_content'] = f.read()
|
2520
|
+
record['credentials'] = credentials
|
2521
|
+
|
2650
2522
|
if cluster_names is not None:
|
2651
2523
|
if isinstance(cluster_names, str):
|
2652
2524
|
cluster_names = [cluster_names]
|
2525
|
+
cluster_names = _get_glob_clusters(cluster_names, silent=True)
|
2653
2526
|
new_records = []
|
2654
2527
|
not_exist_cluster_names = []
|
2655
2528
|
for cluster_name in cluster_names:
|
@@ -2664,7 +2537,11 @@ def get_clusters(
|
|
2664
2537
|
logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
|
2665
2538
|
records = new_records
|
2666
2539
|
|
2667
|
-
|
2540
|
+
# Add auth_config to the records
|
2541
|
+
for record in records:
|
2542
|
+
_update_record_with_credentials_and_resources_str(record)
|
2543
|
+
|
2544
|
+
if refresh == common.StatusRefreshMode.NONE:
|
2668
2545
|
return records
|
2669
2546
|
|
2670
2547
|
plural = 's' if len(records) > 1 else ''
|
@@ -2675,12 +2552,18 @@ def get_clusters(
|
|
2675
2552
|
f'Refreshing status for {len(records)} cluster{plural}'),
|
2676
2553
|
total=len(records))
|
2677
2554
|
|
2555
|
+
if refresh == common.StatusRefreshMode.FORCE:
|
2556
|
+
force_refresh_statuses = set(status_lib.ClusterStatus)
|
2557
|
+
else:
|
2558
|
+
force_refresh_statuses = None
|
2559
|
+
|
2678
2560
|
def _refresh_cluster(cluster_name):
|
2679
2561
|
try:
|
2680
2562
|
record = refresh_cluster_record(
|
2681
2563
|
cluster_name,
|
2682
|
-
force_refresh_statuses=
|
2564
|
+
force_refresh_statuses=force_refresh_statuses,
|
2683
2565
|
acquire_per_cluster_status_lock=True)
|
2566
|
+
_update_record_with_credentials_and_resources_str(record)
|
2684
2567
|
except (exceptions.ClusterStatusFetchingError,
|
2685
2568
|
exceptions.CloudUserIdentityError,
|
2686
2569
|
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
@@ -2692,9 +2575,11 @@ def get_clusters(
|
|
2692
2575
|
return record
|
2693
2576
|
|
2694
2577
|
cluster_names = [record['name'] for record in records]
|
2695
|
-
|
2696
|
-
|
2697
|
-
|
2578
|
+
updated_records = []
|
2579
|
+
if len(cluster_names) > 0:
|
2580
|
+
with progress:
|
2581
|
+
updated_records = subprocess_utils.run_in_parallel(
|
2582
|
+
_refresh_cluster, cluster_names)
|
2698
2583
|
|
2699
2584
|
# Show information for removed clusters.
|
2700
2585
|
kept_records = []
|
@@ -2731,6 +2616,7 @@ def get_clusters(
|
|
2731
2616
|
f'{len(failed_clusters)} cluster{plural}:{reset}')
|
2732
2617
|
for cluster_name, e in failed_clusters:
|
2733
2618
|
logger.warning(f' {bright}{cluster_name}{reset}: {e}')
|
2619
|
+
|
2734
2620
|
return kept_records
|
2735
2621
|
|
2736
2622
|
|
@@ -2960,8 +2846,7 @@ def get_endpoints(cluster: str,
|
|
2960
2846
|
except ValueError:
|
2961
2847
|
with ux_utils.print_exception_no_traceback():
|
2962
2848
|
raise ValueError(f'Invalid endpoint {port!r}.') from None
|
2963
|
-
cluster_records = get_clusters(
|
2964
|
-
refresh=False,
|
2849
|
+
cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
|
2965
2850
|
cluster_names=[cluster])
|
2966
2851
|
if not cluster_records:
|
2967
2852
|
with ux_utils.print_exception_no_traceback():
|
@@ -2974,7 +2859,9 @@ def get_endpoints(cluster: str,
|
|
2974
2859
|
with ux_utils.print_exception_no_traceback():
|
2975
2860
|
raise exceptions.ClusterNotUpError(
|
2976
2861
|
f'Cluster {cluster_record["name"]!r} '
|
2977
|
-
'is not in UP status.',
|
2862
|
+
'is not in UP status.',
|
2863
|
+
cluster_status=cluster_record['status'],
|
2864
|
+
handle=cluster_record['handle'])
|
2978
2865
|
handle = cluster_record['handle']
|
2979
2866
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
2980
2867
|
with ux_utils.print_exception_no_traceback():
|
@@ -2990,7 +2877,7 @@ def get_endpoints(cluster: str,
|
|
2990
2877
|
except exceptions.NotSupportedError:
|
2991
2878
|
with ux_utils.print_exception_no_traceback():
|
2992
2879
|
raise ValueError('Querying endpoints is not supported '
|
2993
|
-
f'for
|
2880
|
+
f'for {cluster!r} on {cloud}.') from None
|
2994
2881
|
|
2995
2882
|
config = common_utils.read_yaml(handle.cluster_yaml)
|
2996
2883
|
port_details = provision_lib.query_ports(repr(cloud),
|