skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,6 @@
2
2
  from datetime import datetime
3
3
  import enum
4
4
  import fnmatch
5
- import functools
6
5
  import hashlib
7
6
  import os
8
7
  import pathlib
@@ -12,7 +11,6 @@ import shlex
12
11
  import subprocess
13
12
  import sys
14
13
  import tempfile
15
- import textwrap
16
14
  import time
17
15
  import typing
18
16
  from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
@@ -38,20 +36,21 @@ from sky import global_user_state
38
36
  from sky import provision as provision_lib
39
37
  from sky import sky_logging
40
38
  from sky import skypilot_config
41
- from sky import status_lib
42
- from sky.clouds import cloud_registry
43
39
  from sky.provision import instance_setup
44
40
  from sky.provision.kubernetes import utils as kubernetes_utils
45
41
  from sky.skylet import constants
46
42
  from sky.usage import usage_lib
47
- from sky.utils import cluster_yaml_utils
43
+ from sky.utils import cluster_utils
48
44
  from sky.utils import command_runner
45
+ from sky.utils import common
49
46
  from sky.utils import common_utils
50
47
  from sky.utils import controller_utils
51
48
  from sky.utils import env_options
49
+ from sky.utils import registry
52
50
  from sky.utils import resources_utils
53
51
  from sky.utils import rich_utils
54
52
  from sky.utils import schemas
53
+ from sky.utils import status_lib
55
54
  from sky.utils import subprocess_utils
56
55
  from sky.utils import timeline
57
56
  from sky.utils import ux_utils
@@ -69,7 +68,6 @@ SKY_REMOTE_APP_DIR = '~/.sky/sky_app'
69
68
  # Exclude subnet mask from IP address regex.
70
69
  IP_ADDR_REGEX = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!/\d{1,2})\b'
71
70
  SKY_REMOTE_PATH = '~/.sky/wheels'
72
- SKY_USER_FILE_PATH = '~/.sky/generated'
73
71
 
74
72
  # Do not use /tmp because it gets cleared on VM restart.
75
73
  _SKY_REMOTE_FILE_MOUNTS_DIR = '~/.sky/file_mounts/'
@@ -191,7 +189,8 @@ def is_ip(s: str) -> bool:
191
189
 
192
190
 
193
191
  def _get_yaml_path_from_cluster_name(cluster_name: str,
194
- prefix: str = SKY_USER_FILE_PATH) -> str:
192
+ prefix: str = constants.SKY_USER_FILE_PATH
193
+ ) -> str:
195
194
  output_path = pathlib.Path(
196
195
  prefix).expanduser().resolve() / f'{cluster_name}.yml'
197
196
  os.makedirs(output_path.parents[0], exist_ok=True)
@@ -421,182 +420,6 @@ class FileMountHelper(object):
421
420
  return ' && '.join(commands)
422
421
 
423
422
 
424
- class SSHConfigHelper(object):
425
- """Helper for handling local SSH configuration."""
426
-
427
- ssh_conf_path = '~/.ssh/config'
428
- ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
429
- ssh_conf_per_cluster_lock_path = os.path.expanduser(
430
- '~/.sky/ssh_config_{}.lock')
431
- ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
432
-
433
- @classmethod
434
- def _get_generated_config(cls, autogen_comment: str, host_name: str,
435
- ip: str, username: str, ssh_key_path: str,
436
- proxy_command: Optional[str], port: int,
437
- docker_proxy_command: Optional[str]):
438
- if proxy_command is not None:
439
- # Already checked in resources
440
- assert docker_proxy_command is None, (
441
- 'Cannot specify both proxy_command and docker_proxy_command.')
442
- proxy = f'ProxyCommand {proxy_command}'
443
- elif docker_proxy_command is not None:
444
- proxy = f'ProxyCommand {docker_proxy_command}'
445
- else:
446
- proxy = ''
447
- # StrictHostKeyChecking=no skips the host key check for the first
448
- # time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile/dev/null
449
- # prevent the host key from being added to the known_hosts file and
450
- # always return an empty file for known hosts, making the ssh think
451
- # this is a first-time connection, and thus skipping the host key
452
- # check.
453
- codegen = textwrap.dedent(f"""\
454
- {autogen_comment}
455
- Host {host_name}
456
- HostName {ip}
457
- User {username}
458
- IdentityFile {ssh_key_path}
459
- AddKeysToAgent yes
460
- IdentitiesOnly yes
461
- ForwardAgent yes
462
- StrictHostKeyChecking no
463
- UserKnownHostsFile=/dev/null
464
- GlobalKnownHostsFile=/dev/null
465
- Port {port}
466
- {proxy}
467
- """.rstrip())
468
- codegen = codegen + '\n'
469
- return codegen
470
-
471
- @classmethod
472
- @timeline.FileLockEvent(ssh_conf_lock_path)
473
- def add_cluster(
474
- cls,
475
- cluster_name: str,
476
- ips: List[str],
477
- auth_config: Dict[str, str],
478
- ports: List[int],
479
- docker_user: Optional[str] = None,
480
- ssh_user: Optional[str] = None,
481
- ):
482
- """Add authentication information for cluster to local SSH config file.
483
-
484
- If a host with `cluster_name` already exists and the configuration was
485
- not added by sky, then `ip` is used to identify the host instead in the
486
- file.
487
-
488
- If a host with `cluster_name` already exists and the configuration was
489
- added by sky (e.g. a spot instance), then the configuration is
490
- overwritten.
491
-
492
- Args:
493
- cluster_name: Cluster name (see `sky status`)
494
- ips: List of public IP addresses in the cluster. First IP is head
495
- node.
496
- auth_config: read_yaml(handle.cluster_yaml)['auth']
497
- ports: List of port numbers for SSH corresponding to ips
498
- docker_user: If not None, use this user to ssh into the docker
499
- ssh_user: Override the ssh_user in auth_config
500
- """
501
- if ssh_user is None:
502
- username = auth_config['ssh_user']
503
- else:
504
- username = ssh_user
505
- if docker_user is not None:
506
- username = docker_user
507
- key_path = os.path.expanduser(auth_config['ssh_private_key'])
508
- sky_autogen_comment = ('# Added by sky (use `sky stop/down '
509
- f'{cluster_name}` to remove)')
510
- ip = ips[0]
511
- if docker_user is not None:
512
- ip = 'localhost'
513
-
514
- config_path = os.path.expanduser(cls.ssh_conf_path)
515
-
516
- if not os.path.exists(config_path):
517
- config = ['\n']
518
- with open(config_path,
519
- 'w',
520
- encoding='utf-8',
521
- opener=functools.partial(os.open, mode=0o644)) as f:
522
- f.writelines(config)
523
-
524
- with open(config_path, 'r', encoding='utf-8') as f:
525
- config = f.readlines()
526
-
527
- ssh_dir = cls.ssh_cluster_path.format('')
528
- os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
529
-
530
- # Handle Include on top of Config file
531
- include_str = f'Include {cls.ssh_cluster_path.format("*")}'
532
- found = False
533
- for i, line in enumerate(config):
534
- config_str = line.strip()
535
- if config_str == include_str:
536
- found = True
537
- break
538
- if 'Host' in config_str:
539
- break
540
- if not found:
541
- # Did not find Include string. Insert `Include` lines.
542
- with open(config_path, 'w', encoding='utf-8') as f:
543
- config.insert(
544
- 0,
545
- f'# Added by SkyPilot for ssh config of all clusters\n{include_str}\n'
546
- )
547
- f.write(''.join(config).strip())
548
- f.write('\n' * 2)
549
-
550
- proxy_command = auth_config.get('ssh_proxy_command', None)
551
-
552
- docker_proxy_command_generator = None
553
- if docker_user is not None:
554
- docker_proxy_command_generator = lambda ip, port: ' '.join(
555
- ['ssh'] + command_runner.ssh_options_list(
556
- key_path, ssh_control_name=None, port=port) +
557
- ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
558
-
559
- codegen = ''
560
- # Add the nodes to the codegen
561
- for i, ip in enumerate(ips):
562
- docker_proxy_command = None
563
- port = ports[i]
564
- if docker_proxy_command_generator is not None:
565
- docker_proxy_command = docker_proxy_command_generator(ip, port)
566
- ip = 'localhost'
567
- port = constants.DEFAULT_DOCKER_PORT
568
- node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
569
- # TODO(romilb): Update port number when k8s supports multinode
570
- codegen += cls._get_generated_config(
571
- sky_autogen_comment, node_name, ip, username, key_path,
572
- proxy_command, port, docker_proxy_command) + '\n'
573
-
574
- cluster_config_path = os.path.expanduser(
575
- cls.ssh_cluster_path.format(cluster_name))
576
-
577
- with open(cluster_config_path,
578
- 'w',
579
- encoding='utf-8',
580
- opener=functools.partial(os.open, mode=0o644)) as f:
581
- f.write(codegen)
582
-
583
- @classmethod
584
- def remove_cluster(cls, cluster_name: str):
585
- """Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
586
-
587
- If no existing host matching the provided specification is found, then
588
- nothing is removed.
589
-
590
- Args:
591
- cluster_name: Cluster name.
592
- """
593
- with timeline.FileLockEvent(
594
- cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
595
- cluster_config_path = os.path.expanduser(
596
- cls.ssh_cluster_path.format(cluster_name))
597
- common_utils.remove_file_if_exists(cluster_config_path)
598
-
599
-
600
423
  def _replace_yaml_dicts(
601
424
  new_yaml: str, old_yaml: str, restore_key_names: Set[str],
602
425
  restore_key_names_exceptions: Sequence[Tuple[str, ...]]) -> str:
@@ -800,7 +623,7 @@ def write_cluster_config(
800
623
  else:
801
624
  excluded_clouds.add(cloud)
802
625
 
803
- for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
626
+ for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
804
627
  remote_identity_config = skypilot_config.get_nested(
805
628
  (cloud_str.lower(), 'remote_identity'), None)
806
629
  if remote_identity_config:
@@ -810,7 +633,8 @@ def write_cluster_config(
810
633
 
811
634
  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
812
635
 
813
- auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
636
+ private_key_path, _ = auth.get_or_generate_keys()
637
+ auth_config = {'ssh_private_key': private_key_path}
814
638
  region_name = resources_vars.get('region')
815
639
 
816
640
  yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
@@ -940,7 +764,7 @@ def write_cluster_config(
940
764
  'sky_local_path': str(local_wheel_path),
941
765
  # Add yaml file path to the template variables.
942
766
  'sky_ray_yaml_remote_path':
943
- cluster_yaml_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
767
+ cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
944
768
  'sky_ray_yaml_local_path': tmp_yaml_path,
945
769
  'sky_version': str(version.parse(sky.__version__)),
946
770
  'sky_wheel_hash': wheel_hash,
@@ -1369,7 +1193,7 @@ def wait_until_ray_cluster_ready(
1369
1193
 
1370
1194
 
1371
1195
  def ssh_credential_from_yaml(
1372
- cluster_yaml: str,
1196
+ cluster_yaml: Optional[str],
1373
1197
  docker_user: Optional[str] = None,
1374
1198
  ssh_user: Optional[str] = None,
1375
1199
  ) -> Dict[str, Any]:
@@ -1381,6 +1205,8 @@ def ssh_credential_from_yaml(
1381
1205
  the docker container.
1382
1206
  ssh_user: override the ssh_user in the cluster yaml.
1383
1207
  """
1208
+ if cluster_yaml is None:
1209
+ return dict()
1384
1210
  config = common_utils.read_yaml(cluster_yaml)
1385
1211
  auth_section = config['auth']
1386
1212
  if ssh_user is None:
@@ -1501,12 +1327,6 @@ def check_local_gpus() -> bool:
1501
1327
  return is_functional
1502
1328
 
1503
1329
 
1504
- def generate_cluster_name():
1505
- # TODO: change this ID formatting to something more pleasant.
1506
- # User name is helpful in non-isolated accounts, e.g., GCP, Azure.
1507
- return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'
1508
-
1509
-
1510
1330
  def _query_head_ip_with_retries(cluster_yaml: str,
1511
1331
  max_attempts: int = 1) -> str:
1512
1332
  """Returns the IP of the head node by querying the cloud.
@@ -1572,8 +1392,8 @@ def get_node_ips(cluster_yaml: str,
1572
1392
  """
1573
1393
  ray_config = common_utils.read_yaml(cluster_yaml)
1574
1394
  # Use the new provisioner for AWS.
1575
- provider_name = cluster_yaml_utils.get_provider_name(ray_config)
1576
- cloud = cloud_registry.CLOUD_REGISTRY.from_str(provider_name)
1395
+ provider_name = cluster_utils.get_provider_name(ray_config)
1396
+ cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
1577
1397
  assert cloud is not None, provider_name
1578
1398
 
1579
1399
  if cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
@@ -1841,7 +1661,7 @@ def check_can_clone_disk_and_override_task(
1841
1661
  with ux_utils.print_exception_no_traceback():
1842
1662
  raise exceptions.NotSupportedError(
1843
1663
  f'Cannot clone disk from cluster {cluster_name!r} '
1844
- f'({source_cluster_status!r}). Please stop the '
1664
+ f'({source_cluster_status.value!r}). Please stop the '
1845
1665
  f'cluster first: sky stop {cluster_name}')
1846
1666
 
1847
1667
  if target_cluster_name is not None:
@@ -1921,8 +1741,7 @@ def check_can_clone_disk_and_override_task(
1921
1741
  return task, handle
1922
1742
 
1923
1743
 
1924
- def _update_cluster_status_no_lock(
1925
- cluster_name: str) -> Optional[Dict[str, Any]]:
1744
+ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1926
1745
  """Update the cluster status.
1927
1746
 
1928
1747
  The cluster status is updated by checking ray cluster and real status from
@@ -1932,6 +1751,10 @@ def _update_cluster_status_no_lock(
1932
1751
  the design of the cluster status and transition, please refer to the
1933
1752
  sky/design_docs/cluster_status.md
1934
1753
 
1754
+ Note: this function is only safe to be called when the caller process is
1755
+ holding the cluster lock, which means no other processes are modifying the
1756
+ cluster.
1757
+
1935
1758
  Returns:
1936
1759
  If the cluster is terminated or does not exist, return None. Otherwise
1937
1760
  returns the input record with status and handle potentially updated.
@@ -1949,6 +1772,13 @@ def _update_cluster_status_no_lock(
1949
1772
  if record is None:
1950
1773
  return None
1951
1774
  handle = record['handle']
1775
+ if handle.cluster_yaml is None:
1776
+ # Remove cluster from db since this cluster does not have a config file
1777
+ # or any other ongoing requests
1778
+ global_user_state.remove_cluster(cluster_name, terminate=True)
1779
+ logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
1780
+ 'Removing the cluster from cache.')
1781
+ return None
1952
1782
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
1953
1783
  return record
1954
1784
  cluster_name = handle.cluster_name
@@ -2032,7 +1862,7 @@ def _update_cluster_status_no_lock(
2032
1862
  requested_resources=None,
2033
1863
  ready=True,
2034
1864
  is_launch=False)
2035
- return record
1865
+ return global_user_state.get_cluster_from_name(cluster_name)
2036
1866
 
2037
1867
  # All cases below are transitioning the cluster to non-UP states.
2038
1868
 
@@ -2212,9 +2042,9 @@ def refresh_cluster_record(
2212
2042
  ) -> Optional[Dict[str, Any]]:
2213
2043
  """Refresh the cluster, and return the possibly updated record.
2214
2044
 
2215
- This function will also check the owner identity of the cluster, and raise
2216
- exceptions if the current user is not the same as the user who created the
2217
- cluster.
2045
+ The function will update the cached cluster status in the global state. For
2046
+ the design of the cluster status and transition, please refer to the
2047
+ sky/design_docs/cluster_status.md
2218
2048
 
2219
2049
  Args:
2220
2050
  cluster_name: The name of the cluster.
@@ -2270,22 +2100,19 @@ def refresh_cluster_record(
2270
2100
  return record
2271
2101
 
2272
2102
  if not acquire_per_cluster_status_lock:
2273
- return _update_cluster_status_no_lock(cluster_name)
2103
+ return _update_cluster_status(cluster_name)
2274
2104
 
2275
2105
  # Try to acquire the lock so we can fetch the status.
2276
2106
  try:
2277
2107
  with lock.acquire(blocking=False):
2278
- # Lock acquired.
2279
-
2280
2108
  # Check the cluster status again, since it could have been
2281
2109
  # updated between our last check and acquiring the lock.
2282
2110
  record = global_user_state.get_cluster_from_name(cluster_name)
2283
2111
  if record is None or not _must_refresh_cluster_status(
2284
2112
  record, force_refresh_statuses):
2285
2113
  return record
2286
-
2287
2114
  # Update and return the cluster status.
2288
- return _update_cluster_status_no_lock(cluster_name)
2115
+ return _update_cluster_status(cluster_name)
2289
2116
  except filelock.Timeout:
2290
2117
  # lock.acquire() will throw a Timeout exception if the lock is not
2291
2118
  # available and we have blocking=False.
@@ -2608,10 +2435,21 @@ class CloudFilter(enum.Enum):
2608
2435
  LOCAL = 'local'
2609
2436
 
2610
2437
 
2438
+ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
2439
+ """Returns a list of clusters that match the glob pattern."""
2440
+ glob_clusters = []
2441
+ for cluster in clusters:
2442
+ glob_cluster = global_user_state.get_glob_cluster_names(cluster)
2443
+ if len(glob_cluster) == 0 and not silent:
2444
+ logger.info(f'Cluster {cluster} not found.')
2445
+ glob_clusters.extend(glob_cluster)
2446
+ return list(set(glob_clusters))
2447
+
2448
+
2611
2449
  def get_clusters(
2612
- include_controller: bool,
2613
- refresh: bool,
2450
+ refresh: common.StatusRefreshMode,
2614
2451
  cluster_names: Optional[Union[str, List[str]]] = None,
2452
+ all_users: bool = True,
2615
2453
  ) -> List[Dict[str, Any]]:
2616
2454
  """Returns a list of cached or optionally refreshed cluster records.
2617
2455
 
@@ -2636,20 +2474,55 @@ def get_clusters(
2636
2474
  terminated, the record will be omitted from the returned list.
2637
2475
  """
2638
2476
  records = global_user_state.get_clusters()
2639
-
2640
- if not include_controller:
2477
+ if not all_users:
2478
+ current_user_hash = common_utils.get_user_hash()
2641
2479
  records = [
2642
2480
  record for record in records
2643
- if controller_utils.Controllers.from_name(record['name']) is None
2481
+ if record['user_hash'] == current_user_hash
2644
2482
  ]
2645
2483
 
2646
2484
  yellow = colorama.Fore.YELLOW
2647
2485
  bright = colorama.Style.BRIGHT
2648
2486
  reset = colorama.Style.RESET_ALL
2649
2487
 
2488
+ def _update_record_with_credentials_and_resources_str(
2489
+ record: Optional[Dict[str, Any]]) -> None:
2490
+ """Add the credentials to the record.
2491
+
2492
+ This is useful for the client side to setup the ssh config of the
2493
+ cluster.
2494
+ """
2495
+ if record is None:
2496
+ return
2497
+ handle = record['handle']
2498
+ if handle is None:
2499
+ return
2500
+ record['resources_str'] = resources_utils.get_readable_resources_repr(
2501
+ handle)
2502
+ credentials = ssh_credential_from_yaml(handle.cluster_yaml,
2503
+ handle.docker_user,
2504
+ handle.ssh_user)
2505
+
2506
+ if not credentials:
2507
+ return
2508
+ ssh_private_key_path = credentials.get('ssh_private_key', None)
2509
+ if ssh_private_key_path is not None:
2510
+ with open(os.path.expanduser(ssh_private_key_path),
2511
+ 'r',
2512
+ encoding='utf-8') as f:
2513
+ credentials['ssh_private_key_content'] = f.read()
2514
+ else:
2515
+ private_key_path, _ = auth.get_or_generate_keys()
2516
+ with open(os.path.expanduser(private_key_path),
2517
+ 'r',
2518
+ encoding='utf-8') as f:
2519
+ credentials['ssh_private_key_content'] = f.read()
2520
+ record['credentials'] = credentials
2521
+
2650
2522
  if cluster_names is not None:
2651
2523
  if isinstance(cluster_names, str):
2652
2524
  cluster_names = [cluster_names]
2525
+ cluster_names = _get_glob_clusters(cluster_names, silent=True)
2653
2526
  new_records = []
2654
2527
  not_exist_cluster_names = []
2655
2528
  for cluster_name in cluster_names:
@@ -2664,7 +2537,11 @@ def get_clusters(
2664
2537
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
2665
2538
  records = new_records
2666
2539
 
2667
- if not refresh:
2540
+ # Add auth_config to the records
2541
+ for record in records:
2542
+ _update_record_with_credentials_and_resources_str(record)
2543
+
2544
+ if refresh == common.StatusRefreshMode.NONE:
2668
2545
  return records
2669
2546
 
2670
2547
  plural = 's' if len(records) > 1 else ''
@@ -2675,12 +2552,18 @@ def get_clusters(
2675
2552
  f'Refreshing status for {len(records)} cluster{plural}'),
2676
2553
  total=len(records))
2677
2554
 
2555
+ if refresh == common.StatusRefreshMode.FORCE:
2556
+ force_refresh_statuses = set(status_lib.ClusterStatus)
2557
+ else:
2558
+ force_refresh_statuses = None
2559
+
2678
2560
  def _refresh_cluster(cluster_name):
2679
2561
  try:
2680
2562
  record = refresh_cluster_record(
2681
2563
  cluster_name,
2682
- force_refresh_statuses=set(status_lib.ClusterStatus),
2564
+ force_refresh_statuses=force_refresh_statuses,
2683
2565
  acquire_per_cluster_status_lock=True)
2566
+ _update_record_with_credentials_and_resources_str(record)
2684
2567
  except (exceptions.ClusterStatusFetchingError,
2685
2568
  exceptions.CloudUserIdentityError,
2686
2569
  exceptions.ClusterOwnerIdentityMismatchError) as e:
@@ -2692,9 +2575,11 @@ def get_clusters(
2692
2575
  return record
2693
2576
 
2694
2577
  cluster_names = [record['name'] for record in records]
2695
- with progress:
2696
- updated_records = subprocess_utils.run_in_parallel(
2697
- _refresh_cluster, cluster_names)
2578
+ updated_records = []
2579
+ if len(cluster_names) > 0:
2580
+ with progress:
2581
+ updated_records = subprocess_utils.run_in_parallel(
2582
+ _refresh_cluster, cluster_names)
2698
2583
 
2699
2584
  # Show information for removed clusters.
2700
2585
  kept_records = []
@@ -2731,6 +2616,7 @@ def get_clusters(
2731
2616
  f'{len(failed_clusters)} cluster{plural}:{reset}')
2732
2617
  for cluster_name, e in failed_clusters:
2733
2618
  logger.warning(f' {bright}{cluster_name}{reset}: {e}')
2619
+
2734
2620
  return kept_records
2735
2621
 
2736
2622
 
@@ -2960,8 +2846,7 @@ def get_endpoints(cluster: str,
2960
2846
  except ValueError:
2961
2847
  with ux_utils.print_exception_no_traceback():
2962
2848
  raise ValueError(f'Invalid endpoint {port!r}.') from None
2963
- cluster_records = get_clusters(include_controller=True,
2964
- refresh=False,
2849
+ cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
2965
2850
  cluster_names=[cluster])
2966
2851
  if not cluster_records:
2967
2852
  with ux_utils.print_exception_no_traceback():
@@ -2974,7 +2859,9 @@ def get_endpoints(cluster: str,
2974
2859
  with ux_utils.print_exception_no_traceback():
2975
2860
  raise exceptions.ClusterNotUpError(
2976
2861
  f'Cluster {cluster_record["name"]!r} '
2977
- 'is not in UP status.', cluster_record['status'])
2862
+ 'is not in UP status.',
2863
+ cluster_status=cluster_record['status'],
2864
+ handle=cluster_record['handle'])
2978
2865
  handle = cluster_record['handle']
2979
2866
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
2980
2867
  with ux_utils.print_exception_no_traceback():
@@ -2990,7 +2877,7 @@ def get_endpoints(cluster: str,
2990
2877
  except exceptions.NotSupportedError:
2991
2878
  with ux_utils.print_exception_no_traceback():
2992
2879
  raise ValueError('Querying endpoints is not supported '
2993
- f'for cluster {cluster!r} on {cloud}.') from None
2880
+ f'for {cluster!r} on {cloud}.') from None
2994
2881
 
2995
2882
  config = common_utils.read_yaml(handle.cluster_yaml)
2996
2883
  port_details = provision_lib.query_ports(repr(cloud),