skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; consult the package registry's advisory for this release for more details.

Files changed (123):
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
7
7
  from sky.utils import directory_utils
8
8
 
9
9
  # Replaced with the current commit when building the wheels.
10
- _SKYPILOT_COMMIT_SHA = '5fc4b25c5fd6b2833aabf992583d1b1e3f843f42'
10
+ _SKYPILOT_COMMIT_SHA = '827d534c8bbfa61b895467b9431283e923dd9841'
11
11
 
12
12
 
13
13
  def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
37
37
 
38
38
 
39
39
  __commit__ = _get_git_commit()
40
- __version__ = '1.0.0.dev20250922'
40
+ __version__ = '1.0.0.dev20250926'
41
41
  __root_dir__ = directory_utils.get_sky_dir()
42
42
 
43
43
 
sky/backends/backend.py CHANGED
@@ -95,6 +95,12 @@ class Backend(Generic[_ResourceHandleType]):
95
95
  envs_and_secrets: Dict[str, str]) -> None:
96
96
  return self._sync_workdir(handle, workdir, envs_and_secrets)
97
97
 
98
+ @timeline.event
99
+ @usage_lib.messages.usage.update_runtime('download_file')
100
+ def download_file(self, handle: _ResourceHandleType, local_file_path: str,
101
+ remote_file_path: str) -> None:
102
+ return self._download_file(handle, local_file_path, remote_file_path)
103
+
98
104
  @timeline.event
99
105
  @usage_lib.messages.usage.update_runtime('sync_file_mounts')
100
106
  def sync_file_mounts(
@@ -172,6 +178,10 @@ class Backend(Generic[_ResourceHandleType]):
172
178
  envs_and_secrets: Dict[str, str]) -> None:
173
179
  raise NotImplementedError
174
180
 
181
+ def _download_file(self, handle: _ResourceHandleType, local_file_path: str,
182
+ remote_file_path: str) -> None:
183
+ raise NotImplementedError
184
+
175
185
  def _sync_file_mounts(
176
186
  self,
177
187
  handle: _ResourceHandleType,
@@ -52,6 +52,7 @@ from sky.utils import cluster_utils
52
52
  from sky.utils import command_runner
53
53
  from sky.utils import common
54
54
  from sky.utils import common_utils
55
+ from sky.utils import context as context_lib
55
56
  from sky.utils import context_utils
56
57
  from sky.utils import controller_utils
57
58
  from sky.utils import env_options
@@ -796,7 +797,7 @@ def write_cluster_config(
796
797
  cloud=str(cloud).lower(),
797
798
  region=region.name,
798
799
  keys=('use_ssm',),
799
- default_value=False)
800
+ default_value=None)
800
801
 
801
802
  if use_ssm and ssh_proxy_command is not None:
802
803
  raise exceptions.InvalidCloudConfigs(
@@ -804,15 +805,18 @@ def write_cluster_config(
804
805
  f'is already set to {ssh_proxy_command!r}. Please remove '
805
806
  'ssh_proxy_command or set use_ssm to false.')
806
807
 
807
- if not use_ssm and use_internal_ips and ssh_proxy_command is None:
808
- logger.warning(
809
- f'{colorama.Fore.YELLOW}'
810
- 'use_internal_ips is set to true, '
811
- 'but ssh_proxy_command is not set. Defaulting to '
812
- 'using SSM. Specify ssh_proxy_command to use a different '
813
- 'https://docs.skypilot.co/en/latest/reference/config.html#'
814
- f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
815
- use_ssm = True
808
+ if use_internal_ips and ssh_proxy_command is None:
809
+ # Only if use_ssm is explicitly not set, we default to using SSM.
810
+ if use_ssm is None:
811
+ logger.warning(
812
+ f'{colorama.Fore.YELLOW}'
813
+ 'use_internal_ips is set to true, '
814
+ 'but ssh_proxy_command is not set. Defaulting to '
815
+ 'using SSM. Specify ssh_proxy_command to use a different '
816
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
817
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
818
+ use_ssm = True
819
+
816
820
  if use_ssm:
817
821
  aws_profile = os.environ.get('AWS_PROFILE', None)
818
822
  profile_str = f'--profile {aws_profile}' if aws_profile else ''
@@ -1843,7 +1847,9 @@ def check_owner_identity(cluster_name: str) -> None:
1843
1847
  """
1844
1848
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
1845
1849
  return
1846
- record = global_user_state.get_cluster_from_name(cluster_name)
1850
+ record = global_user_state.get_cluster_from_name(cluster_name,
1851
+ include_user_info=False,
1852
+ summary_response=True)
1847
1853
  if record is None:
1848
1854
  return
1849
1855
  handle = record['handle']
@@ -1930,6 +1936,7 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
1930
1936
  }
1931
1937
 
1932
1938
 
1939
+ @context_utils.cancellation_guard
1933
1940
  def _query_cluster_status_via_cloud_api(
1934
1941
  handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
1935
1942
  ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
@@ -2137,7 +2144,10 @@ def check_can_clone_disk_and_override_task(
2137
2144
  return task, handle
2138
2145
 
2139
2146
 
2140
- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2147
+ def _update_cluster_status(
2148
+ cluster_name: str,
2149
+ include_user_info: bool = True,
2150
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
2141
2151
  """Update the cluster status.
2142
2152
 
2143
2153
  The cluster status is updated by checking ray cluster and real status from
@@ -2164,7 +2174,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2164
2174
  fetched from the cloud provider or there are leaked nodes causing
2165
2175
  the node number larger than expected.
2166
2176
  """
2167
- record = global_user_state.get_cluster_from_name(cluster_name)
2177
+ record = global_user_state.get_cluster_from_name(
2178
+ cluster_name,
2179
+ include_user_info=include_user_info,
2180
+ summary_response=summary_response)
2168
2181
  if record is None:
2169
2182
  return None
2170
2183
  handle = record['handle']
@@ -2340,7 +2353,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2340
2353
  ready=True,
2341
2354
  is_launch=False,
2342
2355
  existing_cluster_hash=record['cluster_hash'])
2343
- return global_user_state.get_cluster_from_name(cluster_name)
2356
+ return global_user_state.get_cluster_from_name(
2357
+ cluster_name,
2358
+ include_user_info=include_user_info,
2359
+ summary_response=summary_response)
2344
2360
 
2345
2361
  # All cases below are transitioning the cluster to non-UP states.
2346
2362
  launched_resources = handle.launched_resources.assert_launchable()
@@ -2552,7 +2568,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2552
2568
  ready=False,
2553
2569
  is_launch=False,
2554
2570
  existing_cluster_hash=record['cluster_hash'])
2555
- return global_user_state.get_cluster_from_name(cluster_name)
2571
+ return global_user_state.get_cluster_from_name(
2572
+ cluster_name,
2573
+ include_user_info=include_user_info,
2574
+ summary_response=summary_response)
2556
2575
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
2557
2576
  # STOPPED.
2558
2577
  verb = 'terminated' if to_terminate else 'stopped'
@@ -2567,7 +2586,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2567
2586
  nop_if_duplicate=True,
2568
2587
  )
2569
2588
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
2570
- return global_user_state.get_cluster_from_name(cluster_name)
2589
+ return global_user_state.get_cluster_from_name(
2590
+ cluster_name,
2591
+ include_user_info=include_user_info,
2592
+ summary_response=summary_response)
2571
2593
 
2572
2594
 
2573
2595
  def _must_refresh_cluster_status(
@@ -2589,12 +2611,13 @@ def _must_refresh_cluster_status(
2589
2611
 
2590
2612
 
2591
2613
  def refresh_cluster_record(
2592
- cluster_name: str,
2593
- *,
2594
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2595
- acquire_per_cluster_status_lock: bool = True,
2596
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
2597
- ) -> Optional[Dict[str, Any]]:
2614
+ cluster_name: str,
2615
+ *,
2616
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2617
+ acquire_per_cluster_status_lock: bool = True,
2618
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
2619
+ include_user_info: bool = True,
2620
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
2598
2621
  """Refresh the cluster, and return the possibly updated record.
2599
2622
 
2600
2623
  The function will update the cached cluster status in the global state. For
@@ -2634,7 +2657,11 @@ def refresh_cluster_record(
2634
2657
  the node number larger than expected.
2635
2658
  """
2636
2659
 
2637
- record = global_user_state.get_cluster_from_name(cluster_name)
2660
+ ctx = context_lib.get()
2661
+ record = global_user_state.get_cluster_from_name(
2662
+ cluster_name,
2663
+ include_user_info=include_user_info,
2664
+ summary_response=summary_response)
2638
2665
  if record is None:
2639
2666
  return None
2640
2667
  # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
@@ -2653,12 +2680,16 @@ def refresh_cluster_record(
2653
2680
 
2654
2681
  # Loop until we have an up-to-date status or until we acquire the lock.
2655
2682
  while True:
2683
+ # Check if the context is canceled.
2684
+ if ctx is not None and ctx.is_canceled():
2685
+ raise asyncio.CancelledError()
2656
2686
  # Check to see if we can return the cached status.
2657
2687
  if not _must_refresh_cluster_status(record, force_refresh_statuses):
2658
2688
  return record
2659
2689
 
2660
2690
  if not acquire_per_cluster_status_lock:
2661
- return _update_cluster_status(cluster_name)
2691
+ return _update_cluster_status(cluster_name, include_user_info,
2692
+ summary_response)
2662
2693
 
2663
2694
  # Try to acquire the lock so we can fetch the status.
2664
2695
  try:
@@ -2666,12 +2697,16 @@ def refresh_cluster_record(
2666
2697
  # Check the cluster status again, since it could have been
2667
2698
  # updated between our last check and acquiring the lock.
2668
2699
  record = global_user_state.get_cluster_from_name(
2669
- cluster_name)
2700
+ cluster_name,
2701
+ include_user_info=include_user_info,
2702
+ summary_response=summary_response)
2670
2703
  if record is None or not _must_refresh_cluster_status(
2671
2704
  record, force_refresh_statuses):
2672
2705
  return record
2673
2706
  # Update and return the cluster status.
2674
- return _update_cluster_status(cluster_name)
2707
+ return _update_cluster_status(cluster_name,
2708
+ include_user_info,
2709
+ summary_response)
2675
2710
 
2676
2711
  except locks.LockTimeout:
2677
2712
  # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2692,7 +2727,10 @@ def refresh_cluster_record(
2692
2727
  time.sleep(lock.poll_interval)
2693
2728
 
2694
2729
  # Refresh for next loop iteration.
2695
- record = global_user_state.get_cluster_from_name(cluster_name)
2730
+ record = global_user_state.get_cluster_from_name(
2731
+ cluster_name,
2732
+ include_user_info=include_user_info,
2733
+ summary_response=summary_response)
2696
2734
  if record is None:
2697
2735
  return None
2698
2736
 
@@ -2717,7 +2755,9 @@ def refresh_cluster_status_handle(
2717
2755
  cluster_name,
2718
2756
  force_refresh_statuses=force_refresh_statuses,
2719
2757
  acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
2720
- cluster_status_lock_timeout=cluster_status_lock_timeout)
2758
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
2759
+ include_user_info=False,
2760
+ summary_response=True)
2721
2761
  if record is None:
2722
2762
  return None, None
2723
2763
  return record['status'], record['handle']
@@ -2768,7 +2808,9 @@ def check_cluster_available(
2768
2808
  exceptions.CloudUserIdentityError: if we fail to get the current user
2769
2809
  identity.
2770
2810
  """
2771
- record = global_user_state.get_cluster_from_name(cluster_name)
2811
+ record = global_user_state.get_cluster_from_name(cluster_name,
2812
+ include_user_info=False,
2813
+ summary_response=True)
2772
2814
  if dryrun:
2773
2815
  assert record is not None, cluster_name
2774
2816
  return record['handle']
@@ -2955,7 +2997,8 @@ def is_controller_accessible(
2955
2997
  f'fatal, but {controller_name} commands/calls may hang or return '
2956
2998
  'stale information, when the controller is not up.\n'
2957
2999
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
2958
- record = global_user_state.get_cluster_from_name(cluster_name)
3000
+ record = global_user_state.get_cluster_from_name(
3001
+ cluster_name, include_user_info=False, summary_response=True)
2959
3002
  if record is not None:
2960
3003
  controller_status, handle = record['status'], record['handle']
2961
3004
  # We check the connection even if the cluster has a cached status UP
@@ -3012,22 +3055,98 @@ class CloudFilter(enum.Enum):
3012
3055
  LOCAL = 'local'
3013
3056
 
3014
3057
 
3015
- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
3058
+ def _get_glob_clusters(
3059
+ clusters: List[str],
3060
+ silent: bool = False,
3061
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
3016
3062
  """Returns a list of clusters that match the glob pattern."""
3017
3063
  glob_clusters = []
3018
3064
  for cluster in clusters:
3019
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
3065
+ glob_cluster = global_user_state.get_glob_cluster_names(
3066
+ cluster, workspaces_filter=workspaces_filter)
3020
3067
  if len(glob_cluster) == 0 and not silent:
3021
3068
  logger.info(f'Cluster {cluster} not found.')
3022
3069
  glob_clusters.extend(glob_cluster)
3023
3070
  return list(set(glob_clusters))
3024
3071
 
3025
3072
 
3073
+ def _refresh_cluster(
3074
+ cluster_name: str,
3075
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
3076
+ include_user_info: bool = True,
3077
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
3078
+ try:
3079
+ record = refresh_cluster_record(
3080
+ cluster_name,
3081
+ force_refresh_statuses=force_refresh_statuses,
3082
+ acquire_per_cluster_status_lock=True,
3083
+ include_user_info=include_user_info,
3084
+ summary_response=summary_response)
3085
+ except (exceptions.ClusterStatusFetchingError,
3086
+ exceptions.CloudUserIdentityError,
3087
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
3088
+ # Do not fail the entire refresh process. The caller will
3089
+ # handle the 'UNKNOWN' status, and collect the errors into
3090
+ # a table.
3091
+ record = {'status': 'UNKNOWN', 'error': e}
3092
+ return record
3093
+
3094
+
3095
+ def refresh_cluster_records() -> None:
3096
+ """Refreshes the status of all clusters, except managed clusters.
3097
+
3098
+ Used by the background status refresh daemon.
3099
+ This function is a stripped-down version of get_clusters, with only the
3100
+ bare bones refresh logic.
3101
+
3102
+ Returns:
3103
+ None
3104
+
3105
+ Raises:
3106
+ None
3107
+ """
3108
+ exclude_managed_clusters = True
3109
+ if env_options.Options.SHOW_DEBUG_INFO.get():
3110
+ exclude_managed_clusters = False
3111
+ cluster_names = global_user_state.get_cluster_names(
3112
+ exclude_managed_clusters=exclude_managed_clusters,)
3113
+
3114
+ # TODO(syang): we should try not to leak
3115
+ # request info in backend_utils.py.
3116
+ # Refactor this to use some other info to
3117
+ # determine if a launch is in progress.
3118
+ request = requests_lib.get_request_tasks(
3119
+ req_filter=requests_lib.RequestTaskFilter(
3120
+ status=[requests_lib.RequestStatus.RUNNING],
3121
+ cluster_names=cluster_names,
3122
+ include_request_names=['sky.launch']))
3123
+ cluster_names_with_launch_request = {
3124
+ request.cluster_name for request in request
3125
+ }
3126
+ cluster_names_without_launch_request = [
3127
+ cluster_name for cluster_name in cluster_names
3128
+ if cluster_name not in cluster_names_with_launch_request
3129
+ ]
3130
+
3131
+ def _refresh_cluster_record(cluster_name):
3132
+ return _refresh_cluster(cluster_name,
3133
+ force_refresh_statuses=set(
3134
+ status_lib.ClusterStatus),
3135
+ include_user_info=False,
3136
+ summary_response=True)
3137
+
3138
+ if len(cluster_names) > 0:
3139
+ # Do not refresh the clusters that have an active launch request.
3140
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
3141
+ cluster_names_without_launch_request)
3142
+
3143
+
3026
3144
  def get_clusters(
3027
3145
  refresh: common.StatusRefreshMode,
3028
3146
  cluster_names: Optional[Union[str, List[str]]] = None,
3029
3147
  all_users: bool = True,
3030
3148
  include_credentials: bool = False,
3149
+ summary_response: bool = False,
3031
3150
  # Internal only:
3032
3151
  # pylint: disable=invalid-name
3033
3152
  _include_is_managed: bool = False,
@@ -3055,10 +3174,23 @@ def get_clusters(
3055
3174
  A list of cluster records. If the cluster does not exist or has been
3056
3175
  terminated, the record will be omitted from the returned list.
3057
3176
  """
3177
+ accessible_workspaces = workspaces_core.get_workspaces()
3058
3178
  if cluster_names is not None:
3059
3179
  if isinstance(cluster_names, str):
3060
3180
  cluster_names = [cluster_names]
3061
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
3181
+ non_glob_cluster_names = []
3182
+ glob_cluster_names = []
3183
+ for cluster_name in cluster_names:
3184
+ if ux_utils.is_glob_pattern(cluster_name):
3185
+ glob_cluster_names.append(cluster_name)
3186
+ else:
3187
+ non_glob_cluster_names.append(cluster_name)
3188
+ cluster_names = non_glob_cluster_names
3189
+ if glob_cluster_names:
3190
+ cluster_names += _get_glob_clusters(
3191
+ glob_cluster_names,
3192
+ silent=True,
3193
+ workspaces_filter=accessible_workspaces)
3062
3194
 
3063
3195
  exclude_managed_clusters = False
3064
3196
  if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -3066,13 +3198,12 @@ def get_clusters(
3066
3198
  user_hashes_filter = None
3067
3199
  if not all_users:
3068
3200
  user_hashes_filter = {common_utils.get_current_user().id}
3069
- accessible_workspaces = workspaces_core.get_workspaces()
3070
3201
  records = global_user_state.get_clusters(
3071
3202
  exclude_managed_clusters=exclude_managed_clusters,
3072
3203
  user_hashes_filter=user_hashes_filter,
3073
3204
  workspaces_filter=accessible_workspaces,
3074
3205
  cluster_names=cluster_names,
3075
- )
3206
+ summary_response=summary_response)
3076
3207
 
3077
3208
  yellow = colorama.Fore.YELLOW
3078
3209
  bright = colorama.Style.BRIGHT
@@ -3080,12 +3211,10 @@ def get_clusters(
3080
3211
 
3081
3212
  if cluster_names is not None:
3082
3213
  record_names = {record['name'] for record in records}
3083
- not_exist_cluster_names = [
3084
- cluster_name for cluster_name in cluster_names
3085
- if cluster_name not in record_names
3086
- ]
3087
- if not_exist_cluster_names:
3088
- clusters_str = ', '.join(not_exist_cluster_names)
3214
+ not_found_clusters = ux_utils.get_non_matched_query(
3215
+ cluster_names, record_names)
3216
+ if not_found_clusters:
3217
+ clusters_str = ', '.join(not_found_clusters)
3089
3218
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
3090
3219
 
3091
3220
  def _get_records_with_handle(
@@ -3096,7 +3225,7 @@ def get_clusters(
3096
3225
  if record is not None and record['handle'] is not None
3097
3226
  ]
3098
3227
 
3099
- def _update_records_with_resources_str(
3228
+ def _update_records_with_handle_info(
3100
3229
  records: List[Optional[Dict[str, Any]]]) -> None:
3101
3230
  """Add resource str to record"""
3102
3231
  for record in _get_records_with_handle(records):
@@ -3107,6 +3236,8 @@ def get_clusters(
3107
3236
  record[
3108
3237
  'resources_str_full'] = resources_utils.get_readable_resources_repr(
3109
3238
  handle, simplify=False)
3239
+ if not summary_response:
3240
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
3110
3241
 
3111
3242
  def _update_records_with_credentials(
3112
3243
  records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3146,7 +3277,7 @@ def get_clusters(
3146
3277
  record['credentials'] = credential
3147
3278
 
3148
3279
  def _update_records_with_resources(
3149
- records: List[Optional[Dict[str, Any]]]) -> None:
3280
+ records: List[Optional[Dict[str, Any]]],) -> None:
3150
3281
  """Add the resources to the record."""
3151
3282
  for record in _get_records_with_handle(records):
3152
3283
  handle = record['handle']
@@ -3165,8 +3296,8 @@ def get_clusters(
3165
3296
  f'{handle.launched_resources.accelerators}'
3166
3297
  if handle.launched_resources.accelerators else None)
3167
3298
 
3168
- # Add auth_config to the records
3169
- _update_records_with_resources_str(records)
3299
+ # Add handle info to the records
3300
+ _update_records_with_handle_info(records)
3170
3301
  if include_credentials:
3171
3302
  _update_records_with_credentials(records)
3172
3303
  if refresh == common.StatusRefreshMode.NONE:
@@ -3187,47 +3318,44 @@ def get_clusters(
3187
3318
  else:
3188
3319
  force_refresh_statuses = None
3189
3320
 
3190
- def _refresh_cluster(cluster_name):
3191
- # TODO(syang): we should try not to leak
3192
- # request info in backend_utils.py.
3193
- # Refactor this to use some other info to
3194
- # determine if a launch is in progress.
3195
- request = requests_lib.get_request_tasks(
3196
- req_filter=requests_lib.RequestTaskFilter(
3197
- status=[requests_lib.RequestStatus.RUNNING],
3198
- cluster_names=[cluster_name],
3199
- include_request_names=['sky.launch']))
3200
- if len(request) > 0:
3201
- # There is an active launch request on the cluster,
3202
- # so we don't want to update the cluster status until
3203
- # the request is completed.
3204
- logger.debug(f'skipping refresh for cluster {cluster_name} '
3205
- 'as there is an active launch request')
3206
- return global_user_state.get_cluster_from_name(cluster_name)
3207
- try:
3208
- record = refresh_cluster_record(
3209
- cluster_name,
3210
- force_refresh_statuses=force_refresh_statuses,
3211
- acquire_per_cluster_status_lock=True)
3212
- _update_records_with_resources_str([record])
3321
+ def _refresh_cluster_record(cluster_name):
3322
+ record = _refresh_cluster(cluster_name,
3323
+ force_refresh_statuses=force_refresh_statuses,
3324
+ include_user_info=True,
3325
+ summary_response=summary_response)
3326
+ if 'error' not in record:
3327
+ _update_records_with_handle_info([record])
3213
3328
  if include_credentials:
3214
3329
  _update_records_with_credentials([record])
3215
- except (exceptions.ClusterStatusFetchingError,
3216
- exceptions.CloudUserIdentityError,
3217
- exceptions.ClusterOwnerIdentityMismatchError) as e:
3218
- # Do not fail the entire refresh process. The caller will
3219
- # handle the 'UNKNOWN' status, and collect the errors into
3220
- # a table.
3221
- record = {'status': 'UNKNOWN', 'error': e}
3222
- progress.update(task, advance=1)
3330
+ progress.update(task, advance=1)
3223
3331
  return record
3224
3332
 
3225
3333
  cluster_names = [record['name'] for record in records]
3226
- updated_records = []
3227
- if len(cluster_names) > 0:
3334
+ # TODO(syang): we should try not to leak
3335
+ # request info in backend_utils.py.
3336
+ # Refactor this to use some other info to
3337
+ # determine if a launch is in progress.
3338
+ request = requests_lib.get_request_tasks(
3339
+ req_filter=requests_lib.RequestTaskFilter(
3340
+ status=[requests_lib.RequestStatus.RUNNING],
3341
+ cluster_names=cluster_names,
3342
+ include_request_names=['sky.launch']))
3343
+ cluster_names_with_launch_request = {
3344
+ request.cluster_name for request in request
3345
+ }
3346
+ cluster_names_without_launch_request = [
3347
+ cluster_name for cluster_name in cluster_names
3348
+ if cluster_name not in cluster_names_with_launch_request
3349
+ ]
3350
+ # for clusters that have an active launch request, we do not refresh the status
3351
+ updated_records = [
3352
+ record for record in records
3353
+ if record['name'] in cluster_names_with_launch_request
3354
+ ]
3355
+ if len(cluster_names_without_launch_request) > 0:
3228
3356
  with progress:
3229
3357
  updated_records = subprocess_utils.run_in_parallel(
3230
- _refresh_cluster, cluster_names)
3358
+ _refresh_cluster_record, cluster_names_without_launch_request)
3231
3359
 
3232
3360
  # Show information for removed clusters.
3233
3361
  kept_records = []
@@ -116,6 +116,9 @@ Path = str
116
116
 
117
117
  SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
118
118
  SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
119
+ # Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
120
+ # from interfering with the Ray cluster in the user's task (if any).
121
+ UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
119
122
 
120
123
  logger = sky_logging.init_logger(__name__)
121
124
 
@@ -712,6 +715,8 @@ class RayCodeGen:
712
715
  done
713
716
  echo "skypilot: cached mount uploaded complete"
714
717
  fi""")
718
+ unset_ray_env_vars = ' && '.join(
719
+ [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
715
720
  self._code += [
716
721
  sky_env_vars_dict_str,
717
722
  textwrap.dedent(f"""\
@@ -721,6 +726,7 @@ class RayCodeGen:
721
726
  script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
722
727
 
723
728
  if script is not None:
729
+ script=f'{unset_ray_env_vars}; {{script}}'
724
730
  script += rclone_flush_script
725
731
  sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
726
732
 
@@ -3261,9 +3267,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3261
3267
  # Usage Collection:
3262
3268
  usage_lib.messages.usage.update_cluster_resources(
3263
3269
  handle.launched_nodes, launched_resources)
3264
- record = global_user_state.get_cluster_from_name(cluster_name)
3265
- if record is not None:
3266
- usage_lib.messages.usage.update_cluster_status(record['status'])
3270
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
3271
+ if status is not None:
3272
+ usage_lib.messages.usage.update_cluster_status(status)
3267
3273
 
3268
3274
  assert launched_resources.region is not None, handle
3269
3275
 
@@ -3532,8 +3538,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3532
3538
  error_message + '\n' + str(e),
3533
3539
  failover_history=e.failover_history) from None
3534
3540
  if dryrun:
3535
- record = global_user_state.get_cluster_from_name(cluster_name)
3536
- return record['handle'] if record is not None else None, False
3541
+ handle = global_user_state.get_handle_from_cluster_name(
3542
+ cluster_name)
3543
+ return handle if handle is not None else None, False
3537
3544
 
3538
3545
  if config_dict['provisioning_skipped']:
3539
3546
  # Skip further provisioning.
@@ -3541,10 +3548,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3541
3548
  # ('handle', 'provision_record', 'resources_vars')
3542
3549
  # We need to return the handle - but it should be the existing
3543
3550
  # handle for the cluster.
3544
- record = global_user_state.get_cluster_from_name(cluster_name)
3545
- assert record is not None and record['handle'] is not None, (
3546
- cluster_name, record)
3547
- return record['handle'], True
3551
+ handle = global_user_state.get_handle_from_cluster_name(
3552
+ cluster_name)
3553
+ assert handle is not None, (cluster_name, handle)
3554
+ return handle, True
3548
3555
 
3549
3556
  if 'provision_record' in config_dict:
3550
3557
  # New provisioner is used here.
@@ -3939,6 +3946,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3939
3946
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3940
3947
  # Need this `-i` option to make sure `source ~/.bashrc` works
3941
3948
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
3949
+ unset_ray_env_vars = ' && '.join(
3950
+ [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
3951
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3942
3952
  runners = handle.get_command_runners(avoid_ssh_control=True)
3943
3953
 
3944
3954
  def _setup_node(node_id: int) -> None:
@@ -4088,6 +4098,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4088
4098
  logger.info(
4089
4099
  ux_utils.finishing_message('Setup completed.', setup_log_path))
4090
4100
 
4101
+ def _download_file(self, handle: CloudVmRayResourceHandle,
4102
+ local_file_path: str, remote_file_path: str) -> None:
4103
+ """Syncs file from remote to local."""
4104
+ runners = handle.get_command_runners()
4105
+ head_runner = runners[0]
4106
+ head_runner.rsync(
4107
+ source=local_file_path,
4108
+ target=remote_file_path,
4109
+ up=False,
4110
+ stream_logs=False,
4111
+ )
4112
+
4091
4113
  def _exec_code_on_head(
4092
4114
  self,
4093
4115
  handle: CloudVmRayResourceHandle,
@@ -4992,10 +5014,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4992
5014
  f'{handle.cluster_name!r}. Assuming the cluster is still '
4993
5015
  'up.')
4994
5016
  if not cluster_status_fetched:
4995
- record = global_user_state.get_cluster_from_name(
5017
+ status = global_user_state.get_status_from_cluster_name(
4996
5018
  handle.cluster_name)
4997
- prev_cluster_status = record[
4998
- 'status'] if record is not None else None
5019
+ prev_cluster_status = status if status is not None else None
4999
5020
  if prev_cluster_status is None:
5000
5021
  # When the cluster is not in the cluster table, we guarantee that
5001
5022
  # all related resources / cache / config are cleaned up, i.e. it
@@ -5568,7 +5589,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5568
5589
  exceptions.InvalidClusterNameError: If the cluster name is invalid.
5569
5590
  # TODO(zhwu): complete the list of exceptions.
5570
5591
  """
5571
- record = global_user_state.get_cluster_from_name(cluster_name)
5592
+ record = global_user_state.get_cluster_from_name(
5593
+ cluster_name, include_user_info=False, summary_response=True)
5572
5594
  if record is None:
5573
5595
  handle_before_refresh = None
5574
5596
  status_before_refresh = None
@@ -5589,6 +5611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5589
5611
  cluster_name,
5590
5612
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
5591
5613
  acquire_per_cluster_status_lock=False,
5614
+ include_user_info=False,
5615
+ summary_response=True,
5592
5616
  )
5593
5617
  if record is not None:
5594
5618
  prev_cluster_status = record['status']