skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (113)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +200 -78
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +104 -53
  7. sky/client/sdk.py +13 -5
  8. sky/client/sdk_async.py +4 -2
  9. sky/clouds/kubernetes.py +2 -1
  10. sky/clouds/runpod.py +20 -7
  11. sky/core.py +7 -53
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/mounting_utils.py +19 -10
  36. sky/execution.py +4 -2
  37. sky/global_user_state.py +224 -38
  38. sky/jobs/client/sdk.py +10 -1
  39. sky/jobs/controller.py +7 -7
  40. sky/jobs/server/core.py +3 -3
  41. sky/jobs/server/server.py +15 -11
  42. sky/jobs/utils.py +1 -1
  43. sky/logs/agent.py +30 -3
  44. sky/logs/aws.py +9 -19
  45. sky/provision/__init__.py +2 -1
  46. sky/provision/aws/instance.py +2 -1
  47. sky/provision/azure/instance.py +2 -1
  48. sky/provision/cudo/instance.py +2 -2
  49. sky/provision/do/instance.py +2 -2
  50. sky/provision/docker_utils.py +41 -19
  51. sky/provision/fluidstack/instance.py +2 -2
  52. sky/provision/gcp/instance.py +2 -1
  53. sky/provision/hyperbolic/instance.py +2 -1
  54. sky/provision/instance_setup.py +1 -1
  55. sky/provision/kubernetes/instance.py +134 -8
  56. sky/provision/lambda_cloud/instance.py +2 -1
  57. sky/provision/nebius/instance.py +2 -1
  58. sky/provision/oci/instance.py +2 -1
  59. sky/provision/paperspace/instance.py +2 -2
  60. sky/provision/primeintellect/instance.py +2 -2
  61. sky/provision/provisioner.py +1 -0
  62. sky/provision/runpod/instance.py +2 -2
  63. sky/provision/scp/instance.py +2 -2
  64. sky/provision/seeweb/instance.py +2 -1
  65. sky/provision/vast/instance.py +2 -1
  66. sky/provision/vsphere/instance.py +6 -5
  67. sky/schemas/api/responses.py +2 -1
  68. sky/serve/autoscalers.py +2 -0
  69. sky/serve/client/impl.py +45 -19
  70. sky/serve/replica_managers.py +12 -5
  71. sky/serve/serve_utils.py +5 -11
  72. sky/serve/server/core.py +9 -6
  73. sky/serve/server/impl.py +78 -25
  74. sky/serve/server/server.py +4 -5
  75. sky/serve/service_spec.py +33 -0
  76. sky/server/auth/oauth2_proxy.py +2 -2
  77. sky/server/constants.py +1 -1
  78. sky/server/daemons.py +2 -3
  79. sky/server/requests/executor.py +56 -6
  80. sky/server/requests/payloads.py +31 -8
  81. sky/server/requests/preconditions.py +2 -3
  82. sky/server/rest.py +2 -0
  83. sky/server/server.py +28 -19
  84. sky/server/stream_utils.py +34 -12
  85. sky/setup_files/dependencies.py +12 -2
  86. sky/setup_files/setup.py +44 -44
  87. sky/skylet/constants.py +2 -3
  88. sky/templates/kubernetes-ray.yml.j2 +16 -15
  89. sky/usage/usage_lib.py +3 -0
  90. sky/utils/cli_utils/status_utils.py +4 -5
  91. sky/utils/context.py +104 -29
  92. sky/utils/controller_utils.py +7 -6
  93. sky/utils/kubernetes/create_cluster.sh +13 -28
  94. sky/utils/kubernetes/delete_cluster.sh +10 -7
  95. sky/utils/kubernetes/generate_kind_config.py +6 -66
  96. sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
  97. sky/utils/kubernetes_enums.py +5 -0
  98. sky/utils/ux_utils.py +35 -1
  99. sky/utils/yaml_utils.py +9 -0
  100. sky/volumes/client/sdk.py +44 -8
  101. sky/volumes/server/server.py +33 -7
  102. sky/volumes/volume.py +22 -14
  103. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
  104. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
  105. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  109. /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
  110. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
  111. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
  112. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
  113. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c20815b0bb0906dce85a57513cf887951dbe7a8d'
+_SKYPILOT_COMMIT_SHA = 'c5a7c4995b9a92ce1c005ad783d2725c7f7f9af2'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250919'
+__version__ = '1.0.0.dev20250925'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/backends/backend.py CHANGED
@@ -95,6 +95,12 @@ class Backend(Generic[_ResourceHandleType]):
                      envs_and_secrets: Dict[str, str]) -> None:
         return self._sync_workdir(handle, workdir, envs_and_secrets)
 
+    @timeline.event
+    @usage_lib.messages.usage.update_runtime('download_file')
+    def download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                      remote_file_path: str) -> None:
+        return self._download_file(handle, local_file_path, remote_file_path)
+
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
     def sync_file_mounts(
@@ -172,6 +178,10 @@ class Backend(Generic[_ResourceHandleType]):
                       envs_and_secrets: Dict[str, str]) -> None:
        raise NotImplementedError
 
+    def _download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                       remote_file_path: str) -> None:
+        raise NotImplementedError
+
     def _sync_file_mounts(
         self,
         handle: _ResourceHandleType,
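
The new download_file entry point follows the same template-method pattern as sync_workdir above: the decorated public method carries the timeline and usage instrumentation, then dispatches to a backend-specific _download_file that defaults to NotImplementedError. A minimal runnable sketch of that pattern; the DemoBackendBase and LocalCopyBackend names are hypothetical stand-ins, not SkyPilot classes:

    # Template-method sketch: the public method wraps telemetry concerns and
    # delegates to a protected hook each backend overrides.
    import shutil
    from typing import Generic, TypeVar

    HandleT = TypeVar('HandleT')


    class DemoBackendBase(Generic[HandleT]):

        def download_file(self, handle: HandleT, local_file_path: str,
                          remote_file_path: str) -> None:
            # In SkyPilot this wrapper also records timeline/usage events.
            return self._download_file(handle, local_file_path,
                                       remote_file_path)

        def _download_file(self, handle: HandleT, local_file_path: str,
                           remote_file_path: str) -> None:
            raise NotImplementedError


    class LocalCopyBackend(DemoBackendBase[str]):
        """Toy backend whose 'remote' is just another local path."""

        def _download_file(self, handle: str, local_file_path: str,
                           remote_file_path: str) -> None:
            # Copy the 'remote' file into place locally.
            shutil.copyfile(remote_file_path, local_file_path)
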
sky/backends/backend_utils.py CHANGED
@@ -52,6 +52,7 @@ from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
+from sky.utils import context as context_lib
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
@@ -1843,7 +1844,9 @@ def check_owner_identity(cluster_name: str) -> None:
     """
     if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
         return
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if record is None:
         return
     handle = record['handle']
@@ -1930,6 +1933,7 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
     }
 
 
+@context_utils.cancellation_guard
 def _query_cluster_status_via_cloud_api(
     handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
 ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
@@ -2137,7 +2141,10 @@ def check_can_clone_disk_and_override_task(
     return task, handle
 
 
-def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+def _update_cluster_status(
+        cluster_name: str,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Update the cluster status.
 
     The cluster status is updated by checking ray cluster and real status from
@@ -2164,7 +2171,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         fetched from the cloud provider or there are leaked nodes causing
         the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     handle = record['handle']
@@ -2340,7 +2350,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=True,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
 
     # All cases below are transitioning the cluster to non-UP states.
     launched_resources = handle.launched_resources.assert_launchable()
@@ -2552,7 +2565,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=False,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
     verb = 'terminated' if to_terminate else 'stopped'
@@ -2567,7 +2583,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         nop_if_duplicate=True,
     )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
-    return global_user_state.get_cluster_from_name(cluster_name)
+    return global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
 
 
 def _must_refresh_cluster_status(
@@ -2589,12 +2608,13 @@ def _must_refresh_cluster_status(
 
 
 def refresh_cluster_record(
-    cluster_name: str,
-    *,
-    force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
-    acquire_per_cluster_status_lock: bool = True,
-    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
-) -> Optional[Dict[str, Any]]:
+        cluster_name: str,
+        *,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+        acquire_per_cluster_status_lock: bool = True,
+        cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.
 
     The function will update the cached cluster status in the global state. For
@@ -2634,7 +2654,11 @@ def refresh_cluster_record(
         the node number larger than expected.
     """
 
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    ctx = context_lib.get()
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
@@ -2653,12 +2677,16 @@ def refresh_cluster_record(
 
     # Loop until we have an up-to-date status or until we acquire the lock.
     while True:
+        # Check if the context is canceled.
+        if ctx is not None and ctx.is_canceled():
+            raise asyncio.CancelledError()
         # Check to see if we can return the cached status.
         if not _must_refresh_cluster_status(record, force_refresh_statuses):
             return record
 
         if not acquire_per_cluster_status_lock:
-            return _update_cluster_status(cluster_name)
+            return _update_cluster_status(cluster_name, include_user_info,
+                                          summary_response)
 
         # Try to acquire the lock so we can fetch the status.
         try:
@@ -2666,12 +2694,16 @@ def refresh_cluster_record(
             # Check the cluster status again, since it could have been
             # updated between our last check and acquiring the lock.
             record = global_user_state.get_cluster_from_name(
-                cluster_name)
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
             if record is None or not _must_refresh_cluster_status(
                     record, force_refresh_statuses):
                 return record
             # Update and return the cluster status.
-            return _update_cluster_status(cluster_name)
+            return _update_cluster_status(cluster_name,
+                                          include_user_info,
+                                          summary_response)
 
         except locks.LockTimeout:
             # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2692,7 +2724,10 @@ def refresh_cluster_record(
             time.sleep(lock.poll_interval)
 
             # Refresh for next loop iteration.
-            record = global_user_state.get_cluster_from_name(cluster_name)
+            record = global_user_state.get_cluster_from_name(
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
             if record is None:
                 return None
 
@@ -2717,7 +2752,9 @@ def refresh_cluster_status_handle(
         cluster_name,
         force_refresh_statuses=force_refresh_statuses,
         acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
-        cluster_status_lock_timeout=cluster_status_lock_timeout)
+        cluster_status_lock_timeout=cluster_status_lock_timeout,
+        include_user_info=False,
+        summary_response=True)
     if record is None:
         return None, None
     return record['status'], record['handle']
@@ -2768,7 +2805,9 @@ def check_cluster_available(
         exceptions.CloudUserIdentityError: if we fail to get the current user
             identity.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if dryrun:
         assert record is not None, cluster_name
         return record['handle']
@@ -2955,7 +2994,8 @@ def is_controller_accessible(
             f'fatal, but {controller_name} commands/calls may hang or return '
             'stale information, when the controller is not up.\n'
            f' Details: {common_utils.format_exception(e, use_bracket=True)}')
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(
+        cluster_name, include_user_info=False, summary_response=True)
     if record is not None:
         controller_status, handle = record['status'], record['handle']
         # We check the connection even if the cluster has a cached status UP
@@ -3012,22 +3052,98 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'
 
 
-def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+def _get_glob_clusters(
+        clusters: List[str],
+        silent: bool = False,
+        workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
     """Returns a list of clusters that match the glob pattern."""
     glob_clusters = []
     for cluster in clusters:
-        glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+        glob_cluster = global_user_state.get_glob_cluster_names(
+            cluster, workspaces_filter=workspaces_filter)
         if len(glob_cluster) == 0 and not silent:
             logger.info(f'Cluster {cluster} not found.')
         glob_clusters.extend(glob_cluster)
     return list(set(glob_clusters))
 
 
+def _refresh_cluster(
+        cluster_name: str,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    try:
+        record = refresh_cluster_record(
+            cluster_name,
+            force_refresh_statuses=force_refresh_statuses,
+            acquire_per_cluster_status_lock=True,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+    except (exceptions.ClusterStatusFetchingError,
+            exceptions.CloudUserIdentityError,
+            exceptions.ClusterOwnerIdentityMismatchError) as e:
+        # Do not fail the entire refresh process. The caller will
+        # handle the 'UNKNOWN' status, and collect the errors into
+        # a table.
+        record = {'status': 'UNKNOWN', 'error': e}
+    return record
+
+
+def refresh_cluster_records() -> None:
+    """Refreshes the status of all clusters, except managed clusters.
+
+    Used by the background status refresh daemon.
+    This function is a stripped-down version of get_clusters, with only the
+    bare bones refresh logic.
+
+    Returns:
+        None
+
+    Raises:
+        None
+    """
+    exclude_managed_clusters = True
+    if env_options.Options.SHOW_DEBUG_INFO.get():
+        exclude_managed_clusters = False
+    cluster_names = global_user_state.get_cluster_names(
+        exclude_managed_clusters=exclude_managed_clusters,)
+
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+
+    def _refresh_cluster_record(cluster_name):
+        return _refresh_cluster(cluster_name,
+                                force_refresh_statuses=set(
+                                    status_lib.ClusterStatus),
+                                include_user_info=False,
+                                summary_response=True)
+
+    if len(cluster_names) > 0:
+        # Do not refresh the clusters that have an active launch request.
+        subprocess_utils.run_in_parallel(_refresh_cluster_record,
+                                         cluster_names_without_launch_request)
+
+
 def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
    all_users: bool = True,
    include_credentials: bool = False,
+    summary_response: bool = False,
    # Internal only:
    # pylint: disable=invalid-name
    _include_is_managed: bool = False,
@@ -3055,6 +3171,23 @@ def get_clusters(
        A list of cluster records. If the cluster does not exist or has been
        terminated, the record will be omitted from the returned list.
    """
+    accessible_workspaces = workspaces_core.get_workspaces()
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)
 
     exclude_managed_clusters = False
     if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -3062,34 +3195,24 @@ def get_clusters(
     user_hashes_filter = None
     if not all_users:
         user_hashes_filter = {common_utils.get_current_user().id}
-    accessible_workspaces = workspaces_core.get_workspaces()
-
     records = global_user_state.get_clusters(
         exclude_managed_clusters=exclude_managed_clusters,
         user_hashes_filter=user_hashes_filter,
-        workspaces_filter=accessible_workspaces)
+        workspaces_filter=accessible_workspaces,
+        cluster_names=cluster_names,
+        summary_response=summary_response)
 
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL
 
     if cluster_names is not None:
-        if isinstance(cluster_names, str):
-            cluster_names = [cluster_names]
-        cluster_names = _get_glob_clusters(cluster_names, silent=True)
-        new_records = []
-        not_exist_cluster_names = []
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
-            else:
-                not_exist_cluster_names.append(cluster_name)
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
+        record_names = {record['name'] for record in records}
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
             logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
-        records = new_records
 
     def _get_records_with_handle(
            records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
@@ -3099,7 +3222,7 @@ def get_clusters(
        if record is not None and record['handle'] is not None
    ]
 
-    def _update_records_with_resources_str(
+    def _update_records_with_handle_info(
            records: List[Optional[Dict[str, Any]]]) -> None:
        """Add resource str to record"""
        for record in _get_records_with_handle(records):
@@ -3110,6 +3233,8 @@ def get_clusters(
            record[
                'resources_str_full'] = resources_utils.get_readable_resources_repr(
                    handle, simplify=False)
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
 
    def _update_records_with_credentials(
            records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3149,7 +3274,7 @@ def get_clusters(
            record['credentials'] = credential
 
    def _update_records_with_resources(
-            records: List[Optional[Dict[str, Any]]]) -> None:
+            records: List[Optional[Dict[str, Any]]],) -> None:
        """Add the resources to the record."""
        for record in _get_records_with_handle(records):
            handle = record['handle']
@@ -3168,8 +3293,8 @@ def get_clusters(
                f'{handle.launched_resources.accelerators}'
                if handle.launched_resources.accelerators else None)
 
-    # Add auth_config to the records
-    _update_records_with_resources_str(records)
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
    if include_credentials:
        _update_records_with_credentials(records)
    if refresh == common.StatusRefreshMode.NONE:
@@ -3190,47 +3315,44 @@ def get_clusters(
    else:
        force_refresh_statuses = None
 
-    def _refresh_cluster(cluster_name):
-        # TODO(syang): we should try not to leak
-        # request info in backend_utils.py.
-        # Refactor this to use some other info to
-        # determine if a launch is in progress.
-        request = requests_lib.get_request_tasks(
-            req_filter=requests_lib.RequestTaskFilter(
-                status=[requests_lib.RequestStatus.RUNNING],
-                cluster_names=[cluster_name],
-                include_request_names=['sky.launch']))
-        if len(request) > 0:
-            # There is an active launch request on the cluster,
-            # so we don't want to update the cluster status until
-            # the request is completed.
-            logger.debug(f'skipping refresh for cluster {cluster_name} '
-                         'as there is an active launch request')
-            return global_user_state.get_cluster_from_name(cluster_name)
-        try:
-            record = refresh_cluster_record(
-                cluster_name,
-                force_refresh_statuses=force_refresh_statuses,
-                acquire_per_cluster_status_lock=True)
-            _update_records_with_resources_str([record])
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        if 'error' not in record:
+            _update_records_with_handle_info([record])
            if include_credentials:
                _update_records_with_credentials([record])
-        except (exceptions.ClusterStatusFetchingError,
-                exceptions.CloudUserIdentityError,
-                exceptions.ClusterOwnerIdentityMismatchError) as e:
-            # Do not fail the entire refresh process. The caller will
-            # handle the 'UNKNOWN' status, and collect the errors into
-            # a table.
-            record = {'status': 'UNKNOWN', 'error': e}
-        progress.update(task, advance=1)
+        progress.update(task, advance=1)
        return record
 
    cluster_names = [record['name'] for record in records]
-    updated_records = []
-    if len(cluster_names) > 0:
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
+    updated_records = [
+        record for record in records
+        if record['name'] in cluster_names_with_launch_request
+    ]
+    if len(cluster_names_without_launch_request) > 0:
        with progress:
            updated_records = subprocess_utils.run_in_parallel(
-                _refresh_cluster, cluster_names)
+                _refresh_cluster_record, cluster_names_without_launch_request)
 
    # Show information for removed clusters.
    kept_records = []
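
Two patterns recur in the backend_utils.py changes above: every get_cluster_from_name call now passes include_user_info/summary_response to trim the returned record, and refresh_cluster_record checks its request context at the top of each retry iteration so a canceled request stops polling promptly instead of spinning on the per-cluster lock. A simplified, runnable sketch of that cancellation check; the Context class below is a stand-in for sky.utils.context, not its actual API:

    # Cooperative-cancellation sketch: check a context flag on every loop
    # iteration and raise asyncio.CancelledError to unwind the caller.
    import asyncio
    import threading
    import time


    class Context:
        """Simplified stand-in for a per-request cancellation context."""

        def __init__(self) -> None:
            self._canceled = threading.Event()

        def cancel(self) -> None:
            self._canceled.set()

        def is_canceled(self) -> bool:
            return self._canceled.is_set()


    def refresh_with_cancellation(ctx, fetch_record, poll_interval=0.1):
        # Mirrors the loop in refresh_cluster_record: cancellation check
        # first, then attempt to fetch/refresh, then back off and retry.
        while True:
            if ctx is not None and ctx.is_canceled():
                raise asyncio.CancelledError()
            record = fetch_record()
            if record is not None:
                return record
            time.sleep(poll_interval)


    ctx = Context()
    threading.Timer(0.5, ctx.cancel).start()
    try:
        refresh_with_cancellation(ctx, fetch_record=lambda: None)
    except asyncio.CancelledError:
        print('refresh aborted by context cancellation')
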
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -116,6 +116,9 @@ Path = str
 
 SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
 SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
+# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+# from interfering with the Ray cluster in the user's task (if any).
+UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
 
 logger = sky_logging.init_logger(__name__)
 
@@ -712,6 +715,8 @@ class RayCodeGen:
                done
                echo "skypilot: cached mount uploaded complete"
            fi""")
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
        self._code += [
            sky_env_vars_dict_str,
            textwrap.dedent(f"""\
@@ -721,6 +726,7 @@ class RayCodeGen:
        script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
 
    if script is not None:
+        script=f'{unset_ray_env_vars}; {{script}}'
        script += rclone_flush_script
        sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
 
@@ -3261,9 +3267,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        # Usage Collection:
        usage_lib.messages.usage.update_cluster_resources(
            handle.launched_nodes, launched_resources)
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        if record is not None:
-            usage_lib.messages.usage.update_cluster_status(record['status'])
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        if status is not None:
+            usage_lib.messages.usage.update_cluster_status(status)
 
        assert launched_resources.region is not None, handle
 
@@ -3532,8 +3538,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    error_message + '\n' + str(e),
                    failover_history=e.failover_history) from None
        if dryrun:
-            record = global_user_state.get_cluster_from_name(cluster_name)
-            return record['handle'] if record is not None else None, False
+            handle = global_user_state.get_handle_from_cluster_name(
+                cluster_name)
+            return handle if handle is not None else None, False
 
        if config_dict['provisioning_skipped']:
            # Skip further provisioning.
@@ -3541,10 +3548,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # ('handle', 'provision_record', 'resources_vars')
            # We need to return the handle - but it should be the existing
            # handle for the cluster.
-            record = global_user_state.get_cluster_from_name(cluster_name)
-            assert record is not None and record['handle'] is not None, (
-                cluster_name, record)
-            return record['handle'], True
+            handle = global_user_state.get_handle_from_cluster_name(
+                cluster_name)
+            assert handle is not None, (cluster_name, handle)
+            return handle, True
 
        if 'provision_record' in config_dict:
            # New provisioner is used here.
@@ -3939,6 +3946,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
        # Need this `-i` option to make sure `source ~/.bashrc` work
        setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
        runners = handle.get_command_runners(avoid_ssh_control=True)
 
        def _setup_node(node_id: int) -> None:
@@ -4088,6 +4098,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            logger.info(
                ux_utils.finishing_message('Setup completed.', setup_log_path))
 
+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
+
    def _exec_code_on_head(
        self,
        handle: CloudVmRayResourceHandle,
@@ -4992,10 +5014,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    f'{handle.cluster_name!r}. Assuming the cluster is still '
                    'up.')
            if not cluster_status_fetched:
-                record = global_user_state.get_cluster_from_name(
+                status = global_user_state.get_status_from_cluster_name(
                    handle.cluster_name)
-                prev_cluster_status = record[
-                    'status'] if record is not None else None
+                prev_cluster_status = status if status is not None else None
            if prev_cluster_status is None:
                # When the cluster is not in the cluster table, we guarantee that
                # all related resources / cache / config are cleaned up, i.e. it
@@ -5568,7 +5589,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            exceptions.InvalidClusterNameError: If the cluster name is invalid.
            # TODO(zhwu): complete the list of exceptions.
        """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
        if record is None:
            handle_before_refresh = None
            status_before_refresh = None
@@ -5589,6 +5611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            cluster_name,
            force_refresh_statuses={status_lib.ClusterStatus.INIT},
            acquire_per_cluster_status_lock=False,
+            include_user_info=False,
+            summary_response=True,
        )
        if record is not None:
            prev_cluster_status = record['status']
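
The UNSET_RAY_ENV_VARS changes above apply the same prefix in two places (the node setup command and the generated Ray task script): join one unset clause per variable with ' && ' and splice the result in front of the command with ';', so the user command runs whether or not the variables were set. The composition, extracted from the diff into a small self-contained sketch:

    # How the unset prefix in the diff composes. With the current single-entry
    # list this produces 'unset RAY_RAYLET_PID; <cmd>'.
    UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']


    def with_unset_prefix(cmd: str) -> str:
        unset_ray_env_vars = ' && '.join(
            f'unset {var}' for var in UNSET_RAY_ENV_VARS)
        return f'{unset_ray_env_vars}; {cmd}'


    print(with_unset_prefix('/bin/bash -i /tmp/sky_setup_ts 2>&1'))
    # -> unset RAY_RAYLET_PID; /bin/bash -i /tmp/sky_setup_ts 2>&1
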
sky/backends/local_docker_backend.py CHANGED
@@ -189,6 +189,15 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                       ' a NoOp. If you are running sky exec, your workdir has not'
                       ' been updated.')
 
+    def _download_file(self, handle: LocalDockerResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        # Copy from docker container to local
+        container = self.containers[handle]
+        copy_cmd = (
+            f'docker cp {container.name}:{remote_file_path} {local_file_path}')
+        subprocess.run(copy_cmd, shell=True, check=True)
+
    def _sync_file_mounts(
        self,
        handle: LocalDockerResourceHandle,
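
LocalDockerBackend._download_file above shells out to docker cp with shell=True. A hedged alternative sketch that passes an argument list instead, which sidesteps quoting issues for paths containing spaces; this is a design variation for illustration, not what this release ships:

    # Same 'docker cp' operation with per-argument quoting handled by
    # subprocess; container_name is assumed to come from the handle's
    # container object.
    import subprocess


    def docker_download_file(container_name: str, remote_file_path: str,
                             local_file_path: str) -> None:
        subprocess.run(
            ['docker', 'cp', f'{container_name}:{remote_file_path}',
             local_file_path],
            check=True)
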