skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/backends/backend_utils.py
@@ -48,6 +48,7 @@ from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import usage_lib
+ from sky.utils import auth_utils
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
@@ -755,7 +756,7 @@ def write_cluster_config(
  assert k not in credentials, f'{k} already in credentials'
  credentials[k] = v

- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  auth_config = {'ssh_private_key': private_key_path}
  region_name = resources_vars.get('region')

@@ -1124,6 +1125,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  config = auth.setup_fluidstack_authentication(config)
  elif isinstance(cloud, clouds.Hyperbolic):
  config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
  elif isinstance(cloud, clouds.PrimeIntellect):
  config = auth.setup_primeintellect_authentication(config)
  elif isinstance(cloud, clouds.Seeweb):
@@ -1855,6 +1858,13 @@ def check_owner_identity(cluster_name: str) -> None:
  summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
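The hunk above splits the owner-identity check in two: check_owner_identity() keeps its public signature and only performs the database lookup, while the new _check_owner_identity_with_record() works on a record the caller has already fetched, so the refresh path can skip a second database read. A minimal standalone sketch of that pattern (names and data simplified, not the actual SkyPilot implementation):

from typing import Any, Dict, Optional


def get_cluster_from_name(name: str) -> Optional[Dict[str, Any]]:
    # Stand-in for the global_user_state lookup.
    fake_db = {'my-cluster': {'name': 'my-cluster', 'status': 'UP'}}
    return fake_db.get(name)


def check_owner_identity(cluster_name: str) -> None:
    # Public entry point: fetch the record, then delegate.
    record = get_cluster_from_name(cluster_name)
    if record is None:
        return
    _check_owner_identity_with_record(cluster_name, record)


def _check_owner_identity_with_record(cluster_name: str,
                                      record: Dict[str, Any]) -> None:
    # Works on an already-fetched record, so callers that hold the record
    # (e.g. the refresh path) avoid a second database round trip.
    print(f'checking owner identity for {cluster_name} ({record["status"]})')


check_owner_identity('my-cluster')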
@@ -1941,7 +1951,8 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:

  @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
  ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
  """Returns the status of the cluster as a list of tuples corresponding
  to the node status and an optional reason string for said status.
@@ -1968,8 +1979,11 @@ def _query_cluster_status_via_cloud_api(
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name, cluster_name_on_cloud,
- provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
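The new retry_if_missing flag is threaded from the status-refresh path down to provision_lib.query_instances as a keyword argument; per the docstring added later in this diff, it controls whether the cloud query is retried when the cluster is not found on the cloud. A hedged, self-contained sketch of what such a retry loop could look like (the helper names below are illustrative, not SkyPilot's):

import time
from typing import Dict, Optional


def _query_cloud_once(cluster_name_on_cloud: str) -> Dict[str, Optional[str]]:
    # Stand-in for a single cloud API call; returns {instance_id: status}.
    return {}


def query_instances(cluster_name_on_cloud: str,
                    retry_if_missing: bool,
                    attempts: int = 3,
                    delay_seconds: float = 2.0) -> Dict[str, Optional[str]]:
    statuses: Dict[str, Optional[str]] = {}
    for _ in range(attempts):
        statuses = _query_cloud_once(cluster_name_on_cloud)
        if statuses or not retry_if_missing:
            return statuses
        # Nothing found yet; the cloud may just be slow to list new nodes.
        time.sleep(delay_seconds)
    return statuses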
@@ -2149,6 +2163,8 @@ def check_can_clone_disk_and_override_task(

  def _update_cluster_status(
  cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
  include_user_info: bool = True,
  summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.
@@ -2177,12 +2193,6 @@ def _update_cluster_status(
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(
- cluster_name,
- include_user_info=include_user_info,
- summary_response=summary_response)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
@@ -2201,7 +2211,8 @@ def _update_cluster_status(
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

  all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
  for status in node_statuses) and
@@ -2376,7 +2387,8 @@ def _update_cluster_status(
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2620,7 +2632,8 @@ def refresh_cluster_record(
  cluster_lock_already_held: bool = False,
  cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
  include_user_info: bool = True,
- summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2649,6 +2662,8 @@ def refresh_cluster_record(
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2675,10 +2690,9 @@ def refresh_cluster_record(
  # using the correct cloud credentials.
  workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
  with skypilot_config.local_active_workspace_ctx(workspace):
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)

  # The loop logic allows us to notice if the status was updated in the
  # global_user_state by another process and stop trying to get the lock.
@@ -2695,7 +2709,9 @@ def refresh_cluster_record(
  return record

  if cluster_lock_already_held:
- return _update_cluster_status(cluster_name, include_user_info,
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
  summary_response)

  # Try to acquire the lock so we can fetch the status.
@@ -2711,7 +2727,8 @@ def refresh_cluster_record(
  record, force_refresh_statuses):
  return record
  # Update and return the cluster status.
- return _update_cluster_status(cluster_name,
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
  include_user_info,
  summary_response)

@@ -2749,7 +2766,8 @@ def refresh_cluster_status_handle(
  *,
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
  cluster_lock_already_held: bool = False,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ retry_if_missing: bool = True,
  ) -> Tuple[Optional[status_lib.ClusterStatus],
  Optional[backends.ResourceHandle]]:
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2764,7 +2782,8 @@ def refresh_cluster_status_handle(
  cluster_lock_already_held=cluster_lock_already_held,
  cluster_status_lock_timeout=cluster_status_lock_timeout,
  include_user_info=False,
- summary_response=True)
+ summary_response=True,
+ retry_if_missing=retry_if_missing)
  if record is None:
  return None, None
  return record['status'], record['handle']
@@ -3115,25 +3134,23 @@ def refresh_cluster_records() -> None:
  exclude_managed_clusters = True
  if env_options.Options.SHOW_DEBUG_INFO.get():
  exclude_managed_clusters = False
- cluster_names = global_user_state.get_cluster_names(
- exclude_managed_clusters=exclude_managed_clusters,)
+ cluster_names = set(
+ global_user_state.get_cluster_names(
+ exclude_managed_clusters=exclude_managed_clusters,))

  # TODO(syang): we should try not to leak
  # request info in backend_utils.py.
  # Refactor this to use some other info to
  # determine if a launch is in progress.
- request = requests_lib.get_request_tasks(
- req_filter=requests_lib.RequestTaskFilter(
- status=[requests_lib.RequestStatus.RUNNING],
- cluster_names=cluster_names,
- include_request_names=['sky.launch']))
  cluster_names_with_launch_request = {
- request.cluster_name for request in request
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
  }
- cluster_names_without_launch_request = [
- cluster_name for cluster_name in cluster_names
- if cluster_name not in cluster_names_with_launch_request
- ]
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)

  def _refresh_cluster_record(cluster_name):
  return _refresh_cluster(cluster_name,
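In this hunk cluster_names becomes a set, the request query asks only for the cluster_name field, and "clusters without an in-flight sky.launch" is computed as a set difference instead of a list comprehension. A toy illustration of the resulting filter (data and names are made up):

from typing import List, Set


def clusters_to_refresh(cluster_names: Set[str],
                        launching_clusters: List[str]) -> Set[str]:
    # Clusters with a RUNNING `sky.launch` request are skipped so a status
    # refresh cannot race with provisioning.
    return cluster_names - set(launching_clusters)


print(clusters_to_refresh({'dev', 'train', 'serve'}, ['train']))
# -> {'dev', 'serve'} (order may vary; it is a set)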
@@ -3142,7 +3159,7 @@ def refresh_cluster_records() -> None:
  include_user_info=False,
  summary_response=True)

- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  # Do not refresh the clusters that have an active launch request.
  subprocess_utils.run_in_parallel(_refresh_cluster_record,
  cluster_names_without_launch_request)
@@ -3154,6 +3171,7 @@ def get_clusters(
  all_users: bool = True,
  include_credentials: bool = False,
  summary_response: bool = False,
+ include_handle: bool = True,
  # Internal only:
  # pylint: disable=invalid-name
  _include_is_managed: bool = False,
@@ -3237,12 +3255,11 @@ def get_clusters(
  """Add resource str to record"""
  for record in _get_records_with_handle(records):
  handle = record['handle']
- record[
- 'resources_str'] = resources_utils.get_readable_resources_repr(
- handle, simplify=True)
- record[
- 'resources_str_full'] = resources_utils.get_readable_resources_repr(
- handle, simplify=False)
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
  if not summary_response:
  record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud

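Here resources_utils.get_readable_resources_repr() is called once with simplified_only=False and, judging from the unpacking above, returns both the short and the full resources string, so the handle is rendered once instead of twice. A rough sketch of a helper with that shape (illustrative only; the real formatting lives in resources_utils):

from typing import Tuple


def readable_resources_repr(num_nodes: int,
                            instance_type: str) -> Tuple[str, str]:
    # Build the short and the full representation in one pass.
    simple = f'{num_nodes}x {instance_type}'
    full = f'{num_nodes}x {instance_type} (disk_size=256, ports=[], ...)'
    return simple, full


resources_str, resources_str_full = readable_resources_repr(2, 'g5.2xlarge')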
@@ -3268,9 +3285,17 @@ def get_clusters(
  expanded_private_key_path = os.path.expanduser(
  ssh_private_key_path)
  if not os.path.exists(expanded_private_key_path):
- auth.create_ssh_key_files_from_db(ssh_private_key_path)
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  expanded_private_key_path = os.path.expanduser(private_key_path)
  if expanded_private_key_path in cached_private_keys:
  credential['ssh_private_key_content'] = cached_private_keys[
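When the cached private key file is missing, auth_utils.create_ssh_key_files_from_db() now reports success or failure, and on failure the record is skipped instead of being populated with a path to a non-existent key. A simplified standalone sketch of that fallback (helper bodies are stand-ins, not SkyPilot's code):

import os
from typing import Any, Dict, List


def create_ssh_key_files_from_db(key_path: str) -> bool:
    # Stand-in: report whether key material existed in the DB and was written.
    return False


def attach_ssh_credentials(records: List[Dict[str, Any]], key_path: str) -> None:
    expanded = os.path.expanduser(key_path)
    for record in records:
        if not os.path.exists(expanded):
            if not create_ssh_key_files_from_db(key_path):
                # Key cannot be re-created: leave this record without
                # credentials instead of pointing at a missing file.
                continue
        with open(expanded, 'r', encoding='utf-8') as f:
            record['ssh_private_key_content'] = f.read()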
@@ -3302,6 +3327,8 @@ def get_clusters(
  record['accelerators'] = (
  f'{handle.launched_resources.accelerators}'
  if handle.launched_resources.accelerators else None)
+ if not include_handle:
+ record.pop('handle', None)

  # Add handle info to the records
  _update_records_with_handle_info(records)
@@ -3330,7 +3357,10 @@ def get_clusters(
  force_refresh_statuses=force_refresh_statuses,
  include_user_info=True,
  summary_response=summary_response)
- if 'error' not in record:
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
  _update_records_with_handle_info([record])
  if include_credentials:
  _update_records_with_credentials([record])
@@ -3342,45 +3372,56 @@ def get_clusters(
  # request info in backend_utils.py.
  # Refactor this to use some other info to
  # determine if a launch is in progress.
- request = requests_lib.get_request_tasks(
- req_filter=requests_lib.RequestTaskFilter(
- status=[requests_lib.RequestStatus.RUNNING],
- cluster_names=cluster_names,
- include_request_names=['sky.launch']))
  cluster_names_with_launch_request = {
- request.cluster_name for request in request
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
  }
+ # Preserve the index of the cluster name as it appears on "records"
  cluster_names_without_launch_request = [
- cluster_name for cluster_name in cluster_names
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
  if cluster_name not in cluster_names_with_launch_request
  ]
  # for clusters that have an active launch request, we do not refresh the status
- updated_records = [
- record for record in records
- if record['name'] in cluster_names_with_launch_request
- ]
+ updated_records = []
  if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster_record, cluster_names_without_launch_request)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])
+ kept_records.append(updated_record)

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
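In this last hunk the refresh results are keyed by the index of each record rather than zipped positionally, so clusters skipped because a sky.launch request is still running keep their cached record, refreshed clusters are replaced, and clusters that disappeared from the cloud are dropped and reported. A toy version of that merge with made-up data (not the actual SkyPilot code, which refreshes records in parallel):

from typing import Any, Dict, List, Optional

records: List[Dict[str, Any]] = [
    {'name': 'a', 'status': 'INIT'},  # launch in flight -> not refreshed
    {'name': 'b', 'status': 'UP'},    # refreshed, still exists
    {'name': 'c', 'status': 'UP'},    # refreshed, gone on the cloud
]
# Refreshed results keyed by the index into `records`; None means the
# cluster no longer exists on the cloud.
refreshed: Dict[int, Optional[Dict[str, Any]]] = {
    1: {'name': 'b', 'status': 'STOPPED'},
    2: None,
}

kept_records = []
for i, record in enumerate(records):
    if i not in refreshed:
        kept_records.append(record)        # keep the cached record as-is
    elif refreshed[i] is not None:
        kept_records.append(refreshed[i])  # take the refreshed record
    # else: drop it; the real code also reports the removal to the user

print([r['name'] for r in kept_records])   # ['a', 'b']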