skypilot-nightly 1.0.0.dev20250628__py3-none-any.whl → 1.0.0.dev20250701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +3 -3
  4. sky/client/cli/command.py +1 -2
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/1043-1b39779691bb4030.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/{141-fa5a20cbf401b351.js → 1141-726e5a3f00b67185.js} +2 -2
  9. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/{691.fd9292250ab089af.js → 1691.44e378727a41f3b5.js} +2 -2
  12. sky/dashboard/out/_next/static/chunks/{871-e547295e7e21399c.js → 1871-80dea41717729fa5.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{875.52c962183328b3f2.js → 2875.c24c6d57dc82e436.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/3698-52ad1ca228faa776.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.b3cc2bc1d49d2c3c.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/{947-6620842ef80ae879.js → 3947-b059261d6fa88a1f.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/{697.6460bf72e760addd.js → 4697.f5421144224da9fc.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/4725.4c849b1e05c8e9ad.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/{491.b3d264269613fe09.js → 5491.918ffed0ba7a5294.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +8 -0
  25. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +39 -0
  26. sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/6989-6ff4e45dfb49d11d.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{25.76c246239df93d50.js → 9025.a7c44babfe56ce09.js} +2 -2
  32. sky/dashboard/out/_next/static/chunks/{938-0a770415b5ce4649.js → 938-044ad21de8b4626b.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/9470-21d059a1dfa03f61.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/{framework-87d061ee6ed71b28.js → framework-efc06c2733009cd3.js} +1 -1
  37. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{main-e0e2335212e72357.js → main-c0a4f1ea606d48d2.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{_app-050a9e637b057b24.js → _app-a37b06ddb64521fd.js} +2 -2
  40. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-77d4816945b04793.js → [cluster]-b8e1114e6d38218c.js} +1 -1
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-9744c271a1642f76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/users-cd43fb3c122eedde.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/workspaces-06bde99155fa6292.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/webpack-d427db53e54de9ce.js +1 -0
  56. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +3 -0
  57. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  58. sky/dashboard/out/clusters/[cluster].html +1 -1
  59. sky/dashboard/out/clusters.html +1 -1
  60. sky/dashboard/out/config.html +1 -1
  61. sky/dashboard/out/index.html +1 -1
  62. sky/dashboard/out/infra/[context].html +1 -1
  63. sky/dashboard/out/infra.html +1 -1
  64. sky/dashboard/out/jobs/[job].html +1 -1
  65. sky/dashboard/out/jobs.html +1 -1
  66. sky/dashboard/out/users.html +1 -1
  67. sky/dashboard/out/volumes.html +1 -1
  68. sky/dashboard/out/workspace/new.html +1 -1
  69. sky/dashboard/out/workspaces/[name].html +1 -1
  70. sky/dashboard/out/workspaces.html +1 -1
  71. sky/jobs/controller.py +4 -0
  72. sky/jobs/server/core.py +5 -9
  73. sky/jobs/state.py +820 -670
  74. sky/jobs/utils.py +7 -15
  75. sky/optimizer.py +46 -0
  76. sky/provision/__init__.py +14 -6
  77. sky/provision/kubernetes/constants.py +9 -0
  78. sky/provision/kubernetes/instance.py +17 -14
  79. sky/provision/kubernetes/volume.py +77 -15
  80. sky/server/common.py +1 -0
  81. sky/server/server.py +37 -15
  82. sky/setup_files/dependencies.py +2 -0
  83. sky/skylet/constants.py +1 -0
  84. sky/task.py +13 -1
  85. sky/utils/dag_utils.py +4 -2
  86. sky/utils/log_utils.py +68 -0
  87. sky/volumes/server/core.py +103 -78
  88. sky/volumes/utils.py +22 -5
  89. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/METADATA +4 -1
  90. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/RECORD +96 -94
  91. sky/dashboard/out/_next/static/ZYLkkWSYZjJhLVsObh20y/_buildManifest.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/43-f38a531f6692f281.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +0 -1
  96. sky/dashboard/out/_next/static/chunks/601-111d06d9ded11d00.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/616-50a620ac4a23deb4.js +0 -39
  98. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/785.3446c12ffdf3d188.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +0 -1
  102. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +0 -8
  103. sky/dashboard/out/_next/static/chunks/937.72796f7afe54075b.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/982.d7bd80ed18cad4cc.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-21080826c6095f21.js +0 -6
  113. sky/dashboard/out/_next/static/chunks/pages/clusters-65b2c90320b8afb8.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-64bdc0b2d3a44709.js +0 -16
  119. sky/dashboard/out/_next/static/chunks/pages/jobs-df7407b5e37d3750.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/users-d7684eaa04c4f58f.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5b59bce9eb208d84.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-04e1b3ad4207b1e9.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/workspaces-c470366a6179f16e.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/webpack-75a3310ef922a299.js +0 -1
  126. sky/dashboard/out/_next/static/css/605ac87514049058.css +0 -3
  127. /sky/dashboard/out/_next/static/{ZYLkkWSYZjJhLVsObh20y → Md3rlE87jmL5uv7gSo8mR}/_ssgManifest.js +0 -0
  128. /sky/dashboard/out/_next/static/chunks/{804-4c9fc53aa74bc191.js → 804-9f5e98ce84d46bdd.js} +0 -0
  129. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/WHEEL +0 -0
  130. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/entry_points.txt +0 -0
  131. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/licenses/LICENSE +0 -0
  132. {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -180,12 +180,6 @@ def is_consolidation_mode() -> bool:
     return consolidation_mode
 
 
-def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
-    """Get the path to the HA dump script for a job."""
-    return pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser(
-    ).resolve() / f'sky_job_{job_id}'
-
-
 def ha_recovery_for_consolidation_mode():
     """Recovery logic for HA mode."""
     # No setup recovery is needed in consolidation mode, as the API server
@@ -221,17 +215,15 @@ def ha_recovery_for_consolidation_mode():
                 managed_job_state.ManagedJobScheduleState.DONE,
                 managed_job_state.ManagedJobScheduleState.WAITING
         ]:
-            dump_script_path = get_ha_dump_script_path(job_id)
-            if not dump_script_path.exists():
-                f.write(f'Job {job_id}\'s recovery file ({dump_script_path}'
-                        ') does not exist. Skipping recovery. Job '
-                        f'schedule state: {job["schedule_state"]}\n')
+            script = managed_job_state.get_ha_recovery_script(job_id)
+            if script is None:
+                f.write(f'Job {job_id}\'s recovery script does not exist. '
+                        'Skipping recovery. Job schedule state: '
+                        f'{job["schedule_state"]}\n')
                 continue
-            with open(dump_script_path, 'r', encoding='utf-8') as script_f:
-                script = script_f.read()
             runner.run(script)
-            f.write(f'Job {job_id} (file: {dump_script_path}) completed '
-                    f'recovery at {datetime.datetime.now()}\n')
+            f.write(f'Job {job_id} completed recovery at '
+                    f'{datetime.datetime.now()}\n')
     f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
     f.write(f'Total recovery time: {time.time() - start} seconds\n')

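For orientation, a minimal sketch of the new recovery path (grounded in the diff above: the script now comes from the managed-jobs state store via get_ha_recovery_script instead of a file under PERSISTENT_RUN_SCRIPT_DIR; the runner argument is any object with a run(script) method):

    from sky.jobs import state as managed_job_state

    def recover_job(job_id: int, runner) -> bool:
        """Sketch: run the persisted recovery script for one job, if any."""
        script = managed_job_state.get_ha_recovery_script(job_id)
        if script is None:
            # No script persisted for this job; skip it, as the diff does.
            return False
        runner.run(script)
        return True
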
sky/optimizer.py CHANGED
@@ -1252,6 +1252,52 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
             logger.warning(
                 f'{colorama.Fore.YELLOW}{msg}{colorama.Style.RESET_ALL}')
 
+        _check_specified_regions(task)
+
+
+def _check_specified_regions(task: task_lib.Task) -> None:
+    """Check if specified regions (Kubernetes contexts) are enabled.
+
+    Args:
+        task: The task to check.
+    """
+    # Only check for Kubernetes now
+    if not all(
+            isinstance(resources.cloud, clouds.Kubernetes)
+            for resources in task.resources):
+        return
+    # Kubernetes region is a context if set
+    for resources in task.resources:
+        if resources.region is None:
+            continue
+        existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
+        region = resources.region
+        task_name = f' {task.name!r}' if task.name is not None else ''
+        msg = f'Task{task_name} requires '
+        if region not in existing_contexts:
+            infra_str = f'Kubernetes/{region}'
+            logger.warning(f'{infra_str} is not enabled.')
+            volume_mounts_str = ''
+            if task.volume_mounts:
+                if len(task.volume_mounts) > 1:
+                    volume_mounts_str += 'volumes '
+                else:
+                    volume_mounts_str += 'volume '
+                volume_mounts_str += ', '.join(
+                    [f'{v.volume_name}' for v in task.volume_mounts])
+                volume_mounts_str += f' with infra {infra_str}'
+            if volume_mounts_str:
+                msg += volume_mounts_str
+            else:
+                msg += f'infra {infra_str}'
+            msg += (
+                f' which is not enabled. To enable access, change '
+                f'the task infra requirement or run: {colorama.Style.BRIGHT}'
+                f'sky check {colorama.Style.RESET_ALL}'
+                f'to ensure the infra is enabled.')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(msg)
+
 
 def _fill_in_launchable_resources(
     task: task_lib.Task,
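As a hedged illustration of the new check's user-facing effect (the YAML snippet and file name are hypothetical; sky.launch and sky.stream_and_get are the public SDK entry points):

    import sky
    from sky import exceptions

    # task.yaml (hypothetical) pins a Kubernetes context as the region, e.g.
    #   resources:
    #     cloud: kubernetes
    #     region: my-disabled-context
    task = sky.Task.from_yaml('task.yaml')

    try:
        sky.stream_and_get(sky.launch(task, cluster_name='demo'))
    except exceptions.ResourcesUnavailableError as e:
        # Raised at optimization time when the context is not enabled;
        # the message suggests running `sky check`.
        print(e)
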
sky/provision/__init__.py CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 from sky import models
 from sky import sky_logging
@@ -106,7 +106,7 @@ def bootstrap_instances(
 
 @_route_to_cloud_impl
 def apply_volume(provider_name: str,
-                 config: models.VolumeConfig) -> models.VolumeConfig:
+                 volume_config: models.VolumeConfig) -> models.VolumeConfig:
     """Create or register a volume.
 
     This function creates or registers a volume with the provided configuration,
@@ -117,15 +117,23 @@ def apply_volume(provider_name: str,
 
 @_route_to_cloud_impl
 def delete_volume(provider_name: str,
-                  config: models.VolumeConfig) -> models.VolumeConfig:
+                  volume_config: models.VolumeConfig) -> models.VolumeConfig:
     """Delete a volume."""
     raise NotImplementedError
 
 
 @_route_to_cloud_impl
-def get_volume_usedby(provider_name: str,
-                      config: models.VolumeConfig) -> List[str]:
-    """Get the usedby of a volume."""
+def get_volume_usedby(
+    provider_name: str,
+    volume_config: models.VolumeConfig,
+) -> Tuple[List[str], List[str]]:
+    """Get the usedby of a volume.
+
+    Returns:
+        usedby_pods: List of pods using the volume. These may include pods
+            not created by SkyPilot.
+        usedby_clusters: List of clusters using the volume.
+    """
     raise NotImplementedError
 
 
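A caller-side sketch of the new two-list contract (the 'kubernetes' provider name matches this diff; building a models.VolumeConfig is elided since its fields are not shown here):

    from typing import List, Tuple

    from sky import models
    from sky import provision

    def describe_volume_usage(
            volume_config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
        """Sketch: unpack the new (pods, clusters) return value."""
        usedby_pods, usedby_clusters = provision.get_volume_usedby(
            'kubernetes', volume_config)
        # usedby_pods may include pods not created by SkyPilot;
        # usedby_clusters contains only SkyPilot cluster names.
        return usedby_pods, usedby_clusters
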
sky/provision/kubernetes/constants.py CHANGED
@@ -15,3 +15,12 @@ SKY_K8S_EXEC_AUTH_PATH = '$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:
 
 # cache directory for kubeconfig with modified exec auth
 SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
+
+# Labels for the Pods created by SkyPilot
+TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
+TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
+TAG_POD_INITIALIZED = 'skypilot-initialized'
+TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
+
+# Pod phases that are not holding PVCs
+PVC_NOT_HOLD_POD_PHASES = ['Succeeded', 'Failed']
sky/provision/kubernetes/instance.py CHANGED
@@ -12,6 +12,7 @@ from sky.provision import common
 from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import constants as k8s_constants
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.provision.kubernetes import volume
@@ -30,14 +31,10 @@ _MAX_RETRIES = 3
 _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
 
 logger = sky_logging.init_logger(__name__)
-TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
-TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
-TAG_POD_INITIALIZED = 'skypilot-initialized'
-TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
 
 
 def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
-    return {TAG_RAY_CLUSTER_NAME: cluster_name}
+    return {k8s_constants.TAG_RAY_CLUSTER_NAME: cluster_name}
 
 
 def _is_head(pod) -> bool:
@@ -75,7 +72,8 @@ def is_high_availability_cluster_by_kubectl(
         deployment_list = kubernetes.apps_api(
             context).list_namespaced_deployment(
                 namespace,
-                label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+                label_selector=
+                f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
     except kubernetes.api_exception():
         return False
     # It is a high availability cluster if there is at least one deployment
@@ -280,10 +278,12 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     while _evaluate_timeout():
         # Get all pods in a single API call using the cluster name label
        # which all pods in new_nodes should share
-        cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+        cluster_name = new_nodes[0].metadata.labels[
+            k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
-            label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            label_selector=
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
 
         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in pods}
@@ -361,10 +361,12 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
 
     while True:
         # Get all pods in a single API call
-        cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+        cluster_name = new_nodes[0].metadata.labels[
+            k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         all_pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
-            label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            label_selector=
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
 
         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in all_pods}
@@ -732,7 +734,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     else:
         pod_spec['metadata']['labels'] = tags
     pod_spec['metadata']['labels'].update(
-        {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+        {k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
 
     terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
                                                     ['Terminating'])
@@ -841,7 +843,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                     'podAffinityTerm': {
                         'labelSelector': {
                             'matchExpressions': [{
-                                'key': TAG_SKYPILOT_CLUSTER_NAME,
+                                'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME,
                                 'operator': 'In',
                                 'values': [cluster_name_on_cloud]
                             }]
@@ -884,7 +886,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Add the deployment name as a label to the pod spec
             deployment_name = deployment_spec['metadata']['name']
             pod_spec_copy['metadata']['labels'][
-                TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
+                k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
             template_pod_spec['metadata'] = pod_spec_copy['metadata']
             template_pod_spec['spec'].update(pod_spec_copy['spec'])
             # Propagate the labels to the deployment for identification.
@@ -1289,7 +1291,8 @@ def get_command_runners(
 
     # Try to get deployment name from label first
    head_instance_info = instances[pod_name][0]
-    deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
+    deployment = head_instance_info.tags.get(
+        k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)
 
     node_list = [((namespace, context), pod_name)]
     head_runner = command_runner.KubernetesCommandRunner(
sky/provision/kubernetes/volume.py CHANGED
@@ -1,10 +1,12 @@
 """Kubernetes pvc provisioning."""
 from typing import Any, Dict, List, Optional, Tuple
 
+from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.adaptors import kubernetes
 from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import constants as k8s_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.volumes import volume as volume_lib
 
@@ -45,17 +47,26 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue
-        usedby = _get_volume_usedby(context, namespace, pvc_name)
-        if usedby:
+        usedby_pods, _ = _get_volume_usedby(context, namespace, pvc_name)
+        if usedby_pods:
             raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
                                              f'mode {access_mode} is already '
-                                             f'in use by {usedby}.')
+                                             f'in use by Pods {usedby_pods}.')
 
 
 def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Creates or registers a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_spec = _get_pvc_spec(namespace, config)
+    # Check if the storage class exists
+    storage_class_name = pvc_spec['spec'].get('storageClassName')
+    if storage_class_name is not None:
+        try:
+            kubernetes.storage_api(context).read_storage_class(
+                name=storage_class_name)
+        except kubernetes.api_exception() as e:
+            raise config_lib.KubernetesError(
+                f'Check storage class {storage_class_name} error: {e}')
     create_persistent_volume_claim(namespace, context, pvc_spec)
     return config
 
@@ -76,22 +87,73 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     return config
 
 
-def _get_volume_usedby(context: Optional[str], namespace: str,
-                       pvc_name: str) -> List[str]:
-    """Gets the usedby resources of a volume."""
-    usedby = []
+def _get_volume_usedby(
+    context: Optional[str],
+    namespace: str,
+    pvc_name: str,
+) -> Tuple[List[str], List[str]]:
+    """Gets the usedby resources of a volume.
+
+    This function returns the pods and clusters that are using the volume.
+    The usedby_pods is accurate, which also includes the Pods that are not
+    managed by SkyPilot.
+
+    Args:
+        context: Kubernetes context
+        namespace: Kubernetes namespace
+        pvc_name: PVC name
+
+    Returns:
+        usedby_pods: List of pods using the volume. These may include pods
+            not created by SkyPilot.
+        usedby_clusters: List of clusters using the volume.
+    """
+    usedby_pods = []
+    usedby_clusters = []
+    field_selector = ','.join([
+        f'status.phase!={phase}'
+        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
+    ])
+    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
-    pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
+    pods = kubernetes.core_api(context).list_namespaced_pod(
+        namespace=namespace, field_selector=field_selector)
     for pod in pods.items:
-        if pod.spec.volumes is not None:
-            for volume in pod.spec.volumes:
-                if volume.persistent_volume_claim is not None:
-                    if volume.persistent_volume_claim.claim_name == pvc_name:
-                        usedby.append(pod.metadata.name)
-    return usedby
+        if pod.spec.volumes is None:
+            continue
+        for volume in pod.spec.volumes:
+            if volume.persistent_volume_claim is None:
+                continue
+            if volume.persistent_volume_claim.claim_name == pvc_name:
+                usedby_pods.append(pod.metadata.name)
+                # Get the real cluster name
+                cluster_name_on_cloud = pod.metadata.labels.get(
+                    k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
+                if cluster_name_on_cloud is None:
+                    continue
+                cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
+                if cluster_name is not None:
+                    usedby_clusters.append(cluster_name)
+    if usedby_pods:
+        logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
+                     f' and clusters {usedby_clusters}')
+    return usedby_pods, usedby_clusters
+
+
+def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
+    """Gets the map from cluster name on cloud to cluster name."""
+    clusters = global_user_state.get_clusters()
+    cloud_to_name_map = {}
+    for cluster in clusters:
+        handle = cluster['handle']
+        if handle is None:
+            continue
+        cloud_to_name_map[handle.cluster_name_on_cloud] = cluster['name']
+    return cloud_to_name_map
 
 
-def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
+def get_volume_usedby(
+    config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
     """Gets the usedby resources of a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
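The field-selector optimization can be reproduced with the plain Kubernetes Python client; a standalone sketch (assumes a local kubeconfig; the constant mirrors PVC_NOT_HOLD_POD_PHASES above):

    from kubernetes import client, config

    PVC_NOT_HOLD_POD_PHASES = ['Succeeded', 'Failed']  # mirrors the new constant

    def pods_possibly_holding_pvcs(namespace: str):
        """Sketch: list only pods whose phase can still hold a PVC."""
        config.load_kube_config()
        field_selector = ','.join(
            f'status.phase!={phase}' for phase in PVC_NOT_HOLD_POD_PHASES)
        return client.CoreV1Api().list_namespaced_pod(
            namespace=namespace, field_selector=field_selector).items
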
sky/server/common.py CHANGED
@@ -293,6 +293,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
         try:
             response = make_authenticated_request('GET',
                                                   '/api/health',
+                                                  server_url=server_url,
                                                   timeout=2.5)
         except requests.exceptions.Timeout:
             if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
sky/server/server.py CHANGED
@@ -277,29 +277,51 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle Bearer Token Auth (Service Accounts)."""
 
     async def dispatch(self, request: fastapi.Request, call_next):
-        # Only process requests with Bearer token authorization header
+        """Make sure correct bearer token auth is present.
+
+        1. If the request has the X-Skypilot-Auth-Mode: token header, it must
+           have a valid bearer token.
+        2. For backwards compatibility, if the request has a Bearer token
+           beginning with "sky_" (even if X-Skypilot-Auth-Mode is not
+           present), it must be a valid token.
+        3. If X-Skypilot-Auth-Mode is not set to "token", and there is no
+           Bearer token beginning with "sky_", allow the request to continue.
+
+        In conjunction with an auth proxy, the idea is to make the auth proxy
+        bypass requests with bearer tokens, instead setting the
+        X-Skypilot-Auth-Mode header. The auth proxy should either validate the
+        auth or set the header X-Skypilot-Auth-Mode: token.
+        """
+        has_skypilot_auth_header = (
+            request.headers.get('X-Skypilot-Auth-Mode') == 'token')
         auth_header = request.headers.get('authorization')
-        if not auth_header or not auth_header.lower().startswith('bearer '):
+        has_bearer_token_starting_with_sky = (
+            auth_header and auth_header.lower().startswith('bearer ') and
+            auth_header.split(' ', 1)[1].startswith('sky_'))
+
+        if (not has_skypilot_auth_header and
+                not has_bearer_token_starting_with_sky):
+            # This is case #3 above. We do not need to validate the request.
             # No Bearer token, continue with normal processing (OAuth2 cookies,
             # etc.)
             return await call_next(request)
+        # After this point, all requests must be validated.
+
+        if auth_header is None:
+            return fastapi.responses.JSONResponse(
+                status_code=401, content={'detail': 'Authentication required'})
 
         # Extract token
-        sa_token = auth_header.split(' ', 1)[1]
+        split_header = auth_header.split(' ', 1)
+        if split_header[0].lower() != 'bearer':
+            return fastapi.responses.JSONResponse(
+                status_code=401,
+                content={'detail': 'Invalid authentication method'})
+        sa_token = split_header[1]
 
         # Handle SkyPilot service account tokens
-        if sa_token.startswith('sky_'):
-            return await self._handle_service_account_token(
-                request, sa_token, call_next)
-
-        # Handle other Bearer tokens (OAuth2 access tokens, etc.)
-        # These requests bypassed OAuth2 proxy, so let the application decide
-        # how to handle them
-        # For now, we'll let them continue through normal processing
-        logger.debug(
-            'Non-SkyPilot Bearer token detected, continuing with normal '
-            'processing')
-        return await call_next(request)
+        return await self._handle_service_account_token(request, sa_token,
+                                                        call_next)
 
     async def _handle_service_account_token(self, request: fastapi.Request,
                                             sa_token: str, call_next):
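Client-side view of the three cases, as a hedged sketch (the address is the assumed default local API server; the token is a placeholder):

    import requests

    base = 'http://127.0.0.1:46580'  # assumed default API server address

    # Case 2: a Bearer token starting with 'sky_' must validate (401 if not).
    requests.get(f'{base}/api/health',
                 headers={'Authorization': 'Bearer sky_PLACEHOLDER'})

    # Case 1: an auth proxy that bypassed its own auth marks the request;
    # without a valid bearer token this now returns 401.
    requests.get(f'{base}/api/health',
                 headers={'X-Skypilot-Auth-Mode': 'token'})

    # Case 3: neither marker present; the request proceeds to normal handling.
    requests.get(f'{base}/api/health')
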
sky/setup_files/dependencies.py CHANGED
@@ -65,12 +65,14 @@ install_requires = [
     # Required for API server metrics
     'prometheus_client>=0.8.0',
     'passlib',
+    'pyjwt',
 ]
 
 server_dependencies = [
     'casbin',
     'sqlalchemy_adapter',
     'passlib',
+    'pyjwt',
 ]
 
 local_ray = [
sky/skylet/constants.py CHANGED
@@ -481,6 +481,7 @@ MEMORY_SIZE_PATTERN = (
     ')?$')
 
 LAST_USE_TRUNC_LENGTH = 25
+USED_BY_TRUNC_LENGTH = 25
 
 MIN_PRIORITY = -1000
 MAX_PRIORITY = 1000
sky/task.py CHANGED
@@ -884,6 +884,18 @@ class Task:
     def volumes(self) -> Dict[str, str]:
         return self._volumes
 
+    def set_volumes(self, volumes: Dict[str, str]) -> None:
+        """Sets the volumes for this task.
+
+        Args:
+            volumes: a dict of ``{mount_path: volume_name}``.
+        """
+        self._volumes = volumes
+
+    def update_volumes(self, volumes: Dict[str, str]) -> None:
+        """Updates the volumes for this task."""
+        self._volumes.update(volumes)
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -1500,7 +1512,7 @@ class Task:
                 d[k] = v
         return d
 
-    def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
+    def to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
         """Returns a yaml-style dict representation of the task.
 
         INTERNAL: this method is internal-facing.
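A short usage sketch for the new setters (mount paths and volume names are hypothetical):

    import sky

    task = sky.Task(run='ls /data')
    task.set_volumes({'/data': 'my-volume'})        # replaces the mapping
    task.update_volumes({'/ckpt': 'ckpt-volume'})   # merges into the mapping
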
sky/utils/dag_utils.py CHANGED
@@ -147,11 +147,13 @@ def load_chain_dag_from_yaml_str(
     return _load_chain_dag(configs, env_overrides, secrets_overrides)
 
 
-def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
+def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
+                               redact_secrets: bool = False) -> str:
     """Dumps a chain DAG to a YAML string.
 
     Args:
         dag: the DAG to dump.
+        redact_secrets: whether to redact secrets in the YAML string.
 
     Returns:
         The YAML string.
@@ -159,7 +161,7 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
     assert dag.is_chain(), dag
     configs = [{'name': dag.name}]
     for task in dag.tasks:
-        configs.append(task.to_yaml_config())
+        configs.append(task.to_yaml_config(redact_secrets=redact_secrets))
     return common_utils.dump_yaml_str(configs)

sky/utils/log_utils.py CHANGED
@@ -573,6 +573,74 @@ def readable_time_duration(start: Optional[float],
     return diff
 
 
+def human_duration(start: int, end: Optional[int] = None) -> str:
+    """Calculates the time elapsed between two timestamps and returns
+    it as a human-readable string, similar to Kubernetes' duration format.
+
+    Args:
+        start: The start time as a Unix timestamp (seconds since epoch).
+        end: The end time as a Unix timestamp (seconds since epoch).
+            If None, current time is used.
+
+    Returns:
+        A string representing the duration, e.g., "2d3h", "15m", "30s".
+        Returns "0s" for zero, negative durations, or if the timestamp
+        is invalid.
+    """
+    if not start or start <= 0:
+        return '0s'
+
+    if end is None:
+        end = int(time.time())
+    duration_seconds = end - start
+
+    units = {
+        'y': 365 * 24 * 60 * 60,
+        'd': 60 * 60 * 24,
+        'h': 60 * 60,
+        'm': 60,
+        's': 1,
+    }
+
+    if duration_seconds <= 0:
+        return '0s'
+    elif duration_seconds < 60 * 2:
+        return f'{duration_seconds}s'
+
+    minutes = int(duration_seconds / units['m'])
+    if minutes < 10:
+        s = int(duration_seconds / units['s']) % 60
+        if s == 0:
+            return f'{minutes}m'
+        return f'{minutes}m{s}s'
+    elif minutes < 60 * 3:
+        return f'{minutes}m'
+
+    hours = int(duration_seconds / units['h'])
+    days = int(hours / 24)
+    years = int(hours / 24 / 365)
+    if hours < 8:
+        m = int(duration_seconds / units['m']) % 60
+        if m == 0:
+            return f'{hours}h'
+        return f'{hours}h{m}m'
+    elif hours < 48:
+        return f'{hours}h'
+    elif hours < 24 * 8:
+        h = hours % 24
+        if h == 0:
+            return f'{days}d'
+        return f'{days}d{h}h'
+    elif hours < 24 * 365 * 2:
+        return f'{days}d'
+    elif hours < 24 * 365 * 8:
+        dy = int(hours / 24) % 365
+        if dy == 0:
+            return f'{years}y'
+        return f'{years}y{dy}d'
+    return f'{years}y'
+
+
 def follow_logs(
     file: TextIO,
     *,
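Expected outputs for human_duration, traced from the branches above (a sketch; boundaries chosen to hit distinct branches):

    import time

    from sky.utils import log_utils

    now = int(time.time())
    log_utils.human_duration(now - 30, now)             # '30s'
    log_utils.human_duration(now - 5 * 60, now)         # '5m'
    log_utils.human_duration(now - 47 * 3600, now)      # '47h'
    log_utils.human_duration(now - 2 * 24 * 3600, now)  # '2d'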