skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +200 -78
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +104 -53
  7. sky/client/sdk.py +13 -5
  8. sky/client/sdk_async.py +4 -2
  9. sky/clouds/kubernetes.py +2 -1
  10. sky/clouds/runpod.py +20 -7
  11. sky/core.py +7 -53
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/mounting_utils.py +19 -10
  36. sky/execution.py +4 -2
  37. sky/global_user_state.py +224 -38
  38. sky/jobs/client/sdk.py +10 -1
  39. sky/jobs/controller.py +7 -7
  40. sky/jobs/server/core.py +3 -3
  41. sky/jobs/server/server.py +15 -11
  42. sky/jobs/utils.py +1 -1
  43. sky/logs/agent.py +30 -3
  44. sky/logs/aws.py +9 -19
  45. sky/provision/__init__.py +2 -1
  46. sky/provision/aws/instance.py +2 -1
  47. sky/provision/azure/instance.py +2 -1
  48. sky/provision/cudo/instance.py +2 -2
  49. sky/provision/do/instance.py +2 -2
  50. sky/provision/docker_utils.py +41 -19
  51. sky/provision/fluidstack/instance.py +2 -2
  52. sky/provision/gcp/instance.py +2 -1
  53. sky/provision/hyperbolic/instance.py +2 -1
  54. sky/provision/instance_setup.py +1 -1
  55. sky/provision/kubernetes/instance.py +134 -8
  56. sky/provision/lambda_cloud/instance.py +2 -1
  57. sky/provision/nebius/instance.py +2 -1
  58. sky/provision/oci/instance.py +2 -1
  59. sky/provision/paperspace/instance.py +2 -2
  60. sky/provision/primeintellect/instance.py +2 -2
  61. sky/provision/provisioner.py +1 -0
  62. sky/provision/runpod/instance.py +2 -2
  63. sky/provision/scp/instance.py +2 -2
  64. sky/provision/seeweb/instance.py +2 -1
  65. sky/provision/vast/instance.py +2 -1
  66. sky/provision/vsphere/instance.py +6 -5
  67. sky/schemas/api/responses.py +2 -1
  68. sky/serve/autoscalers.py +2 -0
  69. sky/serve/client/impl.py +45 -19
  70. sky/serve/replica_managers.py +12 -5
  71. sky/serve/serve_utils.py +5 -11
  72. sky/serve/server/core.py +9 -6
  73. sky/serve/server/impl.py +78 -25
  74. sky/serve/server/server.py +4 -5
  75. sky/serve/service_spec.py +33 -0
  76. sky/server/auth/oauth2_proxy.py +2 -2
  77. sky/server/constants.py +1 -1
  78. sky/server/daemons.py +2 -3
  79. sky/server/requests/executor.py +56 -6
  80. sky/server/requests/payloads.py +31 -8
  81. sky/server/requests/preconditions.py +2 -3
  82. sky/server/rest.py +2 -0
  83. sky/server/server.py +28 -19
  84. sky/server/stream_utils.py +34 -12
  85. sky/setup_files/dependencies.py +12 -2
  86. sky/setup_files/setup.py +44 -44
  87. sky/skylet/constants.py +2 -3
  88. sky/templates/kubernetes-ray.yml.j2 +16 -15
  89. sky/usage/usage_lib.py +3 -0
  90. sky/utils/cli_utils/status_utils.py +4 -5
  91. sky/utils/context.py +104 -29
  92. sky/utils/controller_utils.py +7 -6
  93. sky/utils/kubernetes/create_cluster.sh +13 -28
  94. sky/utils/kubernetes/delete_cluster.sh +10 -7
  95. sky/utils/kubernetes/generate_kind_config.py +6 -66
  96. sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
  97. sky/utils/kubernetes_enums.py +5 -0
  98. sky/utils/ux_utils.py +35 -1
  99. sky/utils/yaml_utils.py +9 -0
  100. sky/volumes/client/sdk.py +44 -8
  101. sky/volumes/server/server.py +33 -7
  102. sky/volumes/volume.py +22 -14
  103. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
  104. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
  105. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  109. /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
  110. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
  111. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
  112. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
  113. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/kubernetes_deploy_utils.py CHANGED
@@ -4,6 +4,7 @@ import shlex
  import subprocess
  import sys
  import tempfile
+ import textwrap
  from typing import List, Optional

  import colorama
@@ -24,6 +25,9 @@ logger = sky_logging.init_logger(__name__)

  # Default path for Kubernetes configuration file
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+ DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
+ LOCAL_CLUSTER_PORT_RANGE = 101
+ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000


  def check_ssh_cluster_dependencies(
@@ -252,7 +256,68 @@ def deploy_remote_cluster(ip_list: List[str],
  is_local=True))


- def deploy_local_cluster(gpus: bool):
+ def generate_kind_config(port_start: int,
+ num_nodes: int = 1,
+ gpus: bool = False) -> str:
+ """Generate a kind cluster config with ports mapped from host to container
+
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
+ Internally, this will map to ports 30000 - 30100
+
+ Args:
+ path: Path to generate the config file at
+ port_start: Port range start for mappings
+ num_nodes: Number of nodes in the cluster
+ gpus: If true, initialize kind cluster with GPU support
+
+ Returns:
+ The kind cluster config
+ """
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+ internal_end = internal_start + LOCAL_CLUSTER_PORT_RANGE - 1
+
+ config = textwrap.dedent(f"""
+ apiVersion: kind.x-k8s.io/v1alpha4
+ kind: Cluster
+ kubeadmConfigPatches:
+ - |
+ kind: ClusterConfiguration
+ apiServer:
+ extraArgs:
+ "service-node-port-range": {internal_start}-{internal_end}
+ nodes:
+ - role: control-plane
+ kubeadmConfigPatches:
+ - |
+ kind: InitConfiguration
+ nodeRegistration:
+ kubeletExtraArgs:
+ node-labels: "ingress-ready=true"
+ """)
+ if gpus:
+ config += textwrap.indent(
+ textwrap.dedent("""
+ extraMounts:
+ - hostPath: /dev/null
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
+ config += textwrap.indent(textwrap.dedent("""
+ extraPortMappings:"""), ' ' * 2)
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
+ config += textwrap.indent(
+ textwrap.dedent(f"""
+ - containerPort: {internal_start + offset}
+ hostPort: {port_start + offset}
+ listenAddress: "0.0.0.0"
+ protocol: tcp
+ """), ' ' * 2)
+ if num_nodes > 1:
+ config += '- role: worker\n' * (num_nodes - 1)
+ return config
+
+
+ def deploy_local_cluster(name: Optional[str], gpus: bool):
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
+ context_name = f'kind-{name}'
  cluster_created = False

  # Check if GPUs are available on the host
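For orientation, a minimal sketch (not part of the diff) of how the new helper could be exercised; the module path is derived from the files-changed list above, and the call uses only the signature shown in this hunk:

    from sky.utils.kubernetes import kubernetes_deploy_utils as deploy_utils

    # With port_start=30000 and LOCAL_CLUSTER_PORT_RANGE=101, host ports
    # 30000-30100 are mapped 1:1 onto the kind node's NodePort range.
    kind_config = deploy_utils.generate_kind_config(port_start=30000,
                                                    num_nodes=1,
                                                    gpus=False)
    print(kind_config)  # a kind.x-k8s.io/v1alpha4 Cluster manifest as a string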
@@ -262,41 +327,57 @@ def deploy_local_cluster(gpus: bool):
  # Check if ~/.kube/config exists:
  if os.path.exists(os.path.expanduser('~/.kube/config')):
  curr_context = kubernetes_utils.get_current_kube_config_context_name()
- skypilot_context = 'kind-skypilot'
- if curr_context is not None and curr_context != skypilot_context:
+ if curr_context is not None and curr_context != context_name:
  logger.info(
  f'Current context in kube config: {curr_context}'
- '\nWill automatically switch to kind-skypilot after the local '
- 'cluster is created.')
- message_str = 'Creating local cluster{}...'
- message_str = message_str.format((' with GPU support (this may take up '
- 'to 15 minutes)') if gpus else '')
- path_to_package = os.path.dirname(__file__)
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
-
- # Get directory of script and run it from there
- cwd = os.path.dirname(os.path.abspath(up_script_path))
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
- run_command = shlex.split(run_command)
+ f'\nWill automatically switch to {context_name} after the '
+ 'local cluster is created.')
+ message_str = 'Creating local cluster {}{}...'
+ message_str = message_str.format(
+ name,
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
+
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
+ delete=True) as f:
+ # Choose random port range to use on the host machine.
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
+ # port_start = random.randint(300, 399) * 100
+ # TODO (kyuds): hard coding to pass smoketests. Need to figure out
+ # how to deal with this later.
+ port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+ port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
+ logger.debug(f'Using port range {port_start}-{port_end}')
+ f.write(generate_kind_config(port_start, gpus=gpus))
+ f.flush()
+
+ path_to_package = os.path.dirname(__file__)
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
+
+ # Get directory of script and run it from there
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
+ run_command = f'{up_script_path} {name} {f.name}'
+ if gpus:
+ run_command += ' --gpus'
+ run_command = shlex.split(run_command)

- # Setup logging paths
- run_timestamp = sky_logging.get_run_timestamp()
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
- 'local_up.log')
- logger.info(message_str)
+ # Setup logging paths
+ run_timestamp = sky_logging.get_run_timestamp()
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+ 'local_up.log')
+ logger.info(message_str)

- with rich_utils.safe_status(
- ux_utils.spinner_message(message_str,
- log_path=log_path,
- is_local=True)):
- returncode, _, stderr = log_lib.run_with_log(
- cmd=run_command,
- log_path=log_path,
- require_outputs=True,
- stream_logs=False,
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
- is_local=True),
- cwd=cwd)
+ with rich_utils.safe_status(
+ ux_utils.spinner_message(message_str,
+ log_path=log_path,
+ is_local=True)):
+ returncode, _, stderr = log_lib.run_with_log(
+ cmd=run_command,
+ log_path=log_path,
+ require_outputs=True,
+ stream_logs=False,
+ line_processor=log_utils.SkyLocalUpLineProcessor(
+ log_path=log_path, is_local=True),
+ cwd=cwd)

  # Kind always writes to stderr even if it succeeds.
  # If the failure happens after the cluster is created, we need
@@ -309,11 +390,11 @@ def deploy_local_cluster(gpus: bool):
  elif returncode == 100:
  logger.info(
  ux_utils.finishing_message(
- 'Local cluster already exists.\n',
+ f'Local cluster {name} already exists.\n',
  log_path=log_path,
  is_local=True,
  follow_up_message=
- 'If you want to delete it instead, run: sky local down'))
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
  else:
  with ux_utils.print_exception_no_traceback():
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -339,7 +420,7 @@ def deploy_local_cluster(gpus: bool):
  if gpus:
  # Get GPU model by querying the node labels
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
  try:
  # Run the command and capture the output
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -375,8 +456,9 @@ def deploy_local_cluster(gpus: bool):
  'This may cause issues with running tasks.')
  logger.info(
  ux_utils.finishing_message(
- message=(f'Local Kubernetes cluster created successfully with '
- f'{num_cpus} CPUs{gpu_message}.'),
+ message=(
+ f'Local Kubernetes cluster {name} created successfully '
+ f'with {num_cpus} CPUs{gpu_message}.'),
  log_path=log_path,
  is_local=True,
  follow_up_message=(
@@ -384,3 +466,54 @@ def deploy_local_cluster(gpus: bool):
  'Hint: To change the number of CPUs, change your docker '
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
  f'{gpu_hint}')))
+
+
+ def teardown_local_cluster(name: Optional[str] = None):
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
+ cluster_removed = False
+
+ path_to_package = os.path.dirname(__file__)
+ down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
+
+ cwd = os.path.dirname(os.path.abspath(down_script_path))
+ run_command = f'{down_script_path} {name}'
+ run_command = shlex.split(run_command)
+
+ # Setup logging paths
+ run_timestamp = sky_logging.get_run_timestamp()
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+ 'local_down.log')
+
+ with rich_utils.safe_status(
+ ux_utils.spinner_message(f'Removing local cluster {name}',
+ log_path=log_path,
+ is_local=True)):
+
+ returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+ log_path=log_path,
+ require_outputs=True,
+ stream_logs=False,
+ cwd=cwd)
+ stderr = stderr.replace('No kind clusters found.\n', '')
+
+ if returncode == 0:
+ cluster_removed = True
+ elif returncode == 100:
+ logger.info(
+ ux_utils.error_message(f'Local cluster {name} does not exist.'))
+ else:
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(f'Failed to down local cluster {name}. '
+ f'Stdout: {stdout}'
+ f'\nError: {stderr}')
+ if cluster_removed:
+ # Run sky check
+ with rich_utils.safe_status(
+ ux_utils.spinner_message('Running sky check...')):
+ sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
+ clouds=['kubernetes'],
+ quiet=True)
+ logger.info(
+ ux_utils.finishing_message(f'Local cluster {name} removed.',
+ log_path=log_path,
+ is_local=True))
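Taken together, the create and teardown paths are now name-aware. A hedged sketch (not part of the diff) using only the signatures added above; the default name 'skypilot' yields the kube context 'kind-skypilot':

    from sky.utils.kubernetes import kubernetes_deploy_utils as deploy_utils

    # Creates a kind cluster named 'demo'; its kube context becomes 'kind-demo'.
    deploy_utils.deploy_local_cluster(name='demo', gpus=False)
    # Tears it down again; passing None falls back to DEFAULT_LOCAL_CLUSTER_NAME.
    deploy_utils.teardown_local_cluster(name='demo')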
sky/utils/kubernetes_enums.py CHANGED
@@ -44,3 +44,8 @@ class KubernetesAutoscalerType(enum.Enum):
  KARPENTER = 'karpenter'
  COREWEAVE = 'coreweave'
  GENERIC = 'generic'
+
+ def emits_autoscale_event(self) -> bool:
+ """Returns whether specific autoscaler emits the event reason
+ TriggeredScaleUp."""
+ return self not in {self.KARPENTER}
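A small illustration (not part of the diff) of the new enum helper; per the hunk, only Karpenter is treated as not emitting TriggeredScaleUp events:

    from sky.utils.kubernetes_enums import KubernetesAutoscalerType

    assert KubernetesAutoscalerType.GENERIC.emits_autoscale_event()
    assert not KubernetesAutoscalerType.KARPENTER.emits_autoscale_event()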
sky/utils/ux_utils.py CHANGED
@@ -1,11 +1,12 @@
  """Utility functions for UX."""
  import contextlib
  import enum
+ import fnmatch
  import os
  import sys
  import traceback
  import typing
- from typing import Callable, Optional, Union
+ from typing import Callable, Iterable, List, Optional, Union

  import colorama

@@ -288,3 +289,36 @@ def command_hint_messages(hint_type: CommandHintType,
  f'{BOLD}sky jobs queue{RESET_BOLD}')
  else:
  raise ValueError(f'Invalid hint type: {hint_type}')
+
+
+ def is_glob_pattern(pattern: str) -> bool:
+ """Checks if a string contains common glob pattern wildcards."""
+ glob_chars = {'*', '?', '[', ']'}
+ # Also check for '**' as a specific globstar pattern
+ if '**' in pattern:
+ return True
+ for char in pattern:
+ if char in glob_chars:
+ return True
+ return False
+
+
+ def get_non_matched_query(query_clusters: Iterable[str],
+ cluster_names: Iterable[str]) -> List[str]:
+ """Gets the non-matched query clusters."""
+ glob_query_clusters = []
+ non_glob_query_clusters = []
+ for cluster_name in query_clusters:
+ if is_glob_pattern(cluster_name):
+ glob_query_clusters.append(cluster_name)
+ else:
+ non_glob_query_clusters.append(cluster_name)
+ not_found_clusters = [
+ query_cluster for query_cluster in non_glob_query_clusters
+ if query_cluster not in cluster_names
+ ]
+ not_found_clusters.extend([
+ query_cluster for query_cluster in glob_query_clusters
+ if not fnmatch.filter(cluster_names, query_cluster)
+ ])
+ return not_found_clusters
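To make the intent concrete, a sketch (not part of the diff) of how the new helpers behave; the example names are hypothetical:

    from sky.utils import ux_utils

    ux_utils.is_glob_pattern('train-*')     # True: contains a wildcard
    ux_utils.is_glob_pattern('my-cluster')  # False: plain name

    # Returns the query entries that match nothing: plain names absent from
    # cluster_names, plus glob patterns with no fnmatch hits.
    ux_utils.get_non_matched_query(
        query_clusters=['train-*', 'dev', 'prod-?'],
        cluster_names=['train-1', 'train-2', 'dev'])
    # -> ['prod-?']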
sky/utils/yaml_utils.py CHANGED
@@ -44,6 +44,15 @@ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
  return config


+ def read_yaml_str(yaml_str: str) -> Dict[str, Any]:
+ stream = io.StringIO(yaml_str)
+ parsed_yaml = safe_load(stream)
+ if not parsed_yaml:
+ # Empty dict
+ return {}
+ return parsed_yaml
+
+
  def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
  stream = io.StringIO(yaml_str)
  config = safe_load_all(stream)
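A brief sketch (not part of the diff) of the new single-document helper; the expected results are inferred from the hunk above:

    from sky.utils import yaml_utils

    yaml_utils.read_yaml_str('a: 1\nb: {c: 2}')  # -> {'a': 1, 'b': {'c': 2}}
    yaml_utils.read_yaml_str('')                 # -> {} (falsy parses become {})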
sky/volumes/client/sdk.py CHANGED
@@ -3,13 +3,16 @@ import json
  import typing
  from typing import Any, Dict, List

+ from sky import exceptions
  from sky import sky_logging
  from sky.adaptors import common as adaptors_common
  from sky.server import common as server_common
+ from sky.server import versions
  from sky.server.requests import payloads
  from sky.usage import usage_lib
  from sky.utils import annotations
  from sky.utils import context
+ from sky.utils import ux_utils
  from sky.volumes import volume as volume_lib

  if typing.TYPE_CHECKING:
@@ -71,12 +74,44 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
  config=volume.config,
  labels=volume.labels,
  )
- response = requests.post(f'{server_common.get_server_url()}/volumes/apply',
- json=json.loads(body.model_dump_json()),
- cookies=server_common.get_api_cookie_jar())
+ response = server_common.make_authenticated_request(
+ 'POST', '/volumes/apply', json=json.loads(body.model_dump_json()))
  return server_common.get_request_id(response)


+ @context.contextual
+ @usage_lib.entrypoint
+ @server_common.check_server_healthy_or_start
+ @annotations.client_api
+ @versions.minimal_api_version(20)
+ def validate(volume: volume_lib.Volume) -> None:
+ """Validates the volume.
+
+ All validation is done on the server side.
+
+ Args:
+ volume: The volume to validate.
+
+ Raises:
+ ValueError: If the volume is invalid.
+ """
+ body = payloads.VolumeValidateBody(
+ name=volume.name,
+ volume_type=volume.type,
+ infra=volume.infra,
+ resource_name=volume.resource_name,
+ size=volume.size,
+ config=volume.config,
+ labels=volume.labels,
+ )
+ response = server_common.make_authenticated_request(
+ 'POST', '/volumes/validate', json=json.loads(body.model_dump_json()))
+ if response.status_code == 400:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.deserialize_exception(
+ response.json().get('detail'))
+
+
  @context.contextual
  @usage_lib.entrypoint
  @server_common.check_server_healthy_or_start
@@ -87,8 +122,10 @@ def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
  Returns:
  The request ID of the list request.
  """
- response = requests.get(f'{server_common.get_server_url()}/volumes',
- cookies=server_common.get_api_cookie_jar())
+ response = server_common.make_authenticated_request(
+ 'GET',
+ '/volumes',
+ )
  return server_common.get_request_id(response)


@@ -106,7 +143,6 @@ def delete(names: List[str]) -> server_common.RequestId[None]:
  The request ID of the delete request.
  """
  body = payloads.VolumeDeleteBody(names=names)
- response = requests.post(f'{server_common.get_server_url()}/volumes/delete',
- json=json.loads(body.model_dump_json()),
- cookies=server_common.get_api_cookie_jar())
+ response = server_common.make_authenticated_request(
+ 'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
  return server_common.get_request_id(response)
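A hedged sketch (not part of the diff) of the new client-side validation flow. Volume.from_yaml_config, the config keys, and validate() all appear in hunks in this diff; the concrete field values below are hypothetical:

    from sky.volumes import volume as volume_lib
    from sky.volumes.client import sdk as volumes_sdk

    vol = volume_lib.Volume.from_yaml_config({
        'name': 'my-data',      # hypothetical example values
        'type': 'k8s-pvc',      # assumed PVC volume type string
        'infra': 'kubernetes',
        'size': '100Gi',
    })
    try:
        volumes_sdk.validate(vol)   # validation happens on the server side
    except ValueError as e:
        print(f'Volume rejected: {e}')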
sky/volumes/server/server.py CHANGED
@@ -3,12 +3,13 @@
  import fastapi

  from sky import clouds
+ from sky import exceptions
  from sky import sky_logging
  from sky.server.requests import executor
  from sky.server.requests import payloads
  from sky.server.requests import requests as requests_lib
  from sky.utils import registry
- from sky.utils import volume
+ from sky.utils import volume as volume_utils
  from sky.volumes.server import core

  logger = sky_logging.init_logger(__name__)
@@ -46,6 +47,31 @@ async def volume_delete(request: fastapi.Request,
  )


+ @router.post('/validate')
+ async def volume_validate(
+ _: fastapi.Request,
+ volume_validate_body: payloads.VolumeValidateBody) -> None:
+ """Validates a volume."""
+ # pylint: disable=import-outside-toplevel
+ from sky.volumes import volume as volume_lib
+
+ try:
+ volume_config = {
+ 'name': volume_validate_body.name,
+ 'type': volume_validate_body.volume_type,
+ 'infra': volume_validate_body.infra,
+ 'size': volume_validate_body.size,
+ 'labels': volume_validate_body.labels,
+ 'config': volume_validate_body.config,
+ 'resource_name': volume_validate_body.resource_name,
+ }
+ volume = volume_lib.Volume.from_yaml_config(volume_config)
+ volume.validate()
+ except Exception as e:
+ raise fastapi.HTTPException(status_code=400,
+ detail=exceptions.serialize_exception(e))
+
+
  @router.post('/apply')
  async def volume_apply(request: fastapi.Request,
  volume_apply_body: payloads.VolumeApplyBody) -> None:
@@ -55,7 +81,7 @@ async def volume_apply(request: fastapi.Request,
  volume_config = volume_apply_body.config

  supported_volume_types = [
- volume_type.value for volume_type in volume.VolumeType
+ volume_type.value for volume_type in volume_utils.VolumeType
  ]
  if volume_type not in supported_volume_types:
  raise fastapi.HTTPException(
@@ -64,24 +90,24 @@ async def volume_apply(request: fastapi.Request,
  if cloud is None:
  raise fastapi.HTTPException(status_code=400,
  detail=f'Invalid cloud: {volume_cloud}')
- if volume_type == volume.VolumeType.PVC.value:
+ if volume_type == volume_utils.VolumeType.PVC.value:
  if not cloud.is_same_cloud(clouds.Kubernetes()):
  raise fastapi.HTTPException(
  status_code=400,
  detail='PVC storage is only supported on Kubernetes')
  supported_access_modes = [
- access_mode.value for access_mode in volume.VolumeAccessMode
+ access_mode.value for access_mode in volume_utils.VolumeAccessMode
  ]
  if volume_config is None:
  volume_config = {}
  access_mode = volume_config.get('access_mode')
  if access_mode is None:
- volume_config[
- 'access_mode'] = volume.VolumeAccessMode.READ_WRITE_ONCE.value
+ volume_config['access_mode'] = (
+ volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value)
  elif access_mode not in supported_access_modes:
  raise fastapi.HTTPException(
  status_code=400, detail=f'Invalid access mode: {access_mode}')
- elif volume_type == volume.VolumeType.RUNPOD_NETWORK_VOLUME.value:
+ elif volume_type == volume_utils.VolumeType.RUNPOD_NETWORK_VOLUME.value:
  if not cloud.is_same_cloud(clouds.RunPod()):
  raise fastapi.HTTPException(
  status_code=400,
sky/volumes/volume.py CHANGED
@@ -115,9 +115,6 @@ class Volume:
  self.region = infra_info.region
  self.zone = infra_info.zone

- # Validate the volume config
- self._validate_config()
-
  def _adjust_config(self) -> None:
  """Adjust the volume config (e.g., parse size)."""
  if self.size is None:
@@ -132,8 +129,28 @@ class Volume:
  except ValueError as e:
  raise ValueError(f'Invalid size {self.size}: {e}') from e

- def _validate_config(self) -> None:
- """Validate the volume config."""
+ def validate(self, skip_cloud_compatibility: bool = False) -> None:
+ """Validates the volume."""
+ self.validate_name()
+ self.validate_size()
+ if not skip_cloud_compatibility:
+ self.validate_cloud_compatibility()
+ # Extra, type-specific validations
+ self._validate_config_extra()
+
+ def validate_name(self) -> None:
+ """Validates if the volume name is set."""
+ assert self.name is not None, 'Volume name must be set'
+
+ def validate_size(self) -> None:
+ """Validates that size is specified for new volumes."""
+ if not self.resource_name and not self.size:
+ raise ValueError('Size is required for new volumes. '
+ 'Please specify the size in the YAML file or '
+ 'use the --size flag.')
+
+ def validate_cloud_compatibility(self) -> None:
+ """Validates that the specified cloud is compatible with volume type."""
  cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
  volume_lib.VolumeType(self.type))
  if self.cloud:
@@ -150,25 +167,16 @@ class Volume:
  self.region, self.zone = cloud_obj.validate_region_zone(
  self.region, self.zone)

- # Name must be set by factory before validation.
- assert self.name is not None
  valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
  if not valid:
  raise ValueError(f'Invalid volume name: {err_msg}')

- if not self.resource_name and not self.size:
- raise ValueError('Size is required for new volumes. '
- 'Please specify the size in the YAML file or '
- 'use the --size flag.')
  if self.labels:
  for key, value in self.labels.items():
  valid, err_msg = cloud_obj.is_label_valid(key, value)
  if not valid:
  raise ValueError(f'{err_msg}')

- # Extra, type-specific validations
- self._validate_config_extra()
-

  # Hook methods for subclasses
  def _validate_config_extra(self) -> None:
  """Additional type-specific validation.