skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (111)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +194 -69
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +104 -53
  7. sky/client/sdk.py +13 -5
  8. sky/client/sdk_async.py +4 -2
  9. sky/clouds/kubernetes.py +2 -1
  10. sky/clouds/runpod.py +20 -7
  11. sky/core.py +7 -53
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-16ba1d7187d2e3b1.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/mounting_utils.py +19 -10
  36. sky/execution.py +4 -2
  37. sky/global_user_state.py +217 -36
  38. sky/jobs/client/sdk.py +10 -1
  39. sky/jobs/controller.py +7 -7
  40. sky/jobs/server/core.py +3 -3
  41. sky/jobs/server/server.py +15 -11
  42. sky/jobs/utils.py +1 -1
  43. sky/logs/agent.py +30 -3
  44. sky/logs/aws.py +9 -19
  45. sky/provision/__init__.py +2 -1
  46. sky/provision/aws/instance.py +2 -1
  47. sky/provision/azure/instance.py +2 -1
  48. sky/provision/cudo/instance.py +2 -2
  49. sky/provision/do/instance.py +2 -2
  50. sky/provision/docker_utils.py +41 -19
  51. sky/provision/fluidstack/instance.py +2 -2
  52. sky/provision/gcp/instance.py +2 -1
  53. sky/provision/hyperbolic/instance.py +2 -1
  54. sky/provision/instance_setup.py +1 -1
  55. sky/provision/kubernetes/instance.py +134 -8
  56. sky/provision/lambda_cloud/instance.py +2 -1
  57. sky/provision/nebius/instance.py +2 -1
  58. sky/provision/oci/instance.py +2 -1
  59. sky/provision/paperspace/instance.py +2 -2
  60. sky/provision/primeintellect/instance.py +2 -2
  61. sky/provision/provisioner.py +1 -0
  62. sky/provision/runpod/instance.py +2 -2
  63. sky/provision/scp/instance.py +2 -2
  64. sky/provision/seeweb/instance.py +2 -1
  65. sky/provision/vast/instance.py +2 -1
  66. sky/provision/vsphere/instance.py +6 -5
  67. sky/schemas/api/responses.py +2 -1
  68. sky/serve/autoscalers.py +2 -0
  69. sky/serve/client/impl.py +45 -19
  70. sky/serve/replica_managers.py +12 -5
  71. sky/serve/serve_utils.py +5 -7
  72. sky/serve/server/core.py +9 -6
  73. sky/serve/server/impl.py +78 -25
  74. sky/serve/server/server.py +4 -5
  75. sky/serve/service_spec.py +33 -0
  76. sky/server/constants.py +1 -1
  77. sky/server/daemons.py +2 -3
  78. sky/server/requests/executor.py +56 -6
  79. sky/server/requests/payloads.py +31 -8
  80. sky/server/requests/preconditions.py +2 -3
  81. sky/server/rest.py +2 -0
  82. sky/server/server.py +28 -19
  83. sky/server/stream_utils.py +34 -12
  84. sky/setup_files/dependencies.py +4 -1
  85. sky/setup_files/setup.py +44 -44
  86. sky/templates/kubernetes-ray.yml.j2 +16 -15
  87. sky/usage/usage_lib.py +3 -0
  88. sky/utils/cli_utils/status_utils.py +4 -5
  89. sky/utils/context.py +104 -29
  90. sky/utils/controller_utils.py +7 -6
  91. sky/utils/kubernetes/create_cluster.sh +13 -28
  92. sky/utils/kubernetes/delete_cluster.sh +10 -7
  93. sky/utils/kubernetes/generate_kind_config.py +6 -66
  94. sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
  95. sky/utils/kubernetes_enums.py +5 -0
  96. sky/utils/ux_utils.py +35 -1
  97. sky/utils/yaml_utils.py +9 -0
  98. sky/volumes/client/sdk.py +44 -8
  99. sky/volumes/server/server.py +33 -7
  100. sky/volumes/volume.py +22 -14
  101. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +40 -35
  102. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +107 -107
  103. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  107. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
  108. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
  109. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
  110. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
  111. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py CHANGED
@@ -620,15 +620,16 @@ def get_controller_resources(
     controller_resources_to_use: resources.Resources = list(
         controller_resources)[0]
 
-    controller_record = global_user_state.get_cluster_from_name(
+    controller_handle = global_user_state.get_handle_from_cluster_name(
         controller.value.cluster_name)
-    if controller_record is not None:
-        handle = controller_record.get('handle', None)
-        if handle is not None:
+    if controller_handle is not None:
+        if controller_handle is not None:
             # Use the existing resources, but override the autostop config with
             # the one currently specified in the config.
-            controller_resources_to_use = handle.launched_resources.copy(
-                autostop=controller_resources_config_copied.get('autostop'))
+            controller_resources_to_use = (
+                controller_handle.launched_resources.copy(
+                    autostop=controller_resources_config_copied.get('autostop'))
+            )
 
     # If the controller and replicas are from the same cloud (and region/zone),
     # it should provide better connectivity. We will let the controller choose
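
The hunk above swaps a full cluster-record lookup for a direct handle lookup. A minimal sketch of the difference, using only the global_user_state calls visible in this diff; the wrapper function and its arguments are illustrative, not part of the package:

# Sketch, assuming the global_user_state API shown in the diff above.
from sky import global_user_state

def controller_autostop_resources(cluster_name, autostop_config):
    # Before: fetch the whole cluster record, then pull the handle out of it:
    #   record = global_user_state.get_cluster_from_name(cluster_name)
    #   handle = record.get('handle', None) if record is not None else None
    # After: fetch the handle directly, skipping the record round-trip.
    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
    if handle is None:
        return None
    # Reuse the launched resources, overriding only the autostop config.
    return handle.launched_resources.copy(autostop=autostop_config)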
sky/utils/kubernetes/create_cluster.sh CHANGED
@@ -1,22 +1,19 @@
 #!/bin/bash
 # Creates a local Kubernetes cluster using kind with optional GPU support
-# Usage: ./create_cluster.sh [--gpus]
-# Invokes generate_kind_config.py to generate a kind-cluster.yaml with NodePort mappings
+# Usage: ./create_cluster.sh [name] [yaml_path] [--gpus]
 set -e
 
 # Images
 IMAGE="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest"
 IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"
 
-# Limit port range to speed up kind cluster creation
-PORT_RANGE_START=30000
-PORT_RANGE_END=30100
-
-USER_HASH=$1
+# Arguments
+NAME=$1
+YAML_PATH=$2
 
 # Check for GPU flag
 ENABLE_GPUS=false
-if [[ "$2" == "--gpus" ]]; then
+if [[ "$3" == "--gpus" ]]; then
   ENABLE_GPUS=true
 fi
 
@@ -82,28 +79,16 @@ fi
 # ====== End of dependency checks =======
 
 # Check if the local cluster already exists
-if kind get clusters | grep -q skypilot; then
-  echo "Local cluster already exists. Exiting."
+if kind get clusters | grep -q $NAME; then
+  echo "Local cluster $NAME already exists. Exiting."
   # Switch context to the local cluster
-  kind export kubeconfig --name skypilot
-  kubectl config use-context kind-skypilot
+  kind export kubeconfig --name $NAME
+  kubectl config use-context kind-$NAME
   exit 100
 fi
 
-# Generate cluster YAML
-YAML_PATH="/tmp/skypilot-kind-$USER_HASH.yaml"
-echo "Generating $YAML_PATH"
-
-# Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
-if $ENABLE_GPUS; then
-  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
-else
-  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
-fi
-
-kind create cluster --config $YAML_PATH --name skypilot
-
-echo "Kind cluster created."
+kind create cluster --config $YAML_PATH --name $NAME
+echo "Kind cluster $NAME created."
 
 # Function to wait for GPU operator to be correctly installed
 wait_for_gpu_operator_installation() {
@@ -157,7 +142,7 @@ if $ENABLE_GPUS; then
   echo "Enabling GPU support..."
   # Run patch for missing ldconfig.real
   # https://github.com/NVIDIA/nvidia-docker/issues/614#issuecomment-423991632
-  docker exec -ti skypilot-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
+  docker exec -ti $NAME-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
 
   echo "Installing NVIDIA GPU operator..."
   # Install the NVIDIA GPU operator
@@ -185,4 +170,4 @@ if $ENABLE_GPUS; then
     echo "GPU support is enabled. Run 'sky show-gpus --cloud kubernetes' to see the GPUs available on the cluster."
   fi
 fi
-echo "Number of CPUs available on the local cluster: $NUM_CPUS"
+echo "Number of CPUs available on the local cluster $NAME: $NUM_CPUS"
sky/utils/kubernetes/delete_cluster.sh CHANGED
@@ -1,9 +1,12 @@
 #!/bin/bash
-# Deletes the local kind cluster
-# Usage: ./delete_cluster.sh
-# Raises error code 100 if the local cluster does not exist
+# Deletes the local kind cluster of [name]
+# Usage: ./delete_cluster.sh [name]
+# Raises error code 100 if the specified local cluster does not exist
 
 set -e
+
+NAME="${1:-skypilot}"
+
 # Check if docker is running
 if ! docker info > /dev/null 2>&1; then
   >&2 echo "Docker is not running. Please start Docker and try again."
@@ -17,13 +20,13 @@ if ! kind version > /dev/null 2>&1; then
 fi
 
 # Check if the local cluster exists
-if ! kind get clusters | grep -q skypilot; then
-  echo "Local cluster does not exist. Exiting."
+if ! kind get clusters | grep -q $NAME; then
+  echo "Local cluster $NAME does not exist. Exiting."
   exit 100
 fi
 
-kind delete cluster --name skypilot
-echo "Local cluster deleted!"
+kind delete cluster --name $NAME
+echo "Local cluster $NAME deleted!"
 
 # Switch to the first available context
 AVAILABLE_CONTEXT=$(kubectl config get-contexts -o name | head -n 1)
sky/utils/kubernetes/generate_kind_config.py CHANGED
@@ -3,67 +3,8 @@
 Maps specified ports from host to cluster container.
 """
 import argparse
-import textwrap
-
-
-def generate_kind_config(path: str,
-                         port_start: int = 30000,
-                         port_end: int = 32768,
-                         num_nodes: int = 1,
-                         gpus: bool = False) -> None:
-    """Generate a kind cluster config with ports mapped from host to container
-
-    Args:
-        path: Path to generate the config file at
-        port_start: Port range start
-        port_end: Port range end
-        num_nodes: Number of nodes in the cluster
-        gpus: If true, initialize kind cluster with GPU support
-    """
-
-    preamble = textwrap.dedent(f"""
-    apiVersion: kind.x-k8s.io/v1alpha4
-    kind: Cluster
-    kubeadmConfigPatches:
-    - |
-      kind: ClusterConfiguration
-      apiServer:
-        extraArgs:
-          "service-node-port-range": {port_start}-{port_end}
-    nodes:
-    - role: control-plane
-      kubeadmConfigPatches:
-      - |
-        kind: InitConfiguration
-        nodeRegistration:
-          kubeletExtraArgs:
-            node-labels: "ingress-ready=true"
-    """)
-    if gpus:
-        preamble += textwrap.indent(
-            textwrap.dedent("""
-            extraMounts:
-              - hostPath: /dev/null
-                containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
-    preamble += textwrap.indent(
-        textwrap.dedent("""
-        extraPortMappings:"""), ' ' * 2)
-    suffix = ''
-    if num_nodes > 1:
-        for _ in range(1, num_nodes):
-            suffix += """- role: worker\n"""
-    with open(path, 'w', encoding='utf-8') as f:
-        f.write(preamble)
-        for port in range(port_start, port_end + 1):
-            f.write(f"""
-  - containerPort: {port}
-    hostPort: {port}
-    listenAddress: "0.0.0.0"
-    protocol: tcp""")
-        f.write('\n')
-        if suffix:
-            f.write(suffix)
 
+from sky.utils.kubernetes import kubernetes_deploy_utils
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Generate a kind cluster '
@@ -77,10 +18,6 @@ if __name__ == '__main__':
                         type=int,
                         default=30000,
                         help='Port range start')
-    parser.add_argument('--port-end',
-                        type=int,
-                        default=32768,
-                        help='Port range end')
     parser.add_argument('--num-nodes',
                         type=int,
                         default=1,
@@ -90,5 +27,8 @@ if __name__ == '__main__':
                         action='store_true',
                         help='Initialize kind cluster with GPU support')
     args = parser.parse_args()
-    generate_kind_config(args.path, args.port_start, args.port_end,
-                         args.num_nodes, args.gpus)
+
+    with open(args.path, 'w', encoding='utf-8') as f:
+        f.write(
+            kubernetes_deploy_utils.generate_kind_config(
+                args.port_start, args.num_nodes, args.gpus))
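
generate_kind_config.py is now a thin CLI wrapper: config generation lives in kubernetes_deploy_utils.generate_kind_config, which returns the YAML as a string, and --port-end is gone because the range size is fixed. A minimal sketch of the same flow called directly; the output path is illustrative:

# Equivalent of the new __main__ path, assuming the refactored helper above.
from sky.utils.kubernetes import kubernetes_deploy_utils

config = kubernetes_deploy_utils.generate_kind_config(
    port_start=30000,  # host ports; the internal NodePort range stays fixed
    num_nodes=1,
    gpus=False)
with open('/tmp/kind.yaml', 'w', encoding='utf-8') as f:  # illustrative path
    f.write(config)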
sky/utils/kubernetes/kubernetes_deploy_utils.py CHANGED
@@ -4,6 +4,7 @@ import shlex
 import subprocess
 import sys
 import tempfile
+import textwrap
 from typing import List, Optional
 
 import colorama
@@ -24,6 +25,9 @@ logger = sky_logging.init_logger(__name__)
 
 # Default path for Kubernetes configuration file
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
+LOCAL_CLUSTER_PORT_RANGE = 101
+LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
 
 
 def check_ssh_cluster_dependencies(
@@ -252,7 +256,68 @@ def deploy_remote_cluster(ip_list: List[str],
                               is_local=True))
 
 
-def deploy_local_cluster(gpus: bool):
+def generate_kind_config(port_start: int,
+                         num_nodes: int = 1,
+                         gpus: bool = False) -> str:
+    """Generate a kind cluster config with ports mapped from host to container
+
+    Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
+    Internally, this will map to ports 30000 - 30100
+
+    Args:
+        path: Path to generate the config file at
+        port_start: Port range start for mappings
+        num_nodes: Number of nodes in the cluster
+        gpus: If true, initialize kind cluster with GPU support
+
+    Returns:
+        The kind cluster config
+    """
+    internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+    internal_end = internal_start + LOCAL_CLUSTER_PORT_RANGE - 1
+
+    config = textwrap.dedent(f"""
+    apiVersion: kind.x-k8s.io/v1alpha4
+    kind: Cluster
+    kubeadmConfigPatches:
+    - |
+      kind: ClusterConfiguration
+      apiServer:
+        extraArgs:
+          "service-node-port-range": {internal_start}-{internal_end}
+    nodes:
+    - role: control-plane
+      kubeadmConfigPatches:
+      - |
+        kind: InitConfiguration
+        nodeRegistration:
+          kubeletExtraArgs:
+            node-labels: "ingress-ready=true"
+    """)
+    if gpus:
+        config += textwrap.indent(
+            textwrap.dedent("""
+            extraMounts:
+              - hostPath: /dev/null
+                containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
+    config += textwrap.indent(textwrap.dedent("""
+        extraPortMappings:"""), ' ' * 2)
+    for offset in range(LOCAL_CLUSTER_PORT_RANGE):
+        config += textwrap.indent(
+            textwrap.dedent(f"""
+            - containerPort: {internal_start + offset}
+              hostPort: {port_start + offset}
+              listenAddress: "0.0.0.0"
+              protocol: tcp
+            """), ' ' * 2)
+    if num_nodes > 1:
+        config += '- role: worker\n' * (num_nodes - 1)
+    return config
+
+
+def deploy_local_cluster(name: Optional[str], gpus: bool):
+    name = name or DEFAULT_LOCAL_CLUSTER_NAME
+    context_name = f'kind-{name}'
     cluster_created = False
 
     # Check if GPUs are available on the host
@@ -262,41 +327,57 @@ def deploy_local_cluster(gpus: bool):
     # Check if ~/.kube/config exists:
     if os.path.exists(os.path.expanduser('~/.kube/config')):
         curr_context = kubernetes_utils.get_current_kube_config_context_name()
-        skypilot_context = 'kind-skypilot'
-        if curr_context is not None and curr_context != skypilot_context:
+        if curr_context is not None and curr_context != context_name:
             logger.info(
                 f'Current context in kube config: {curr_context}'
-                '\nWill automatically switch to kind-skypilot after the local '
-                'cluster is created.')
-    message_str = 'Creating local cluster{}...'
-    message_str = message_str.format((' with GPU support (this may take up '
-                                      'to 15 minutes)') if gpus else '')
-    path_to_package = os.path.dirname(__file__)
-    up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
-
-    # Get directory of script and run it from there
-    cwd = os.path.dirname(os.path.abspath(up_script_path))
-    run_command = up_script_path + ' --gpus' if gpus else up_script_path
-    run_command = shlex.split(run_command)
+                f'\nWill automatically switch to {context_name} after the '
+                'local cluster is created.')
+    message_str = 'Creating local cluster {}{}...'
+    message_str = message_str.format(
+        name,
+        ' with GPU support (this may take up to 15 minutes)' if gpus else '')
+
+    with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
+                                     delete=True) as f:
+        # Choose random port range to use on the host machine.
+        # Port range is port_start - port_start + 99 (exactly 100 ports).
+        # port_start = random.randint(300, 399) * 100
+        # TODO (kyuds): hard coding to pass smoketests. Need to figure out
+        # how to deal with this later.
+        port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+        port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
+        logger.debug(f'Using port range {port_start}-{port_end}')
+        f.write(generate_kind_config(port_start, gpus=gpus))
+        f.flush()
+
+        path_to_package = os.path.dirname(__file__)
+        up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
+
+        # Get directory of script and run it from there
+        cwd = os.path.dirname(os.path.abspath(up_script_path))
+        run_command = f'{up_script_path} {name} {f.name}'
+        if gpus:
+            run_command += ' --gpus'
+        run_command = shlex.split(run_command)
 
-    # Setup logging paths
-    run_timestamp = sky_logging.get_run_timestamp()
-    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
-                            'local_up.log')
-    logger.info(message_str)
+        # Setup logging paths
+        run_timestamp = sky_logging.get_run_timestamp()
+        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                                'local_up.log')
+        logger.info(message_str)
 
-    with rich_utils.safe_status(
-            ux_utils.spinner_message(message_str,
-                                     log_path=log_path,
-                                     is_local=True)):
-        returncode, _, stderr = log_lib.run_with_log(
-            cmd=run_command,
-            log_path=log_path,
-            require_outputs=True,
-            stream_logs=False,
-            line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
-                                                             is_local=True),
-            cwd=cwd)
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(message_str,
+                                         log_path=log_path,
+                                         is_local=True)):
+            returncode, _, stderr = log_lib.run_with_log(
+                cmd=run_command,
+                log_path=log_path,
+                require_outputs=True,
+                stream_logs=False,
+                line_processor=log_utils.SkyLocalUpLineProcessor(
+                    log_path=log_path, is_local=True),
+                cwd=cwd)
 
     # Kind always writes to stderr even if it succeeds.
     # If the failure happens after the cluster is created, we need
@@ -309,11 +390,11 @@ def deploy_local_cluster(gpus: bool):
     elif returncode == 100:
         logger.info(
             ux_utils.finishing_message(
-                'Local cluster already exists.\n',
+                f'Local cluster {name} already exists.\n',
                 log_path=log_path,
                 is_local=True,
                 follow_up_message=
-                'If you want to delete it instead, run: sky local down'))
+                'If you want to delete it instead, run: `sky local down --name {name}`'))  # pylint: disable=line-too-long
     else:
         with ux_utils.print_exception_no_traceback():
             log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -339,7 +420,7 @@ def deploy_local_cluster(gpus: bool):
     if gpus:
         # Get GPU model by querying the node labels
         label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
-        gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
+        gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
         try:
             # Run the command and capture the output
             gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -375,8 +456,9 @@ def deploy_local_cluster(gpus: bool):
                     'This may cause issues with running tasks.')
     logger.info(
         ux_utils.finishing_message(
-            message=(f'Local Kubernetes cluster created successfully with '
-                     f'{num_cpus} CPUs{gpu_message}.'),
+            message=(
+                f'Local Kubernetes cluster {name} created successfully '
+                f'with {num_cpus} CPUs{gpu_message}.'),
             log_path=log_path,
             is_local=True,
             follow_up_message=(
@@ -384,3 +466,54 @@ def deploy_local_cluster(gpus: bool):
                 'Hint: To change the number of CPUs, change your docker '
                 'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
                 f'{gpu_hint}')))
+
+
+def teardown_local_cluster(name: Optional[str] = None):
+    name = name or DEFAULT_LOCAL_CLUSTER_NAME
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = f'{down_script_path} {name}'
+    run_command = shlex.split(run_command)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(f'Removing local cluster {name}',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+    stderr = stderr.replace('No kind clusters found.\n', '')
+
+    if returncode == 0:
+        cluster_removed = True
+    elif returncode == 100:
+        logger.info(
+            ux_utils.error_message(f'Local cluster {name} does not exist.'))
+    else:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to down local cluster {name}. '
+                               f'Stdout: {stdout}'
+                               f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
+                                       clouds=['kubernetes'],
+                                       quiet=True)
+        logger.info(
+            ux_utils.finishing_message(f'Local cluster {name} removed.',
+                                       log_path=log_path,
+                                       is_local=True))
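
Putting the kubernetes_deploy_utils.py changes together: the kind config is now generated in Python, written to a temporary file, and the cluster name plus file path are handed to create_cluster.sh. A condensed sketch under those assumptions, with the logging and spinner machinery replaced by a bare subprocess call; the script path is illustrative:

# Condensed sketch of the new create flow shown in this diff.
import shlex
import subprocess
import tempfile

from sky.utils.kubernetes import kubernetes_deploy_utils as kdu

name = 'skypilot'
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml', delete=True) as f:
    # Generate the kind config in Python and persist it for the script.
    f.write(kdu.generate_kind_config(kdu.LOCAL_CLUSTER_INTERNAL_PORT_START))
    f.flush()  # ensure the script sees the full config
    subprocess.run(shlex.split(f'./create_cluster.sh {name} {f.name}'),
                   check=False)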
sky/utils/kubernetes_enums.py CHANGED
@@ -44,3 +44,8 @@ class KubernetesAutoscalerType(enum.Enum):
     KARPENTER = 'karpenter'
     COREWEAVE = 'coreweave'
     GENERIC = 'generic'
+
+    def emits_autoscale_event(self) -> bool:
+        """Returns whether specific autoscaler emits the event reason
+        TriggeredScaleUp."""
+        return self not in {self.KARPENTER}
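
A hypothetical call site for the new enum helper, to show the intent: autoscalers such as Karpenter never emit the TriggeredScaleUp event reason, so provisioning code should not block waiting for one. The print statements stand in for real provisioning logic:

from sky.utils import kubernetes_enums

autoscaler = kubernetes_enums.KubernetesAutoscalerType.KARPENTER
if autoscaler.emits_autoscale_event():
    print('Wait for a TriggeredScaleUp event before timing out.')
else:
    print('No TriggeredScaleUp events; fall back to polling pod status.')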
sky/utils/ux_utils.py CHANGED
@@ -1,11 +1,12 @@
 """Utility functions for UX."""
 import contextlib
 import enum
+import fnmatch
 import os
 import sys
 import traceback
 import typing
-from typing import Callable, Optional, Union
+from typing import Callable, Iterable, List, Optional, Union
 
 import colorama
 
@@ -288,3 +289,36 @@ def command_hint_messages(hint_type: CommandHintType,
                 f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')
+
+
+def is_glob_pattern(pattern: str) -> bool:
+    """Checks if a string contains common glob pattern wildcards."""
+    glob_chars = {'*', '?', '[', ']'}
+    # Also check for '**' as a specific globstar pattern
+    if '**' in pattern:
+        return True
+    for char in pattern:
+        if char in glob_chars:
+            return True
+    return False
+
+
+def get_non_matched_query(query_clusters: Iterable[str],
+                          cluster_names: Iterable[str]) -> List[str]:
+    """Gets the non-matched query clusters."""
+    glob_query_clusters = []
+    non_glob_query_clusters = []
+    for cluster_name in query_clusters:
+        if is_glob_pattern(cluster_name):
+            glob_query_clusters.append(cluster_name)
+        else:
+            non_glob_query_clusters.append(cluster_name)
+    not_found_clusters = [
+        query_cluster for query_cluster in non_glob_query_clusters
+        if query_cluster not in cluster_names
+    ]
+    not_found_clusters.extend([
+        query_cluster for query_cluster in glob_query_clusters
+        if not fnmatch.filter(cluster_names, query_cluster)
+    ])
+    return not_found_clusters
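
A small usage sketch of the two new helpers; the cluster and query names are made up. Non-glob queries are reported as not found by exact membership, while glob queries are reported only if fnmatch.filter matches nothing:

from sky.utils import ux_utils

clusters = ['dev-1', 'dev-2', 'train-gpu']
queries = ['dev-*', 'prod-?', 'train-gpu', 'missing']
# 'dev-*' matches two clusters; 'train-gpu' is an exact hit; 'missing' is a
# non-glob miss and 'prod-?' is a glob that matches nothing.
print(ux_utils.get_non_matched_query(queries, clusters))
# -> ['missing', 'prod-?']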
sky/utils/yaml_utils.py CHANGED
@@ -44,6 +44,15 @@ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
     return config
 
 
+def read_yaml_str(yaml_str: str) -> Dict[str, Any]:
+    stream = io.StringIO(yaml_str)
+    parsed_yaml = safe_load(stream)
+    if not parsed_yaml:
+        # Empty dict
+        return {}
+    return parsed_yaml
+
+
 def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
     stream = io.StringIO(yaml_str)
     config = safe_load_all(stream)
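
A quick check of the new read_yaml_str helper, which parses a single YAML document from a string and normalizes empty input to an empty dict:

from sky.utils import yaml_utils

print(yaml_utils.read_yaml_str('name: demo\nsize: 100Gi'))
# -> {'name': 'demo', 'size': '100Gi'}
print(yaml_utils.read_yaml_str(''))
# -> {}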
sky/volumes/client/sdk.py CHANGED
@@ -3,13 +3,16 @@ import json
 import typing
 from typing import Any, Dict, List
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.server import common as server_common
+from sky.server import versions
 from sky.server.requests import payloads
 from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import context
+from sky.utils import ux_utils
 from sky.volumes import volume as volume_lib
 
 if typing.TYPE_CHECKING:
@@ -71,12 +74,44 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
         config=volume.config,
         labels=volume.labels,
     )
-    response = requests.post(f'{server_common.get_server_url()}/volumes/apply',
-                             json=json.loads(body.model_dump_json()),
-                             cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/apply', json=json.loads(body.model_dump_json()))
     return server_common.get_request_id(response)
 
 
+@context.contextual
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@annotations.client_api
+@versions.minimal_api_version(20)
+def validate(volume: volume_lib.Volume) -> None:
+    """Validates the volume.
+
+    All validation is done on the server side.
+
+    Args:
+        volume: The volume to validate.
+
+    Raises:
+        ValueError: If the volume is invalid.
+    """
+    body = payloads.VolumeValidateBody(
+        name=volume.name,
+        volume_type=volume.type,
+        infra=volume.infra,
+        resource_name=volume.resource_name,
+        size=volume.size,
+        config=volume.config,
+        labels=volume.labels,
+    )
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/validate', json=json.loads(body.model_dump_json()))
+    if response.status_code == 400:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.deserialize_exception(
+                response.json().get('detail'))
+
+
 @context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
@@ -87,8 +122,10 @@ def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
     Returns:
         The request ID of the list request.
     """
-    response = requests.get(f'{server_common.get_server_url()}/volumes',
-                            cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'GET',
+        '/volumes',
+    )
     return server_common.get_request_id(response)
 
 
@@ -106,7 +143,6 @@ def delete(names: List[str]) -> server_common.RequestId[None]:
         The request ID of the delete request.
     """
     body = payloads.VolumeDeleteBody(names=names)
-    response = requests.post(f'{server_common.get_server_url()}/volumes/delete',
-                             json=json.loads(body.model_dump_json()),
-                             cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
     return server_common.get_request_id(response)
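
A sketch of how a client might call the new validate endpoint; the Volume constructor arguments below are illustrative (they mirror the VolumeValidateBody fields, not a documented recipe). On an HTTP 400, the server's serialized exception is re-raised client-side:

from sky.volumes import volume as volume_lib
from sky.volumes.client import sdk as volumes_sdk

# Hypothetical volume spec; field names follow the payload in this diff.
vol = volume_lib.Volume(name='data', type='k8s-pvc',
                        infra='kubernetes', size='100Gi')
try:
    volumes_sdk.validate(vol)  # raises (e.g. ValueError) if invalid
    print('Volume spec is valid.')
except ValueError as e:
    print(f'Invalid volume: {e}')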