skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (123) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,19 @@
1
1
  #!/bin/bash
2
2
  # Creates a local Kubernetes cluster using kind with optional GPU support
3
- # Usage: ./create_cluster.sh [--gpus]
4
- # Invokes generate_kind_config.py to generate a kind-cluster.yaml with NodePort mappings
3
+ # Usage: ./create_cluster.sh [name] [yaml_path] [--gpus]
5
4
  set -e
6
5
 
7
6
  # Images
8
7
  IMAGE="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest"
9
8
  IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"
10
9
 
11
- # Limit port range to speed up kind cluster creation
12
- PORT_RANGE_START=30000
13
- PORT_RANGE_END=30100
14
-
15
- USER_HASH=$1
10
+ # Arguments
11
+ NAME=$1
12
+ YAML_PATH=$2
16
13
 
17
14
  # Check for GPU flag
18
15
  ENABLE_GPUS=false
19
- if [[ "$2" == "--gpus" ]]; then
16
+ if [[ "$3" == "--gpus" ]]; then
20
17
  ENABLE_GPUS=true
21
18
  fi
22
19
 
@@ -82,28 +79,16 @@ fi
82
79
  # ====== End of dependency checks =======
83
80
 
84
81
  # Check if the local cluster already exists
85
- if kind get clusters | grep -q skypilot; then
86
- echo "Local cluster already exists. Exiting."
82
+ if kind get clusters | grep -q $NAME; then
83
+ echo "Local cluster $NAME already exists. Exiting."
87
84
  # Switch context to the local cluster
88
- kind export kubeconfig --name skypilot
89
- kubectl config use-context kind-skypilot
85
+ kind export kubeconfig --name $NAME
86
+ kubectl config use-context kind-$NAME
90
87
  exit 100
91
88
  fi
92
89
 
93
- # Generate cluster YAML
94
- YAML_PATH="/tmp/skypilot-kind-$USER_HASH.yaml"
95
- echo "Generating $YAML_PATH"
96
-
97
- # Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
98
- if $ENABLE_GPUS; then
99
- python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
100
- else
101
- python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
102
- fi
103
-
104
- kind create cluster --config $YAML_PATH --name skypilot
105
-
106
- echo "Kind cluster created."
90
+ kind create cluster --config $YAML_PATH --name $NAME
91
+ echo "Kind cluster $NAME created."
107
92
 
108
93
  # Function to wait for GPU operator to be correctly installed
109
94
  wait_for_gpu_operator_installation() {
@@ -157,7 +142,7 @@ if $ENABLE_GPUS; then
157
142
  echo "Enabling GPU support..."
158
143
  # Run patch for missing ldconfig.real
159
144
  # https://github.com/NVIDIA/nvidia-docker/issues/614#issuecomment-423991632
160
- docker exec -ti skypilot-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
145
+ docker exec -ti $NAME-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
161
146
 
162
147
  echo "Installing NVIDIA GPU operator..."
163
148
  # Install the NVIDIA GPU operator
@@ -185,4 +170,4 @@ if $ENABLE_GPUS; then
185
170
  echo "GPU support is enabled. Run 'sky show-gpus --cloud kubernetes' to see the GPUs available on the cluster."
186
171
  fi
187
172
  fi
188
- echo "Number of CPUs available on the local cluster: $NUM_CPUS"
173
+ echo "Number of CPUs available on the local cluster $NAME: $NUM_CPUS"
@@ -1,9 +1,12 @@
1
1
  #!/bin/bash
2
- # Deletes the local kind cluster
3
- # Usage: ./delete_cluster.sh
4
- # Raises error code 100 if the local cluster does not exist
2
+ # Deletes the local kind cluster of [name]
3
+ # Usage: ./delete_cluster.sh [name]
4
+ # Raises error code 100 if the specified local cluster does not exist
5
5
 
6
6
  set -e
7
+
8
+ NAME="${1:-skypilot}"
9
+
7
10
  # Check if docker is running
8
11
  if ! docker info > /dev/null 2>&1; then
9
12
  >&2 echo "Docker is not running. Please start Docker and try again."
@@ -17,13 +20,13 @@ if ! kind version > /dev/null 2>&1; then
17
20
  fi
18
21
 
19
22
  # Check if the local cluster exists
20
- if ! kind get clusters | grep -q skypilot; then
21
- echo "Local cluster does not exist. Exiting."
23
+ if ! kind get clusters | grep -q $NAME; then
24
+ echo "Local cluster $NAME does not exist. Exiting."
22
25
  exit 100
23
26
  fi
24
27
 
25
- kind delete cluster --name skypilot
26
- echo "Local cluster deleted!"
28
+ kind delete cluster --name $NAME
29
+ echo "Local cluster $NAME deleted!"
27
30
 
28
31
  # Switch to the first available context
29
32
  AVAILABLE_CONTEXT=$(kubectl config get-contexts -o name | head -n 1)
@@ -3,67 +3,8 @@
3
3
  Maps specified ports from host to cluster container.
4
4
  """
5
5
  import argparse
6
- import textwrap
7
-
8
-
9
- def generate_kind_config(path: str,
10
- port_start: int = 30000,
11
- port_end: int = 32768,
12
- num_nodes: int = 1,
13
- gpus: bool = False) -> None:
14
- """Generate a kind cluster config with ports mapped from host to container
15
-
16
- Args:
17
- path: Path to generate the config file at
18
- port_start: Port range start
19
- port_end: Port range end
20
- num_nodes: Number of nodes in the cluster
21
- gpus: If true, initialize kind cluster with GPU support
22
- """
23
-
24
- preamble = textwrap.dedent(f"""
25
- apiVersion: kind.x-k8s.io/v1alpha4
26
- kind: Cluster
27
- kubeadmConfigPatches:
28
- - |
29
- kind: ClusterConfiguration
30
- apiServer:
31
- extraArgs:
32
- "service-node-port-range": {port_start}-{port_end}
33
- nodes:
34
- - role: control-plane
35
- kubeadmConfigPatches:
36
- - |
37
- kind: InitConfiguration
38
- nodeRegistration:
39
- kubeletExtraArgs:
40
- node-labels: "ingress-ready=true"
41
- """)
42
- if gpus:
43
- preamble += textwrap.indent(
44
- textwrap.dedent("""
45
- extraMounts:
46
- - hostPath: /dev/null
47
- containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
48
- preamble += textwrap.indent(
49
- textwrap.dedent("""
50
- extraPortMappings:"""), ' ' * 2)
51
- suffix = ''
52
- if num_nodes > 1:
53
- for _ in range(1, num_nodes):
54
- suffix += """- role: worker\n"""
55
- with open(path, 'w', encoding='utf-8') as f:
56
- f.write(preamble)
57
- for port in range(port_start, port_end + 1):
58
- f.write(f"""
59
- - containerPort: {port}
60
- hostPort: {port}
61
- listenAddress: "0.0.0.0"
62
- protocol: tcp""")
63
- f.write('\n')
64
- if suffix:
65
- f.write(suffix)
66
6
 
7
+ from sky.utils.kubernetes import kubernetes_deploy_utils
67
8
 
68
9
  if __name__ == '__main__':
69
10
  parser = argparse.ArgumentParser(description='Generate a kind cluster '
@@ -77,10 +18,6 @@ if __name__ == '__main__':
77
18
  type=int,
78
19
  default=30000,
79
20
  help='Port range start')
80
- parser.add_argument('--port-end',
81
- type=int,
82
- default=32768,
83
- help='Port range end')
84
21
  parser.add_argument('--num-nodes',
85
22
  type=int,
86
23
  default=1,
@@ -90,5 +27,8 @@ if __name__ == '__main__':
90
27
  action='store_true',
91
28
  help='Initialize kind cluster with GPU support')
92
29
  args = parser.parse_args()
93
- generate_kind_config(args.path, args.port_start, args.port_end,
94
- args.num_nodes, args.gpus)
30
+
31
+ with open(args.path, 'w', encoding='utf-8') as f:
32
+ f.write(
33
+ kubernetes_deploy_utils.generate_kind_config(
34
+ args.port_start, args.num_nodes, args.gpus))
@@ -1,10 +1,12 @@
1
1
  """Utility functions for deploying Kubernetes clusters."""
2
2
  import os
3
+ import random
3
4
  import shlex
4
5
  import subprocess
5
6
  import sys
6
7
  import tempfile
7
- from typing import List, Optional
8
+ import textwrap
9
+ from typing import List, Optional, Tuple
8
10
 
9
11
  import colorama
10
12
 
@@ -24,6 +26,10 @@ logger = sky_logging.init_logger(__name__)
24
26
 
25
27
  # Default path for Kubernetes configuration file
26
28
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
29
+ DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
30
+ LOCAL_CLUSTER_PORT_RANGE = 100
31
+ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
32
+ LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
27
33
 
28
34
 
29
35
  def check_ssh_cluster_dependencies(
@@ -252,7 +258,93 @@ def deploy_remote_cluster(ip_list: List[str],
252
258
  is_local=True))
253
259
 
254
260
 
255
- def deploy_local_cluster(gpus: bool):
261
+ def generate_kind_config(port_start: int,
262
+ num_nodes: int = 1,
263
+ gpus: bool = False) -> str:
264
+ """Generate a kind cluster config with ports mapped from host to container
265
+
266
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
267
+ Internally, this will map to ports 30000 - 30099
268
+
269
+ Args:
270
+ path: Path to generate the config file at
271
+ port_start: Port range start for mappings
272
+ num_nodes: Number of nodes in the cluster
273
+ gpus: If true, initialize kind cluster with GPU support
274
+
275
+ Returns:
276
+ The kind cluster config
277
+ """
278
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
279
+ internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
280
+
281
+ config = textwrap.dedent(f"""
282
+ apiVersion: kind.x-k8s.io/v1alpha4
283
+ kind: Cluster
284
+ kubeadmConfigPatches:
285
+ - |
286
+ kind: ClusterConfiguration
287
+ apiServer:
288
+ extraArgs:
289
+ "service-node-port-range": {internal_start}-{internal_end}
290
+ nodes:
291
+ - role: control-plane
292
+ kubeadmConfigPatches:
293
+ - |
294
+ kind: InitConfiguration
295
+ nodeRegistration:
296
+ kubeletExtraArgs:
297
+ node-labels: "ingress-ready=true"
298
+ """)
299
+ if gpus:
300
+ config += textwrap.indent(
301
+ textwrap.dedent("""
302
+ extraMounts:
303
+ - hostPath: /dev/null
304
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
305
+ config += textwrap.indent(textwrap.dedent("""
306
+ extraPortMappings:"""), ' ' * 2)
307
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
308
+ config += textwrap.indent(
309
+ textwrap.dedent(f"""
310
+ - containerPort: {internal_start + offset}
311
+ hostPort: {port_start + offset}
312
+ listenAddress: "0.0.0.0"
313
+ protocol: tcp
314
+ """), ' ' * 2)
315
+ if num_nodes > 1:
316
+ config += '- role: worker\n' * (num_nodes - 1)
317
+ return config
318
+
319
+
320
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
    """Resolves and validates the host port range of a local cluster.

    When `port_start` is None, the default cluster gets
    LOCAL_CLUSTER_INTERNAL_PORT_START and any other cluster gets a random
    multiple of 100 in [30100, 39900].

    Args:
        name: Name of the local cluster.
        port_start: First host port to use, or None to pick one.

    Returns:
        A `(port_start, port_end)` tuple, both inclusive, spanning
        LOCAL_CLUSTER_PORT_RANGE ports.

    Raises:
        ValueError: If the default cluster is given a non-default range, a
            non-default cluster is given the reserved default range, the
            start is not a multiple of 100, or the range does not fit in
            the valid TCP port range.
    """
    is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
    if port_start is None:
        if is_default:
            port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
        else:
            port_start = random.randint(301, 399) * 100
    port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1

    port_range = f'Current port range: {port_start}-{port_end}'
    if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Default local cluster `skypilot` should have '
                         f'port range from 30000 to 30099. {port_range}.')
    if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Port range 30000 to 30099 is reserved for '
                         f'default local cluster `skypilot`. {port_range}.')
    if port_start % 100 != 0:
        raise ValueError('Local cluster port start must be a multiple of 100. '
                         f'{port_range}.')
    # A user-supplied start can push the range outside what TCP allows;
    # fail here with a clear message instead of later during cluster
    # creation.
    if port_start < 1 or port_end > 65535:
        raise ValueError('Local cluster host ports must be within 1 to '
                         f'65535. {port_range}.')

    return port_start, port_end
341
+
342
+
343
+ def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
344
+ gpus: bool):
345
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
346
+ port_start, port_end = _get_port_range(name, port_start)
347
+ context_name = f'kind-{name}'
256
348
  cluster_created = False
257
349
 
258
350
  # Check if GPUs are available on the host
@@ -262,41 +354,52 @@ def deploy_local_cluster(gpus: bool):
262
354
  # Check if ~/.kube/config exists:
263
355
  if os.path.exists(os.path.expanduser('~/.kube/config')):
264
356
  curr_context = kubernetes_utils.get_current_kube_config_context_name()
265
- skypilot_context = 'kind-skypilot'
266
- if curr_context is not None and curr_context != skypilot_context:
357
+ if curr_context is not None and curr_context != context_name:
267
358
  logger.info(
268
359
  f'Current context in kube config: {curr_context}'
269
- '\nWill automatically switch to kind-skypilot after the local '
270
- 'cluster is created.')
271
- message_str = 'Creating local cluster{}...'
272
- message_str = message_str.format((' with GPU support (this may take up '
273
- 'to 15 minutes)') if gpus else '')
274
- path_to_package = os.path.dirname(__file__)
275
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
276
-
277
- # Get directory of script and run it from there
278
- cwd = os.path.dirname(os.path.abspath(up_script_path))
279
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
280
- run_command = shlex.split(run_command)
360
+ f'\nWill automatically switch to {context_name} after the '
361
+ 'local cluster is created.')
362
+ message_str = 'Creating local cluster {}{}...'
363
+ message_str = message_str.format(
364
+ name,
365
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
366
+
367
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
368
+ delete=True) as f:
369
+ # Choose random port range to use on the host machine.
370
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
371
+ logger.debug(f'Using host port range {port_start}-{port_end}')
372
+ f.write(generate_kind_config(port_start, gpus=gpus))
373
+ f.flush()
374
+
375
+ path_to_package = os.path.dirname(__file__)
376
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
377
+
378
+ # Get directory of script and run it from there
379
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
380
+ run_command = f'{up_script_path} {name} {f.name}'
381
+ if gpus:
382
+ run_command += ' --gpus'
383
+ run_command = shlex.split(run_command)
281
384
 
282
- # Setup logging paths
283
- run_timestamp = sky_logging.get_run_timestamp()
284
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
285
- 'local_up.log')
286
- logger.info(message_str)
385
+ # Setup logging paths
386
+ run_timestamp = sky_logging.get_run_timestamp()
387
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
388
+ 'local_up.log')
389
+ logger.info(message_str)
287
390
 
288
- with rich_utils.safe_status(
289
- ux_utils.spinner_message(message_str,
290
- log_path=log_path,
291
- is_local=True)):
292
- returncode, _, stderr = log_lib.run_with_log(
293
- cmd=run_command,
294
- log_path=log_path,
295
- require_outputs=True,
296
- stream_logs=False,
297
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
298
- is_local=True),
299
- cwd=cwd)
391
+ with rich_utils.safe_status(
392
+ ux_utils.spinner_message(message_str,
393
+ log_path=log_path,
394
+ is_local=True)):
395
+ returncode, _, stderr = log_lib.run_with_log(
396
+ cmd=run_command,
397
+ log_path=log_path,
398
+ require_outputs=True,
399
+ stream_logs=False,
400
+ line_processor=log_utils.SkyLocalUpLineProcessor(
401
+ log_path=log_path, is_local=True),
402
+ cwd=cwd)
300
403
 
301
404
  # Kind always writes to stderr even if it succeeds.
302
405
  # If the failure happens after the cluster is created, we need
@@ -309,11 +412,11 @@ def deploy_local_cluster(gpus: bool):
309
412
  elif returncode == 100:
310
413
  logger.info(
311
414
  ux_utils.finishing_message(
312
- 'Local cluster already exists.\n',
415
+ f'Local cluster {name} already exists.\n',
313
416
  log_path=log_path,
314
417
  is_local=True,
315
418
  follow_up_message=
316
- 'If you want to delete it instead, run: sky local down'))
419
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
317
420
  else:
318
421
  with ux_utils.print_exception_no_traceback():
319
422
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -339,7 +442,7 @@ def deploy_local_cluster(gpus: bool):
339
442
  if gpus:
340
443
  # Get GPU model by querying the node labels
341
444
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
342
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
445
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
343
446
  try:
344
447
  # Run the command and capture the output
345
448
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -375,8 +478,10 @@ def deploy_local_cluster(gpus: bool):
375
478
  'This may cause issues with running tasks.')
376
479
  logger.info(
377
480
  ux_utils.finishing_message(
378
- message=(f'Local Kubernetes cluster created successfully with '
379
- f'{num_cpus} CPUs{gpu_message}.'),
481
+ message=(
482
+ f'Local Kubernetes cluster {name} created successfully '
483
+ f'with {num_cpus} CPUs{gpu_message} on host port range '
484
+ f'{port_start}-{port_end}.'),
380
485
  log_path=log_path,
381
486
  is_local=True,
382
487
  follow_up_message=(
@@ -384,3 +489,54 @@ def deploy_local_cluster(gpus: bool):
384
489
  'Hint: To change the number of CPUs, change your docker '
385
490
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
386
491
  f'{gpu_hint}')))
492
+
493
+
494
def teardown_local_cluster(name: Optional[str] = None):
    """Tears down a local kind cluster by running `delete_cluster.sh`.

    On success, re-runs the Kubernetes compute capability check and logs a
    completion message. If the script exits with code 100, logs that the
    cluster does not exist.

    Args:
        name: Name of the local cluster. Falls back to
            DEFAULT_LOCAL_CLUSTER_NAME when None or empty.

    Raises:
        RuntimeError: If the script exits with a code other than 0 or 100.
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Run the script from its own directory.
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Build the argv directly: interpolating into a string and splitting it
    # would mis-tokenize a script path or cluster name that contains
    # whitespace or quote characters.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this kind notice so it does not show up in the error text.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            # Exit code 100 is how the script reports a missing cluster.
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
@@ -44,3 +44,8 @@ class KubernetesAutoscalerType(enum.Enum):
44
44
  KARPENTER = 'karpenter'
45
45
  COREWEAVE = 'coreweave'
46
46
  GENERIC = 'generic'
47
+
48
+ def emits_autoscale_event(self) -> bool:
49
+ """Returns whether specific autoscaler emits the event reason
50
+ TriggeredScaleUp."""
51
+ return self not in {self.KARPENTER}
sky/utils/ux_utils.py CHANGED
@@ -1,11 +1,12 @@
1
1
  """Utility functions for UX."""
2
2
  import contextlib
3
3
  import enum
4
+ import fnmatch
4
5
  import os
5
6
  import sys
6
7
  import traceback
7
8
  import typing
8
- from typing import Callable, Optional, Union
9
+ from typing import Callable, Iterable, List, Optional, Union
9
10
 
10
11
  import colorama
11
12
 
@@ -288,3 +289,36 @@ def command_hint_messages(hint_type: CommandHintType,
288
289
  f'{BOLD}sky jobs queue{RESET_BOLD}')
289
290
  else:
290
291
  raise ValueError(f'Invalid hint type: {hint_type}')
292
+
293
+
294
def is_glob_pattern(pattern: str) -> bool:
    """Checks if a string contains common glob pattern wildcards.

    Args:
        pattern: The string to inspect.

    Returns:
        True if `pattern` contains any of `*`, `?`, `[` or `]`; False
        otherwise, including for the empty string.
    """
    # The globstar '**' needs no separate check: it always contains '*'.
    return any(char in '*?[]' for char in pattern)
304
+
305
+
306
def get_non_matched_query(query_clusters: Iterable[str],
                          cluster_names: Iterable[str]) -> List[str]:
    """Gets the non-matched query clusters.

    Args:
        query_clusters: Queried cluster names; entries containing glob
            wildcards are treated as patterns.
        cluster_names: Cluster names to match the queries against.

    Returns:
        The queries that match nothing in `cluster_names`: exact-name
        queries first, then glob queries, each group in input order.
    """
    # Materialize once. `cluster_names` is consulted once per query, so a
    # one-shot iterable (e.g. a generator) would be exhausted by the first
    # lookup and every later query would be reported as not found.
    existing_names = list(cluster_names)
    existing_name_set = set(existing_names)

    glob_queries = []
    exact_queries = []
    for query in query_clusters:
        if is_glob_pattern(query):
            glob_queries.append(query)
        else:
            exact_queries.append(query)

    not_found_clusters = [
        query for query in exact_queries if query not in existing_name_set
    ]
    not_found_clusters.extend(
        query for query in glob_queries
        if not fnmatch.filter(existing_names, query))
    return not_found_clusters
sky/utils/yaml_utils.py CHANGED
@@ -44,6 +44,15 @@ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
44
44
  return config
45
45
 
46
46
 
47
def read_yaml_str(yaml_str: str) -> Dict[str, Any]:
    """Parses a single YAML document from a string.

    Args:
        yaml_str: The YAML text to parse.

    Returns:
        The parsed content, or an empty dict when the parse result is
        falsy (for example, an empty document).
    """
    # Any falsy parse result is normalized to an empty dict.
    return safe_load(io.StringIO(yaml_str)) or {}
54
+
55
+
47
56
  def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
48
57
  stream = io.StringIO(yaml_str)
49
58
  config = safe_load_all(stream)
sky/volumes/client/sdk.py CHANGED
@@ -3,13 +3,16 @@ import json
3
3
  import typing
4
4
  from typing import Any, Dict, List
5
5
 
6
+ from sky import exceptions
6
7
  from sky import sky_logging
7
8
  from sky.adaptors import common as adaptors_common
8
9
  from sky.server import common as server_common
10
+ from sky.server import versions
9
11
  from sky.server.requests import payloads
10
12
  from sky.usage import usage_lib
11
13
  from sky.utils import annotations
12
14
  from sky.utils import context
15
+ from sky.utils import ux_utils
13
16
  from sky.volumes import volume as volume_lib
14
17
 
15
18
  if typing.TYPE_CHECKING:
@@ -71,12 +74,44 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
71
74
  config=volume.config,
72
75
  labels=volume.labels,
73
76
  )
74
- response = requests.post(f'{server_common.get_server_url()}/volumes/apply',
75
- json=json.loads(body.model_dump_json()),
76
- cookies=server_common.get_api_cookie_jar())
77
+ response = server_common.make_authenticated_request(
78
+ 'POST', '/volumes/apply', json=json.loads(body.model_dump_json()))
77
79
  return server_common.get_request_id(response)
78
80
 
79
81
 
82
@context.contextual
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
@versions.minimal_api_version(20)
def validate(volume: volume_lib.Volume) -> None:
    """Asks the API server to validate a volume.

    The volume's fields are posted to `/volumes/validate`; no validation
    is performed in this function itself.

    Args:
        volume: The volume to validate.

    Raises:
        The exception deserialized from the response's `detail` field when
        the server answers with HTTP 400.
    """
    payload = payloads.VolumeValidateBody(
        name=volume.name,
        volume_type=volume.type,
        infra=volume.infra,
        resource_name=volume.resource_name,
        size=volume.size,
        config=volume.config,
        labels=volume.labels,
    )
    payload_json = json.loads(payload.model_dump_json())
    resp = server_common.make_authenticated_request('POST',
                                                    '/volumes/validate',
                                                    json=payload_json)
    # Only a 400 response is turned into an exception here.
    if resp.status_code != 400:
        return
    with ux_utils.print_exception_no_traceback():
        raise exceptions.deserialize_exception(resp.json().get('detail'))
113
+
114
+
80
115
  @context.contextual
81
116
  @usage_lib.entrypoint
82
117
  @server_common.check_server_healthy_or_start
@@ -87,8 +122,10 @@ def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
87
122
  Returns:
88
123
  The request ID of the list request.
89
124
  """
90
- response = requests.get(f'{server_common.get_server_url()}/volumes',
91
- cookies=server_common.get_api_cookie_jar())
125
+ response = server_common.make_authenticated_request(
126
+ 'GET',
127
+ '/volumes',
128
+ )
92
129
  return server_common.get_request_id(response)
93
130
 
94
131
 
@@ -106,7 +143,6 @@ def delete(names: List[str]) -> server_common.RequestId[None]:
106
143
  The request ID of the delete request.
107
144
  """
108
145
  body = payloads.VolumeDeleteBody(names=names)
109
- response = requests.post(f'{server_common.get_server_url()}/volumes/delete',
110
- json=json.loads(body.model_dump_json()),
111
- cookies=server_common.get_api_cookie_jar())
146
+ response = server_common.make_authenticated_request(
147
+ 'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
112
148
  return server_common.get_request_id(response)
@@ -213,6 +213,7 @@ def volume_apply(
213
213
  # generate the storage name on cloud.
214
214
  cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
215
215
  assert cloud_obj is not None
216
+ region, zone = cloud_obj.validate_region_zone(region, zone)
216
217
  name_uuid = str(uuid.uuid4())[:6]
217
218
  name_on_cloud = common_utils.make_cluster_name_on_cloud(
218
219
  name, max_length=cloud_obj.max_cluster_name_length())