skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,11 @@
1
- """Utility functions for deploying Kubernetes clusters."""
1
+ """Utility functions for deploying local Kubernetes kind clusters."""
2
2
  import os
3
3
  import random
4
4
  import shlex
5
5
  import subprocess
6
6
  import tempfile
7
7
  import textwrap
8
- from typing import List, Optional, Tuple
9
-
10
- import colorama
8
+ from typing import Optional, Tuple
11
9
 
12
10
  from sky import check as sky_check
13
11
  from sky import sky_logging
@@ -20,7 +18,6 @@ from sky.utils import log_utils
20
18
  from sky.utils import rich_utils
21
19
  from sky.utils import subprocess_utils
22
20
  from sky.utils import ux_utils
23
- from sky.utils.kubernetes import deploy_ssh_node_pools
24
21
 
25
22
  logger = sky_logging.init_logger(__name__)
26
23
 
@@ -32,95 +29,6 @@ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
32
29
  LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
33
30
 
34
31
 
35
- def check_ssh_cluster_dependencies(
36
- raise_error: bool = True) -> Optional[List[str]]:
37
- """Checks if the dependencies for ssh cluster are installed.
38
-
39
- Args:
40
- raise_error: set to true when the dependency needs to be present.
41
- set to false for `sky check`, where reason strings are compiled
42
- at the end.
43
-
44
- Returns: the reasons list if there are missing dependencies.
45
- """
46
- # error message
47
- jq_message = ('`jq` is required to setup ssh cluster.')
48
-
49
- # save
50
- reasons = []
51
- required_binaries = []
52
-
53
- # Ensure jq is installed
54
- try:
55
- subprocess.run(['jq', '--version'],
56
- stdout=subprocess.DEVNULL,
57
- stderr=subprocess.DEVNULL,
58
- check=True)
59
- except (FileNotFoundError, subprocess.CalledProcessError):
60
- required_binaries.append('jq')
61
- reasons.append(jq_message)
62
-
63
- if required_binaries:
64
- reasons.extend([
65
- 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
66
- f' $ sudo apt install {" ".join(required_binaries)}',
67
- 'On MacOS, install with: ',
68
- f' $ brew install {" ".join(required_binaries)}',
69
- ])
70
- if raise_error:
71
- with ux_utils.print_exception_no_traceback():
72
- raise RuntimeError('\n'.join(reasons))
73
- return reasons
74
- return None
75
-
76
-
77
- def deploy_ssh_cluster(cleanup: bool = False,
78
- infra: Optional[str] = None,
79
- kubeconfig_path: Optional[str] = None):
80
- """Deploy a Kubernetes cluster on SSH targets.
81
-
82
- This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
83
- Kubernetes cluster on the specified machines.
84
-
85
- Args:
86
- cleanup: Whether to clean up the cluster instead of deploying.
87
- infra: Name of the cluster in ssh_node_pools.yaml to use.
88
- If None, the first cluster in the file will be used.
89
- kubeconfig_path: Path to save the Kubernetes configuration file.
90
- If None, the default ~/.kube/config will be used.
91
- """
92
- check_ssh_cluster_dependencies()
93
-
94
- action = 'Cleanup' if cleanup else 'Deployment'
95
- msg_str = f'Initializing SSH Node Pools {action}...'
96
-
97
- with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
98
- try:
99
- deploy_ssh_node_pools.deploy_clusters(
100
- infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
101
- except Exception as e: # pylint: disable=broad-except
102
- logger.error(str(e))
103
- with ux_utils.print_exception_no_traceback():
104
- raise RuntimeError(
105
- 'Failed to deploy SkyPilot on some Node Pools.') from e
106
-
107
- logger.info('')
108
- if cleanup:
109
- logger.info(
110
- ux_utils.finishing_message(
111
- '🎉 SSH Node Pools cleaned up successfully.'))
112
- else:
113
- logger.info(
114
- ux_utils.finishing_message(
115
- '🎉 SSH Node Pools set up successfully. ',
116
- follow_up_message=(
117
- f'Run `{colorama.Style.BRIGHT}'
118
- f'sky check ssh'
119
- f'{colorama.Style.RESET_ALL}` to verify access, '
120
- f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
121
- f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
122
-
123
-
124
32
  def generate_kind_config(port_start: int,
125
33
  num_nodes: int = 1,
126
34
  gpus: bool = False) -> str:
@@ -1,379 +1,10 @@
1
1
  #!/bin/bash
2
- # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
3
- # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
4
- # Returns a valid credential format for kubectl with expiration. The expiration
5
- # is calculated based on the TTL argument and is required to force kubectl to
6
- # check the tunnel status frequently.
7
2
 
8
- # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
3
+ # This redirect stub is needed because we use this script in the
4
+ # exec auth section when creating our kubeconfig. Therefore, node pools
5
+ # launched in older versions of SkyPilot will have kubeconfigs pointing
6
+ # to this path.
9
7
 
10
- # Default time-to-live for credential in seconds
11
- # This forces kubectl to check the tunnel status frequently
12
- TTL_SECONDS=30
13
-
14
- # Parse arguments
15
- USE_SSH_CONFIG=0
16
- SSH_KEY=""
17
- CONTEXT=""
18
- HOST=""
19
- USER=""
20
- PORT=6443 # Default port if not specified
21
-
22
- # Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
23
- debug_log() {
24
- local message="$(date): $1"
25
- echo "$message" >> "$LOG_FILE"
26
- }
27
-
28
- # Generate expiration timestamp for credential
29
- generate_expiration_timestamp() {
30
- # Try macOS date format first, fallback to Linux format
31
- date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
32
- }
33
-
34
- # Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
35
- acquire_lock() {
36
- # Check for flock command
37
- if ! command -v flock >/dev/null 2>&1; then
38
- debug_log "flock command not available, using alternative lock mechanism"
39
- # Simple file-based locking
40
- if [ -f "$LOCK_FILE" ]; then
41
- lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
42
- if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
43
- debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
44
- return 1
45
- else
46
- # Stale lock file
47
- debug_log "Removing stale lock file"
48
- rm -f "$LOCK_FILE"
49
- fi
50
- fi
51
- # Create our lock
52
- echo $$ > "$LOCK_FILE"
53
- return 0
54
- else
55
- # Use flock for better locking
56
- exec 9>"$LOCK_FILE"
57
- if ! flock -n 9; then
58
- debug_log "Another process is starting the tunnel, waiting briefly"
59
- return 1
60
- fi
61
- return 0
62
- fi
63
- }
64
-
65
- # Release the lock
66
- release_lock() {
67
- if command -v flock >/dev/null 2>&1; then
68
- # Using flock
69
- exec 9>&- # Close file descriptor to release lock
70
- else
71
- # Using simple lock
72
- rm -f "$LOCK_FILE"
73
- fi
74
- debug_log "Lock released"
75
- }
76
-
77
- # Generate SSH command based on available tools and parameters
78
- generate_ssh_command() {
79
- # Check for autossh
80
- if ! command -v autossh >/dev/null 2>&1; then
81
- debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
82
- debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
83
-
84
- # Fall back to regular ssh
85
- if [[ $USE_SSH_CONFIG -eq 1 ]]; then
86
- SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
87
- else
88
- SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
89
-
90
- # Add SSH key if provided
91
- if [[ -n "$SSH_KEY" ]]; then
92
- SSH_CMD+=("-i" "$SSH_KEY")
93
- fi
94
-
95
- # Add user@host
96
- SSH_CMD+=("$USER@$HOST")
97
- fi
98
- else
99
- # Configure autossh
100
- if [[ $USE_SSH_CONFIG -eq 1 ]]; then
101
- SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
102
- else
103
- SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
104
-
105
- # Add SSH key if provided
106
- if [[ -n "$SSH_KEY" ]]; then
107
- SSH_CMD+=("-i" "$SSH_KEY")
108
- fi
109
-
110
- # Add user@host
111
- SSH_CMD+=("$USER@$HOST")
112
- fi
113
- fi
114
- }
115
-
116
- # Function to read certificate files if they exist
117
- read_certificate_data() {
118
- local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
119
- local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
120
- local cert_data=""
121
- local key_data=""
122
-
123
- if [[ -f "$client_cert_file" ]]; then
124
- # Read the certificate file as is - it's already in PEM format
125
- cert_data=$(cat "$client_cert_file")
126
- debug_log "Found client certificate data for context $CONTEXT"
127
-
128
- # Log the first and last few characters to verify PEM format
129
- local cert_start=$(head -1 "$client_cert_file")
130
- local cert_end=$(tail -1 "$client_cert_file")
131
- debug_log "Certificate starts with: $cert_start"
132
- debug_log "Certificate ends with: $cert_end"
133
-
134
- # Check if it has proper PEM format
135
- if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
136
- debug_log "WARNING: Certificate file may not be in proper PEM format"
137
- # Try to fix it if needed
138
- if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
139
- echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
140
- cat "$client_cert_file" >> "$client_cert_file.fixed"
141
- echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
142
- mv "$client_cert_file.fixed" "$client_cert_file"
143
- cert_data=$(cat "$client_cert_file")
144
- debug_log "Fixed certificate format by adding BEGIN/END markers"
145
- fi
146
- fi
147
- fi
148
-
149
- if [[ -f "$client_key_file" ]]; then
150
- # Read the key file as is - it's already in PEM format
151
- key_data=$(cat "$client_key_file")
152
- debug_log "Found client key data for context $CONTEXT"
153
-
154
- # Log the first and last few characters to verify PEM format
155
- local key_start=$(head -1 "$client_key_file")
156
- local key_end=$(tail -1 "$client_key_file")
157
- debug_log "Key starts with: $key_start"
158
- debug_log "Key ends with: $key_end"
159
-
160
- # Check if it has proper PEM format
161
- if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
162
- debug_log "WARNING: Key file may not be in proper PEM format"
163
- # Try to fix it if needed
164
- if ! grep -q "BEGIN" "$client_key_file"; then
165
- echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
166
- cat "$client_key_file" >> "$client_key_file.fixed"
167
- echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
168
- mv "$client_key_file.fixed" "$client_key_file"
169
- key_data=$(cat "$client_key_file")
170
- debug_log "Fixed key format by adding BEGIN/END markers"
171
- fi
172
- fi
173
- fi
174
-
175
- echo "$cert_data:$key_data"
176
- }
177
-
178
- # Function to generate credentials JSON
179
- generate_credentials_json() {
180
- local expiration_time=$(generate_expiration_timestamp)
181
- local cert_bundle=$(read_certificate_data)
182
- local client_cert_data=${cert_bundle%:*}
183
- local client_key_data=${cert_bundle#*:}
184
-
185
- if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
186
- # Debug the certificate data
187
- debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
188
- debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
189
-
190
- # Check if we can create proper JSON with `jq`
191
- if ! command -v jq &>/dev/null; then
192
- echo "jq is not installed. Please install jq to use this script." >&2
193
- exit 1
194
- fi
195
- debug_log "Using jq for JSON formatting"
196
-
197
- # Create a temporary file for the JSON output to avoid shell escaping issues
198
- local TEMP_JSON_FILE=$(mktemp)
199
-
200
- # Write the JSON to the temporary file using jq for proper JSON formatting
201
- cat > "$TEMP_JSON_FILE" << EOL
202
- {
203
- "apiVersion": "client.authentication.k8s.io/v1beta1",
204
- "kind": "ExecCredential",
205
- "status": {
206
- "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
207
- "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
208
- "expirationTimestamp": "$expiration_time"
209
- }
210
- }
211
- EOL
212
-
213
- # Read the JSON from the file
214
- local json_response=$(cat "$TEMP_JSON_FILE")
215
-
216
- # Clean up
217
- rm -f "$TEMP_JSON_FILE"
218
-
219
- # Output the JSON
220
- echo "$json_response"
221
- else
222
- # Fallback to token-based credential for tunnel-only authentication
223
- echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
224
- fi
225
- }
226
-
227
- while [[ $# -gt 0 ]]; do
228
- case $1 in
229
- --use-ssh-config)
230
- USE_SSH_CONFIG=1
231
- shift
232
- ;;
233
- --ssh-key)
234
- SSH_KEY="$2"
235
- shift 2
236
- ;;
237
- --context)
238
- CONTEXT="$2"
239
- shift 2
240
- ;;
241
- --port)
242
- PORT="$2"
243
- shift 2
244
- ;;
245
- --host)
246
- HOST="$2"
247
- shift 2
248
- ;;
249
- --user)
250
- USER="$2"
251
- shift 2
252
- ;;
253
- --ttl)
254
- TTL_SECONDS="$2"
255
- shift 2
256
- ;;
257
- *)
258
- echo "Unknown parameter: $1" >&2
259
- exit 1
260
- ;;
261
- esac
262
- done
263
-
264
- # Validate required parameters
265
- if [[ -z "$HOST" ]]; then
266
- echo "Error: --host parameter is required" >&2
267
- exit 1
268
- fi
269
-
270
- # Setup directories
271
- TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
272
- mkdir -p "$TUNNEL_DIR"
273
-
274
- # Get context name for PID file
275
- if [[ -z "$CONTEXT" ]]; then
276
- CONTEXT="default"
277
- fi
278
-
279
- PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
280
- LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
281
- LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
282
-
283
- debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
284
- debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
285
-
286
- # Check if specified port is already in use (tunnel may be running)
287
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
288
- debug_log "Port $PORT already in use, checking if it's our tunnel"
289
-
290
- # Check if there's a PID file and if that process is running
291
- if [[ -f "$PID_FILE" ]]; then
292
- OLD_PID=$(cat "$PID_FILE")
293
- if kill -0 "$OLD_PID" 2>/dev/null; then
294
- debug_log "Tunnel appears to be running with PID $OLD_PID"
295
- else
296
- debug_log "PID file exists but process $OLD_PID is not running"
297
- fi
298
- else
299
- debug_log "Port $PORT is in use but no PID file exists"
300
- fi
301
-
302
- # Return valid credential format for kubectl with expiration
303
- generate_credentials_json
304
- exit 0
305
- fi
306
-
307
- # Try to acquire the lock
308
- if ! acquire_lock; then
309
- # Wait briefly for the tunnel to be established
310
- for i in {1..10}; do
311
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
312
- debug_log "Tunnel is now active"
313
-
314
- # Return valid credential format for kubectl with expiration
315
- generate_credentials_json
316
- exit 0
317
- fi
318
- sleep 0.2
319
- done
320
- debug_log "Waited for tunnel but port $PORT still not available"
321
- fi
322
-
323
- # Check if we have a PID file with running process
324
- if [[ -f "$PID_FILE" ]]; then
325
- OLD_PID=$(cat "$PID_FILE")
326
- if kill -0 "$OLD_PID" 2>/dev/null; then
327
- # Process exists but port isn't open - something's wrong, kill it
328
- kill "$OLD_PID" 2>/dev/null
329
- debug_log "Killed stale tunnel process $OLD_PID"
330
- else
331
- debug_log "PID file exists but process $OLD_PID is not running anymore"
332
- fi
333
- # Remove the stale PID file
334
- rm -f "$PID_FILE"
335
- fi
336
-
337
- # Generate the SSH command
338
- generate_ssh_command
339
-
340
- debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
341
-
342
- # Start the tunnel in foreground and wait for it to establish
343
- "${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
344
- TUNNEL_PID=$!
345
-
346
- # Save PID
347
- echo $TUNNEL_PID > "$PID_FILE"
348
- debug_log "Tunnel started with PID $TUNNEL_PID"
349
-
350
- # Wait for tunnel to establish
351
- tunnel_up=0
352
- for i in {1..20}; do
353
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
354
- debug_log "Tunnel established successfully on port $PORT"
355
- tunnel_up=1
356
- break
357
- fi
358
- sleep 0.2
359
- done
360
-
361
- # Clean up lock file
362
- release_lock
363
-
364
- # Check if the tunnel process is still running
365
- if ! kill -0 $TUNNEL_PID 2>/dev/null; then
366
- debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
367
- if [[ -f "$PID_FILE" ]]; then
368
- rm -f "$PID_FILE"
369
- fi
370
- # Return error in case of tunnel failure
371
- echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
372
- exit 1
373
- elif [[ $tunnel_up -eq 0 ]]; then
374
- debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
375
- fi
376
-
377
- # Return valid credential format with certificates if available
378
- generate_credentials_json
379
- exit 0
8
+ # TODO (kyuds): remove this script after v0.13.0. Kept here for backwards compat.
9
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
10
+ exec "$SCRIPT_DIR/../../ssh_node_pools/deploy/tunnel/ssh-tunnel.sh" "$@"
sky/utils/schemas.py CHANGED
@@ -1401,6 +1401,27 @@ def get_config_schema():
1401
1401
  **_CONTEXT_CONFIG_SCHEMA_MINIMAL,
1402
1402
  }
1403
1403
  },
1404
+ 'slurm': {
1405
+ 'type': 'object',
1406
+ 'required': [],
1407
+ 'additionalProperties': False,
1408
+ 'properties': {
1409
+ 'allowed_clusters': {
1410
+ 'oneOf': [{
1411
+ 'type': 'array',
1412
+ 'items': {
1413
+ 'type': 'string',
1414
+ },
1415
+ }, {
1416
+ 'type': 'string',
1417
+ 'pattern': '^all$'
1418
+ }]
1419
+ },
1420
+ 'provision_timeout': {
1421
+ 'type': 'integer',
1422
+ },
1423
+ }
1424
+ },
1404
1425
  'oci': {
1405
1426
  'type': 'object',
1406
1427
  'required': [],
@@ -1435,6 +1456,16 @@ def get_config_schema():
1435
1456
  }
1436
1457
  },
1437
1458
  },
1459
+ 'vast': {
1460
+ 'type': 'object',
1461
+ 'required': [],
1462
+ 'additionalProperties': False,
1463
+ 'properties': {
1464
+ 'secure_only': {
1465
+ 'type': 'boolean',
1466
+ },
1467
+ }
1468
+ },
1438
1469
  'nebius': {
1439
1470
  'type': 'object',
1440
1471
  'required': [],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20251203
3
+ Version: 1.0.0.dev20251210
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -64,6 +64,7 @@ Requires-Dist: passlib
64
64
  Requires-Dist: bcrypt==4.0.1
65
65
  Requires-Dist: pyjwt
66
66
  Requires-Dist: gitpython
67
+ Requires-Dist: paramiko
67
68
  Requires-Dist: types-paramiko
68
69
  Requires-Dist: alembic
69
70
  Requires-Dist: aiohttp
@@ -389,52 +390,63 @@ Requires-Dist: grpcio>=1.63.0; extra == "shadeform"
389
390
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "shadeform"
390
391
  Requires-Dist: aiosqlite; extra == "shadeform"
391
392
  Requires-Dist: greenlet; extra == "shadeform"
393
+ Provides-Extra: slurm
394
+ Requires-Dist: casbin; extra == "slurm"
395
+ Requires-Dist: sqlalchemy_adapter; extra == "slurm"
396
+ Requires-Dist: passlib; extra == "slurm"
397
+ Requires-Dist: pyjwt; extra == "slurm"
398
+ Requires-Dist: aiohttp; extra == "slurm"
399
+ Requires-Dist: anyio; extra == "slurm"
400
+ Requires-Dist: grpcio>=1.63.0; extra == "slurm"
401
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "slurm"
402
+ Requires-Dist: aiosqlite; extra == "slurm"
403
+ Requires-Dist: greenlet; extra == "slurm"
392
404
  Provides-Extra: all
393
- Requires-Dist: greenlet; extra == "all"
394
- Requires-Dist: azure-identity>=1.19.0; extra == "all"
395
- Requires-Dist: msrestazure; extra == "all"
396
- Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
397
- Requires-Dist: aiosqlite; extra == "all"
398
- Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
399
- Requires-Dist: anyio; extra == "all"
400
- Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
401
- Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
402
405
  Requires-Dist: ibm-cloud-sdk-core; extra == "all"
406
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
407
+ Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
408
+ Requires-Dist: colorama<0.4.5; extra == "all"
403
409
  Requires-Dist: sqlalchemy_adapter; extra == "all"
404
- Requires-Dist: botocore>=1.29.10; extra == "all"
405
410
  Requires-Dist: msgraph-sdk; extra == "all"
406
- Requires-Dist: aiohttp; extra == "all"
407
- Requires-Dist: nebius>=0.3.12; extra == "all"
408
- Requires-Dist: passlib; extra == "all"
409
- Requires-Dist: grpcio>=1.63.0; extra == "all"
410
- Requires-Dist: websockets; extra == "all"
411
- Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
412
- Requires-Dist: google-cloud-storage; extra == "all"
413
- Requires-Dist: azure-cli>=2.65.0; extra == "all"
411
+ Requires-Dist: greenlet; extra == "all"
414
412
  Requires-Dist: oci; extra == "all"
415
- Requires-Dist: ecsapi==0.4.0; extra == "all"
416
- Requires-Dist: cudo-compute>=0.1.10; extra == "all"
417
- Requires-Dist: azure-core>=1.31.0; extra == "all"
418
- Requires-Dist: colorama<0.4.5; extra == "all"
419
- Requires-Dist: ibm-cos-sdk; extra == "all"
420
- Requires-Dist: python-dateutil; extra == "all"
421
- Requires-Dist: docker; extra == "all"
422
- Requires-Dist: awscli>=1.27.10; extra == "all"
423
413
  Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
424
- Requires-Dist: tomli; extra == "all"
425
- Requires-Dist: azure-core>=1.24.0; extra == "all"
426
- Requires-Dist: casbin; extra == "all"
427
- Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
428
- Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
414
+ Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
429
415
  Requires-Dist: pyjwt; extra == "all"
416
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
417
+ Requires-Dist: ray[default]>=2.6.1; extra == "all"
430
418
  Requires-Dist: runpod>=1.6.1; extra == "all"
419
+ Requires-Dist: docker; extra == "all"
420
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
421
+ Requires-Dist: python-dateutil; extra == "all"
422
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
423
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
424
+ Requires-Dist: passlib; extra == "all"
425
+ Requires-Dist: awscli>=1.27.10; extra == "all"
426
+ Requires-Dist: cudo-compute>=0.1.10; extra == "all"
431
427
  Requires-Dist: boto3>=1.26.1; extra == "all"
432
- Requires-Dist: ray[default]>=2.6.1; extra == "all"
433
- Requires-Dist: pydo>=0.3.0; extra == "all"
434
- Requires-Dist: azure-common; extra == "all"
428
+ Requires-Dist: botocore>=1.29.10; extra == "all"
429
+ Requires-Dist: websockets; extra == "all"
430
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
431
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
435
432
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
436
- Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
433
+ Requires-Dist: msrestazure; extra == "all"
434
+ Requires-Dist: ibm-cos-sdk; extra == "all"
435
+ Requires-Dist: grpcio>=1.63.0; extra == "all"
436
+ Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
437
+ Requires-Dist: azure-common; extra == "all"
438
+ Requires-Dist: aiohttp; extra == "all"
439
+ Requires-Dist: nebius>=0.3.12; extra == "all"
437
440
  Requires-Dist: ibm-vpc; extra == "all"
441
+ Requires-Dist: casbin; extra == "all"
442
+ Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
443
+ Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
444
+ Requires-Dist: tomli; extra == "all"
445
+ Requires-Dist: ecsapi==0.4.0; extra == "all"
446
+ Requires-Dist: pydo>=0.3.0; extra == "all"
447
+ Requires-Dist: google-cloud-storage; extra == "all"
448
+ Requires-Dist: anyio; extra == "all"
449
+ Requires-Dist: aiosqlite; extra == "all"
438
450
  Provides-Extra: remote
439
451
  Requires-Dist: grpcio>=1.63.0; extra == "remote"
440
452
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "remote"