skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/deploy/tunnel_utils.py ADDED
@@ -0,0 +1,199 @@
+"""Utilities to set up SSH tunnels."""
+import os
+import random
+import re
+import subprocess
+import sys
+from typing import Set
+
+import colorama
+
+from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools.deploy import utils as deploy_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Get the directory of this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _get_used_localhost_ports() -> Set[int]:
+    """Get SSH port forwardings already in use on localhost."""
+    used_ports = set()
+
+    # Get ports from netstat (works on macOS and Linux)
+    try:
+        if sys.platform == 'darwin':
+            # macOS
+            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+        else:
+            # Linux and other Unix-like systems
+            result = subprocess.run(['netstat', '-tln'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+
+        if result.returncode == 0:
+            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
+            for line in result.stdout.splitlines():
+                if '127.0.0.1:' in line or 'localhost:' in line:
+                    match = re.search(r':(64\d\d)\s', line)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except (subprocess.SubprocessError, FileNotFoundError):
+        # If netstat fails, try another approach
+        pass
+
+    # Also check ports from existing kubeconfig entries
+    try:
+        result = subprocess.run([
+            'kubectl', 'config', 'view', '-o',
+            'jsonpath=\'{.clusters[*].cluster.server}\''
+        ],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+
+        if result.returncode == 0:
+            # Look for localhost URLs with ports
+            for url in result.stdout.split():
+                if 'localhost:' in url or '127.0.0.1:' in url:
+                    match = re.search(r':(\d+)', url)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except subprocess.SubprocessError:
+        pass
+
+    return used_ports
+
+
+def get_available_port(start: int = 6443, end: int = 6499) -> int:
+    """Get an available port in the given range not used by other tunnels."""
+    used_ports = _get_used_localhost_ports()
+
+    # Try to use port 6443 first if available for the first cluster
+    if start == 6443 and start not in used_ports:
+        return start
+
+    # Otherwise find any available port in the range
+    available_ports = list(set(range(start, end + 1)) - used_ports)
+
+    if not available_ports:
+        # If all ports are used, pick a random one from our range
+        # (we'll terminate any existing connection in the setup)
+        return random.randint(start, end)
+
+    # Sort to get deterministic allocation
+    available_ports.sort()
+    return available_ports[0]
+
+
+def setup_kubectl_ssh_tunnel(head_node,
+                             ssh_user,
+                             ssh_key,
+                             context_name,
+                             use_ssh_config=False):
+    """Set up a kubeconfig exec credential plugin for the SSH tunnel."""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Setting up SSH tunnel for '
+                f'Kubernetes API access...{colorama.Style.RESET_ALL}')
+
+    # Get an available port for this cluster
+    port = get_available_port()
+
+    # Paths to scripts
+    tunnel_script = os.path.join(SCRIPT_DIR, 'tunnel', 'ssh-tunnel.sh')
+
+    # Make sure scripts are executable
+    os.chmod(tunnel_script, 0o755)
+
+    # Certificate files
+    client_cert_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                    f'{context_name}-cert.pem')
+    client_key_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                   f'{context_name}-key.pem')
+
+    # Update kubeconfig to use localhost with the selected port
+    deploy_utils.run_command([
+        'kubectl', 'config', 'set-cluster', context_name,
+        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
+    ])
+
+    # Build the exec args list based on auth method
+    exec_args = [
+        '--exec-command', tunnel_script, '--exec-api-version',
+        'client.authentication.k8s.io/v1beta1'
+    ]
+
+    # Set credential TTL to force frequent tunnel checks
+    ttl_seconds = 30
+
+    # Verify if we have extracted certificate data files
+    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
+        client_key_file)
+    if has_cert_files:
+        logger.info(f'{colorama.Fore.GREEN}Client certificate data extracted '
+                    'and will be used for authentication'
+                    f'{colorama.Style.RESET_ALL}')
+
+    if use_ssh_config:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
+                '--exec-arg=--host', f'--exec-arg={head_node}'
+            ])
+    else:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--host',
+                f'--exec-arg={head_node}', '--exec-arg=--user',
+                f'--exec-arg={ssh_user}', '--exec-arg=--ssh-key',
+                f'--exec-arg={ssh_key}'
+            ])
+
+    logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel configured through '
+                'kubectl credential plugin on port '
+                f'{port}{colorama.Style.RESET_ALL}')
+    logger.info('Your kubectl connection is now tunneled through SSH '
+                f'(port {port}).')
+    logger.info('This tunnel will be automatically established when needed.')
+    logger.info(f'Credential TTL set to {ttl_seconds}s to ensure tunnel '
+                'health is checked frequently.')
+    return port
+
+
+def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
+    """Clean up the SSH tunnel for a specific context."""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Cleaning up SSH tunnel for '
+                f'`{cluster_name}`...{colorama.Style.RESET_ALL}')
+
+    # Path to cleanup script
+    cleanup_script = os.path.join(SCRIPT_DIR, 'tunnel', 'cleanup-tunnel.sh')
+
+    # Make sure script is executable
+    if os.path.exists(cleanup_script):
+        os.chmod(cleanup_script, 0o755)
+
+        # Run the cleanup script
+        subprocess.run([cleanup_script, context_name],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=False)
+        logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel for `{cluster_name}` '
+                    f'cleaned up.{colorama.Style.RESET_ALL}')
+    else:
+        logger.error(f'{colorama.Fore.YELLOW}Cleanup script not found: '
+                     f'{cleanup_script}{colorama.Style.RESET_ALL}')
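
For orientation, a minimal sketch of how these new helpers could be invoked during SSH Node Pool deployment; the head node address, user, key path, and pool/context names below are hypothetical, not taken from this release:

    from sky.ssh_node_pools.deploy import tunnel_utils

    # Hypothetical pool: head node 10.0.0.1, kubeconfig context 'ssh-my-pool'.
    port = tunnel_utils.setup_kubectl_ssh_tunnel(head_node='10.0.0.1',
                                                 ssh_user='ubuntu',
                                                 ssh_key='/home/me/.ssh/id_rsa',
                                                 context_name='ssh-my-pool')
    print(f'kubectl now points at https://127.0.0.1:{port}')

    # Later, tear the tunnel down for that context.
    tunnel_utils.cleanup_kubectl_ssh_tunnel('my-pool', 'ssh-my-pool')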
sky/ssh_node_pools/deploy/utils.py ADDED
@@ -0,0 +1,173 @@
+"""Utilities for SSH Node Pools deployment."""
+import os
+import subprocess
+from typing import List, Optional
+
+import colorama
+
+from sky import sky_logging
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def check_ssh_cluster_dependencies(
+        raise_error: bool = True) -> Optional[List[str]]:
+    """Checks if the dependencies for SSH clusters are installed.
+
+    Args:
+        raise_error: set to True when the dependency needs to be present;
+            set to False for `sky check`, where reason strings are compiled
+            at the end.
+
+    Returns: the list of reasons if there are missing dependencies.
+    """
+    # Error message
+    jq_message = ('`jq` is required to set up an SSH cluster.')
+
+    # Save the reasons and missing binaries
+    reasons = []
+    required_binaries = []
+
+    # Ensure jq is installed
+    try:
+        subprocess.run(['jq', '--version'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        required_binaries.append('jq')
+        reasons.append(jq_message)
+
+    if required_binaries:
+        reasons.extend([
+            'On Debian/Ubuntu, install the missing dependencies with:',
+            f'  $ sudo apt install {" ".join(required_binaries)}',
+            'On macOS, install with:',
+            f'  $ brew install {" ".join(required_binaries)}',
+        ])
+        if raise_error:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('\n'.join(reasons))
+        return reasons
+    return None
+
+
+def run_command(cmd, shell=False, silent=False):
+    """Run a local command and return the output."""
+    process = subprocess.run(cmd,
+                             shell=shell,
+                             capture_output=True,
+                             text=True,
+                             check=False)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command: {cmd}\n'
+                         f'{colorama.Style.RESET_ALL}STDOUT: {process.stdout}\n'
+                         f'STDERR: {process.stderr}')
+        return None
+    return process.stdout.strip()
+
+
+def get_effective_host_ip(hostname: str) -> str:
+    """Get the effective IP for a hostname from SSH config."""
+    try:
+        result = subprocess.run(['ssh', '-G', hostname],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+        if result.returncode == 0:
+            for line in result.stdout.splitlines():
+                if line.startswith('hostname '):
+                    return line.split(' ', 1)[1].strip()
+    except Exception:  # pylint: disable=broad-except
+        pass
+    return hostname  # Return the original hostname if lookup fails
+
+
+def run_remote(node,
+               cmd,
+               user='',
+               ssh_key='',
+               connect_timeout=30,
+               use_ssh_config=False,
+               print_output=False,
+               use_shell=False,
+               silent=False):
+    """Run a command on a remote machine via SSH."""
+    ssh_cmd: List[str]
+    if use_ssh_config:
+        # Use SSH config for connection parameters
+        ssh_cmd = ['ssh', node, cmd]
+    else:
+        # Use explicit parameters
+        ssh_cmd = [
+            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
+            '-o', f'ConnectTimeout={connect_timeout}', '-o',
+            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
+        ]
+
+        if ssh_key:
+            if not os.path.isfile(ssh_key):
+                raise ValueError(f'SSH key not found: {ssh_key}')
+            ssh_cmd.extend(['-i', ssh_key])
+
+        ssh_cmd.append(f'{user}@{node}' if user else node)
+        ssh_cmd.append(cmd)
+
+    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
+    process = subprocess.run(subprocess_cmd,
+                             capture_output=True,
+                             text=True,
+                             check=False,
+                             shell=use_shell)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command {cmd} on '
+                         f'{node}:{colorama.Style.RESET_ALL} {process.stderr}')
+        return None
+    if print_output:
+        logger.info(process.stdout)
+    return process.stdout.strip()
+
+
+def ensure_directory_exists(path):
+    """Ensure the directory for the specified file path exists."""
+    directory = os.path.dirname(path)
+    if directory and not os.path.exists(directory):
+        os.makedirs(directory, exist_ok=True)
+
+
+def check_gpu(node, user, ssh_key, use_ssh_config=False, is_head=False):
+    """Check if a node has a GPU."""
+    cmd = ('command -v nvidia-smi &> /dev/null && '
+           'nvidia-smi --query-gpu=gpu_name --format=csv,noheader')
+    result = run_remote(node,
+                        cmd,
+                        user,
+                        ssh_key,
+                        use_ssh_config=use_ssh_config,
+                        silent=True)
+    if result is not None:
+        # Check that all GPUs have the same type.
+        # Currently, SkyPilot does not support heterogeneous GPU nodes
+        # (i.e. more than one GPU type on the same node).
+        gpu_names = {
+            line.strip() for line in result.splitlines() if line.strip()
+        }
+        if not gpu_names:
+            # This can happen if nvidia-smi returns only whitespace.
+            # Set result to None to ensure this function returns False.
+            result = None
+        elif len(gpu_names) > 1:
+            # Sort for a deterministic error message.
+            sorted_gpu_names = sorted(list(gpu_names))
+            raise RuntimeError(
+                f'Node {node} has more than one GPU type '
+                f'({", ".join(sorted_gpu_names)}). '
+                'SkyPilot does not support a node with multiple GPU types.')
+        else:
+            logger.info(f'{colorama.Fore.YELLOW}➜ GPU {list(gpu_names)[0]} '
+                        f'detected on {"head" if is_head else "worker"} '
+                        f'node ({node}).{colorama.Style.RESET_ALL}')
+    return result is not None
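
A short usage sketch of the helpers above; the host name, user, and key path are hypothetical. Note that run_remote returns None on failure rather than raising, so callers check for that:

    import os

    from sky.ssh_node_pools.deploy import utils as deploy_utils

    key = os.path.expanduser('~/.ssh/id_rsa')  # run_remote requires a real file
    out = deploy_utils.run_remote('node-1', 'uname -r', user='ubuntu',
                                  ssh_key=key)
    if out is None:
        print('SSH command failed (stderr was already logged).')
    else:
        print(f'kernel: {out}')

    # check_gpu returns True only if nvidia-smi reports a (single) GPU type.
    has_gpu = deploy_utils.check_gpu('node-1', 'ubuntu', key)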
sky/ssh_node_pools/server.py CHANGED
@@ -4,12 +4,11 @@ from typing import Any, Dict, List
 
 import fastapi
 
-from sky import core as sky_core
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
-from sky.ssh_node_pools import core as ssh_node_pools_core
+from sky.ssh_node_pools import core
 from sky.utils import common_utils
 
 router = fastapi.APIRouter()
@@ -19,7 +18,7 @@ router = fastapi.APIRouter()
 def get_ssh_node_pools() -> Dict[str, Any]:
     """Get all SSH Node Pool configurations."""
     try:
-        return ssh_node_pools_core.get_all_pools()
+        return core.get_all_pools()
     except Exception as e:
         raise fastapi.HTTPException(
             status_code=500,
@@ -31,7 +30,7 @@ def get_ssh_node_pools() -> Dict[str, Any]:
 def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
     """Update SSH Node Pool configurations."""
     try:
-        ssh_node_pools_core.update_pools(pools_config)
+        core.update_pools(pools_config)
         return {'status': 'success'}
     except Exception as e:
         raise fastapi.HTTPException(status_code=400,
@@ -43,7 +42,7 @@ def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
 def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
     """Delete a SSH Node Pool configuration."""
     try:
-        if ssh_node_pools_core.delete_pool(pool_name):
+        if core.delete_pool(pool_name):
             return {'status': 'success'}
         else:
             raise fastapi.HTTPException(
@@ -70,8 +69,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
                 detail='Missing key_name or key_file')
 
         key_content = await key_file.read()
-        key_path = ssh_node_pools_core.upload_ssh_key(key_name,
-                                                      key_content.decode())
+        key_path = core.upload_ssh_key(key_name, key_content.decode())
 
         return {'status': 'success', 'key_path': key_path}
     except fastapi.HTTPException:
@@ -87,7 +85,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
 def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     try:
-        return ssh_node_pools_core.list_ssh_keys()
+        return core.list_ssh_keys()
     except Exception as e:
         exception_msg = common_utils.format_exception(e)
         raise fastapi.HTTPException(
@@ -104,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
-        func=sky_core.ssh_up,
+        func=core.ssh_up,
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -129,7 +127,7 @@ async def deploy_ssh_node_pool_general(
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
-        func=sky_core.ssh_up,
+        func=core.ssh_up,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
 
@@ -155,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
-        func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
+        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -183,7 +181,7 @@ async def down_ssh_node_pool_general(
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
-        func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
+        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -206,7 +204,7 @@ def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
     try:
         # Call ssh_status to check the context
         context_name = f'ssh-{pool_name}'
-        is_ready, reason = sky_core.ssh_status(context_name)
+        is_ready, reason = core.ssh_status(context_name)
 
         # Strip ANSI escape codes from the reason text
         def strip_ansi_codes(text):
sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} RENAMED
@@ -5,13 +5,14 @@ import subprocess
 from typing import Any, Callable, Dict, List, Optional
 import uuid
 
+import colorama
 import yaml
 
+from sky import sky_logging
+from sky.ssh_node_pools import constants
 from sky.utils import ux_utils
 
-DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
-RED = '\033[0;31m'
-NC = '\033[0m'  # No color
+logger = sky_logging.init_logger(__name__)
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
@@ -92,7 +93,8 @@ def load_ssh_targets(file_path: str) -> Dict[str, Any]:
 def get_cluster_config(
         targets: Dict[str, Any],
         cluster_name: Optional[str] = None,
-        file_path: str = DEFAULT_SSH_NODE_POOLS_PATH) -> Dict[str, Any]:
+        file_path: str = constants.DEFAULT_SSH_NODE_POOLS_PATH
+) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
         with ux_utils.print_exception_no_traceback():
@@ -186,8 +188,9 @@ def prepare_hosts_info(
         else:
             # It's a dict with potential overrides
             if 'ip' not in host:
-                print(f'{RED}Warning: Host missing \'ip\' field, '
-                      f'skipping: {host}{NC}')
+                logger.warning(f'{colorama.Fore.RED}Warning: Host missing '
+                               f'\'ip\' field, skipping: {host}'
+                               f'{colorama.Style.RESET_ALL}')
                 continue
 
             # Check if this is an SSH config hostname
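
Taken together with the move of DEFAULT_SSH_NODE_POOLS_PATH into sky/ssh_node_pools/constants.py, reading pool definitions might look like the following sketch ('my-pool' is a hypothetical pool name):

    from sky.ssh_node_pools import constants
    from sky.ssh_node_pools import utils as pool_utils

    # Load every pool defined in ~/.sky/ssh_node_pools.yaml, then select one.
    targets = pool_utils.load_ssh_targets(constants.DEFAULT_SSH_NODE_POOLS_PATH)
    config = pool_utils.get_cluster_config(targets, cluster_name='my-pool')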
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -523,6 +523,14 @@ available_node_types:
                 resourceFieldRef:
                   containerName: ray-node
                   resource: requests.memory
+            # Disable the Ray memory monitor to prevent Ray's memory manager
+            # from interfering with the Kubernetes resource manager.
+            # If enabled, the Ray memory monitor kills the running job once it
+            # uses more than 95% of its allocated memory, even if the job is
+            # well-behaved and still within its allocation. That leaves the
+            # k8s scheduler no chance to evict the pod itself.
+            - name: RAY_memory_monitor_refresh_ms
+              value: "0"
 {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
             - name: {{ key }}
               value: {{ value }}
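
For reference, the same knob applies outside this template: Ray reads RAY_memory_monitor_refresh_ms from the environment when a node starts, and zero disables the monitor. A minimal illustration:

    import os

    # Must be set before the Ray node/worker process starts; '0' disables the
    # memory monitor entirely (per Ray's out-of-memory prevention docs).
    os.environ['RAY_memory_monitor_refresh_ms'] = '0'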
sky/templates/slurm-ray.yml.j2 ADDED
@@ -0,0 +1,85 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of worker nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.slurm
+
+  cluster: {{slurm_cluster}}
+  partition: {{slurm_partition}}
+
+  ssh:
+    hostname: {{ssh_hostname}}
+    port: {{ssh_port}}
+    user: {{ssh_user}}
+    private_key: {{slurm_private_key}}
+    {% if slurm_proxy_command is not none %}
+    proxycommand: {{slurm_proxy_command | tojson }}
+    {% endif %}
+
+auth:
+  ssh_user: {{ssh_user}}
+  # TODO(jwj): Modify this tmp workaround.
+  # ssh_private_key: {{ssh_private_key}}
+  ssh_private_key: {{slurm_private_key}}
+  ssh_proxy_command: {{slurm_proxy_command | tojson }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      # From clouds/slurm.py::Slurm.make_deploy_resources_variables.
+      instance_type: {{instance_type}}
+      disk_size: {{disk_size}}
+      cpus: {{cpus}}
+      memory: {{memory}}
+      accelerator_type: {{accelerator_type}}
+      accelerator_count: {{accelerator_count}}
+
+# TODO: more configs that are required by the provisioner to create new
+# instances on the FluffyCloud:
+# sky/provision/fluffycloud/instance.py::run_instances
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    {{ setup_sky_dirs_commands }}
+    {{ conda_installation_commands }}
+    {{ skypilot_wheel_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+
+head_node: {}
+worker_nodes: {}
+
+# These fields are required for external cloud providers.
+head_setup_commands: []
+worker_setup_commands: []
+cluster_synced_files: []
+file_mounts_sync_continuously: False
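
As with SkyPilot's other provisioning templates, this file is presumably rendered with Jinja2 before use. A generic illustration (variable values are made up; the real provisioner supplies the full set):

    import jinja2

    with open('sky/templates/slurm-ray.yml.j2') as f:
        template = jinja2.Template(f.read())

    # Variables used in loops or expressions must be supplied; the remaining
    # scalar fields simply render empty in this sketch.
    print(template.render(cluster_name_on_cloud='sky-abc123',
                          num_nodes=2,
                          slurm_cluster='main',
                          slurm_partition='gpu',
                          slurm_proxy_command=None,
                          credentials={},
                          initial_setup_commands=[]))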
sky/templates/vast-ray.yml.j2 CHANGED
@@ -10,6 +10,7 @@ provider:
   module: sky.provision.vast
   region: "{{region}}"
   disable_launch_config_check: true
+  secure_only: {{secure_only}}
 
 auth:
   ssh_user: root
sky/users/model.conf CHANGED
@@ -12,4 +12,4 @@ g = _, _
 e = some(where (p.eft == allow))
 
 [matchers]
-m = (g(r.sub, p.sub)|| p.sub == '*') && r.obj == p.obj && r.act == p.act
+m = (g(r.sub, p.sub)|| p.sub == '*') && keyMatch2(r.obj, p.obj) && r.act == p.act
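
The matcher change swaps exact object comparison for Casbin's keyMatch2, which understands `:param`-style wildcards in URL paths, so a single pattern policy can cover many concrete request paths (likely in support of the new plugin RBAC rules below). A quick check with pycasbin's built-in; the paths are illustrative only:

    from casbin import util

    # ':name' matches exactly one path segment in keyMatch2.
    print(util.key_match2('/workspaces/dev', '/workspaces/:name'))       # True
    print(util.key_match2('/workspaces/dev/jobs', '/workspaces/:name'))  # False
    print(util.key_match2('/users/alice', '/workspaces/:name'))          # False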
sky/users/permission.py CHANGED
@@ -69,6 +69,26 @@ class PermissionService:
                 'Enforcer should be initialized after _lazy_initialize()')
         return self.enforcer
 
+    def _get_plugin_rbac_rules(self):
+        """Get RBAC rules from loaded plugins.
+
+        Returns:
+            Dictionary of plugin RBAC rules, or empty dict if plugins module
+            is not available or no rules are defined.
+        """
+        try:
+            # pylint: disable=import-outside-toplevel
+            from sky.server import plugins as server_plugins
+            return server_plugins.get_plugin_rbac_rules()
+        except ImportError:
+            # Plugin module not available (e.g., not running as server)
+            logger.debug(
+                'Plugin module not available, skipping plugin RBAC rules')
+            return {}
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'Failed to get plugin RBAC rules: {e}')
+            return {}
+
     def _maybe_initialize_basic_auth_user(self) -> None:
         """Initialize basic auth user if it is enabled."""
         basic_auth = os.environ.get(constants.SKYPILOT_INITIAL_BASIC_AUTH)
@@ -101,9 +121,12 @@
         enforcer = self._ensure_enforcer()
         existing_policies = enforcer.get_policy()
 
+        # Get plugin RBAC rules dynamically
+        plugin_rules = self._get_plugin_rbac_rules()
+
         # If we already have policies for the expected roles, skip
         # initialization
-        role_permissions = rbac.get_role_permissions()
+        role_permissions = rbac.get_role_permissions(plugin_rules=plugin_rules)
         expected_policies = []
         for role, permissions in role_permissions.items():
             if permissions['permissions'] and 'blocklist' in permissions[