skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250430__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +2 -0
- sky/cli.py +90 -37
- sky/client/cli.py +90 -37
- sky/client/sdk.py +3 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/kubernetes.py +4 -4
- sky/clouds/nebius.py +16 -10
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/optimizer.py +35 -11
- sky/provision/docker_utils.py +22 -16
- sky/provision/kubernetes/utils.py +26 -24
- sky/resources.py +1 -1
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/setup_files/setup.py +1 -1
- sky/skylet/constants.py +18 -0
- sky/skypilot_config.py +32 -11
- sky/templates/aws-ray.yml.j2 +2 -1
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +1 -1
- sky/templates/ibm-ray.yml.j2 +3 -3
- sky/templates/kubernetes-ray.yml.j2 +26 -14
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +64 -0
- sky/templates/oci-ray.yml.j2 +1 -1
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/vast-ray.yml.j2 +1 -1
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/utils/aws/__init__.py +0 -0
- sky/utils/aws/get_default_security_group.py +11 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/METADATA +3 -3
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/RECORD +58 -55
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
@@ -35,6 +35,7 @@ from sky.jobs.server import server as jobs_rest
|
|
35
35
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
36
36
|
from sky.serve.server import server as serve_rest
|
37
37
|
from sky.server import common
|
38
|
+
from sky.server import config as server_config
|
38
39
|
from sky.server import constants as server_constants
|
39
40
|
from sky.server import stream_utils
|
40
41
|
from sky.server.requests import executor
|
@@ -1166,13 +1167,12 @@ if __name__ == '__main__':
|
|
1166
1167
|
# that it is shown only when the API server is started.
|
1167
1168
|
usage_lib.maybe_show_privacy_policy()
|
1168
1169
|
|
1169
|
-
|
1170
|
-
|
1171
|
-
num_workers = common_utils.get_cpu_count()
|
1170
|
+
config = server_config.compute_server_config(cmd_args.deploy)
|
1171
|
+
num_workers = config.num_server_workers
|
1172
1172
|
|
1173
1173
|
sub_procs = []
|
1174
1174
|
try:
|
1175
|
-
sub_procs = executor.start(
|
1175
|
+
sub_procs = executor.start(config)
|
1176
1176
|
logger.info(f'Starting SkyPilot API server, workers={num_workers}')
|
1177
1177
|
# We don't support reload for now, since it may cause leakage of request
|
1178
1178
|
# workers or interrupt running requests.
|
sky/setup_files/dependencies.py
CHANGED
sky/setup_files/setup.py
CHANGED
@@ -161,7 +161,7 @@ setuptools.setup(
|
|
161
161
|
author='SkyPilot Team',
|
162
162
|
license='Apache 2.0',
|
163
163
|
readme='README.md',
|
164
|
-
description='SkyPilot:
|
164
|
+
description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
|
165
165
|
long_description=long_description,
|
166
166
|
long_description_content_type='text/markdown',
|
167
167
|
setup_requires=['wheel'],
|
sky/skylet/constants.py
CHANGED
@@ -280,6 +280,24 @@ USER_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER_ID'
|
|
280
280
|
# runs on a VM launched by SkyPilot will be recognized as the same user.
|
281
281
|
USER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER'
|
282
282
|
|
283
|
+
# SSH configuration to allow more concurrent sessions and connections.
|
284
|
+
# Default MaxSessions is 10.
|
285
|
+
# Default MaxStartups is 10:30:60, meaning:
|
286
|
+
# - Up to 10 unauthenticated connections are allowed without restriction.
|
287
|
+
# - From 11 to 60 connections, 30% are randomly dropped.
|
288
|
+
# - Above 60 connections, all are dropped.
|
289
|
+
# These defaults are too low for submitting many parallel jobs (e.g., 150),
|
290
|
+
# which can easily exceed the limits and cause connection failures.
|
291
|
+
# The new values (MaxSessions 200, MaxStartups 150:30:200) increase these
|
292
|
+
# limits significantly.
|
293
|
+
# TODO(zeping): Bake this configuration in SkyPilot default images.
|
294
|
+
SET_SSH_MAX_SESSIONS_CONFIG_CMD = (
|
295
|
+
'sudo bash -c \''
|
296
|
+
'echo "MaxSessions 200" >> /etc/ssh/sshd_config; '
|
297
|
+
'echo "MaxStartups 150:30:200" >> /etc/ssh/sshd_config; '
|
298
|
+
'(systemctl reload sshd || service ssh reload); '
|
299
|
+
'\'')
|
300
|
+
|
283
301
|
# Internal: Env var indicating the system is running with a remote API server.
|
284
302
|
# It is used for internal purposes, including the jobs controller to mark
|
285
303
|
# clusters as launched with a remote API server.
|
sky/skypilot_config.py
CHANGED
@@ -56,8 +56,6 @@ import threading
|
|
56
56
|
import typing
|
57
57
|
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
58
58
|
|
59
|
-
from omegaconf import OmegaConf
|
60
|
-
|
61
59
|
from sky import exceptions
|
62
60
|
from sky import sky_logging
|
63
61
|
from sky.adaptors import common as adaptors_common
|
@@ -141,7 +139,7 @@ def get_user_config() -> config_utils.Config:
|
|
141
139
|
|
142
140
|
# load the user config file
|
143
141
|
if os.path.exists(user_config_path):
|
144
|
-
user_config =
|
142
|
+
user_config = parse_config_file(user_config_path)
|
145
143
|
_validate_config(user_config, user_config_path)
|
146
144
|
else:
|
147
145
|
user_config = config_utils.Config()
|
@@ -170,7 +168,7 @@ def _get_project_config() -> config_utils.Config:
|
|
170
168
|
|
171
169
|
# load the project config file
|
172
170
|
if os.path.exists(project_config_path):
|
173
|
-
project_config =
|
171
|
+
project_config = parse_config_file(project_config_path)
|
174
172
|
_validate_config(project_config, project_config_path)
|
175
173
|
else:
|
176
174
|
project_config = config_utils.Config()
|
@@ -199,7 +197,7 @@ def get_server_config() -> config_utils.Config:
|
|
199
197
|
|
200
198
|
# load the server config file
|
201
199
|
if os.path.exists(server_config_path):
|
202
|
-
server_config =
|
200
|
+
server_config = parse_config_file(server_config_path)
|
203
201
|
_validate_config(server_config, server_config_path)
|
204
202
|
else:
|
205
203
|
server_config = config_utils.Config()
|
@@ -304,7 +302,7 @@ def _reload_config() -> None:
|
|
304
302
|
_reload_config_as_client()
|
305
303
|
|
306
304
|
|
307
|
-
def
|
305
|
+
def parse_config_file(config_path: str) -> config_utils.Config:
|
308
306
|
config = config_utils.Config()
|
309
307
|
try:
|
310
308
|
config_dict = common_utils.read_yaml(config_path)
|
@@ -321,6 +319,31 @@ def _parse_config_file(config_path: str) -> config_utils.Config:
|
|
321
319
|
return config
|
322
320
|
|
323
321
|
|
322
|
+
def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
|
323
|
+
"""Parse a comma-separated list of key-value pairs into a dictionary.
|
324
|
+
|
325
|
+
Args:
|
326
|
+
dotlist: A comma-separated list of key-value pairs.
|
327
|
+
|
328
|
+
Returns:
|
329
|
+
A config_utils.Config object with the parsed key-value pairs.
|
330
|
+
"""
|
331
|
+
config: config_utils.Config = config_utils.Config()
|
332
|
+
for arg in dotlist:
|
333
|
+
try:
|
334
|
+
key, value = arg.split('=', 1)
|
335
|
+
except ValueError as e:
|
336
|
+
raise ValueError(f'Invalid config override: {arg}. '
|
337
|
+
'Please use the format: key=value') from e
|
338
|
+
if len(key) == 0 or len(value) == 0:
|
339
|
+
raise ValueError(f'Invalid config override: {arg}. '
|
340
|
+
'Please use the format: key=value')
|
341
|
+
value = yaml.safe_load(value)
|
342
|
+
nested_keys = tuple(key.split('.'))
|
343
|
+
config.set_nested(nested_keys, value)
|
344
|
+
return config
|
345
|
+
|
346
|
+
|
324
347
|
def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
325
348
|
global _dict, _loaded_config_path
|
326
349
|
# Reset the global variables, to avoid using stale values.
|
@@ -336,7 +359,7 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
336
359
|
'exist. Please double check the path or unset the env var: '
|
337
360
|
f'unset {ENV_VAR_SKYPILOT_CONFIG}')
|
338
361
|
logger.debug(f'Using config path: {config_path}')
|
339
|
-
_dict =
|
362
|
+
_dict = parse_config_file(config_path)
|
340
363
|
_loaded_config_path = config_path
|
341
364
|
|
342
365
|
|
@@ -483,11 +506,9 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
|
|
483
506
|
'Cannot use multiple --config flags with a config file.')
|
484
507
|
config_source = maybe_config_path
|
485
508
|
# cli_config is a path to a config file
|
486
|
-
parsed_config =
|
487
|
-
OmegaConf.load(maybe_config_path))
|
509
|
+
parsed_config = parse_config_file(maybe_config_path)
|
488
510
|
else: # cli_config is a comma-separated list of key-value pairs
|
489
|
-
parsed_config =
|
490
|
-
OmegaConf.from_dotlist(cli_config))
|
511
|
+
parsed_config = _parse_dotlist(cli_config)
|
491
512
|
_validate_config(parsed_config, config_source)
|
492
513
|
except ValueError as e:
|
493
514
|
raise ValueError(f'Invalid config override: {cli_config}. '
|
sky/templates/aws-ray.yml.j2
CHANGED
@@ -142,7 +142,7 @@ available_node_types:
|
|
142
142
|
{%- endfor %}
|
143
143
|
# Use IDMSv2
|
144
144
|
MetadataOptions:
|
145
|
-
HttpTokens: required
|
145
|
+
HttpTokens: required
|
146
146
|
|
147
147
|
head_node_type: ray.head.default
|
148
148
|
|
@@ -192,6 +192,7 @@ setup_commands:
|
|
192
192
|
{%- if remote_identity != 'LOCAL_CREDENTIALS' %}
|
193
193
|
rm ~/.aws/credentials || true;
|
194
194
|
{%- endif %}
|
195
|
+
{{ ssh_max_sessions_config }}
|
195
196
|
|
196
197
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
197
198
|
# We do not need to list it here anymore.
|
sky/templates/azure-ray.yml.j2
CHANGED
@@ -68,7 +68,7 @@ available_node_types:
|
|
68
68
|
imageOffer: {{image_offer}}
|
69
69
|
imageSku: "{{image_sku}}"
|
70
70
|
imageVersion: {{image_version}}
|
71
|
-
# Community Gallery Image ID
|
71
|
+
# Community Gallery Image ID
|
72
72
|
communityGalleryImageId: {{community_gallery_image_id}}
|
73
73
|
osDiskSizeGB: {{disk_size}}
|
74
74
|
osDiskTier: {{disk_tier}}
|
@@ -130,3 +130,4 @@ setup_commands:
|
|
130
130
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
131
131
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
132
132
|
sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true;
|
133
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/cudo-ray.yml.j2
CHANGED
@@ -73,3 +73,4 @@ setup_commands:
|
|
73
73
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
74
74
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
75
75
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
76
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/do-ray.yml.j2
CHANGED
@@ -93,6 +93,7 @@ setup_commands:
|
|
93
93
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
94
94
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
95
95
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
96
|
-
|
97
|
-
|
96
|
+
{{ ssh_max_sessions_config }}
|
97
|
+
|
98
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
98
99
|
# We do not need to list it here anymore.
|
@@ -74,4 +74,4 @@ setup_commands:
|
|
74
74
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
75
75
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
76
76
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
77
|
-
|
77
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/gcp-ray.yml.j2
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
cluster_name: {{cluster_name_on_cloud}}
|
3
2
|
|
4
3
|
# The maximum number of workers nodes to launch in addition to the head node.
|
@@ -225,6 +224,7 @@ setup_commands:
|
|
225
224
|
{%- endif %}
|
226
225
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
227
226
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
227
|
+
{{ ssh_max_sessions_config }}
|
228
228
|
|
229
229
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
230
230
|
# We do not need to list it here anymore.
|
sky/templates/ibm-ray.yml.j2
CHANGED
@@ -32,7 +32,7 @@ available_node_types:
|
|
32
32
|
ray_head_default:
|
33
33
|
resources: {{instance_resources}}
|
34
34
|
node_config:
|
35
|
-
image_id: {{image_id}}
|
35
|
+
image_id: {{image_id}}
|
36
36
|
boot_volume_capacity: {{disk_capacity}}
|
37
37
|
volume_tier_name: general-purpose
|
38
38
|
instance_profile_name: {{instance_type}}
|
@@ -48,7 +48,7 @@ available_node_types:
|
|
48
48
|
max_workers: {{num_nodes - 1}}
|
49
49
|
resources: {{worker_instance_resources}}
|
50
50
|
node_config:
|
51
|
-
image_id: {{image_id}}
|
51
|
+
image_id: {{image_id}}
|
52
52
|
boot_volume_capacity: {{disk_capacity}}
|
53
53
|
volume_tier_name: general-purpose
|
54
54
|
instance_profile_name: {{worker_instance_type}}
|
@@ -106,7 +106,7 @@ setup_commands:
|
|
106
106
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
107
107
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
108
108
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
109
|
-
|
109
|
+
{{ ssh_max_sessions_config }}
|
110
110
|
|
111
111
|
# Command to start ray on the head node. You don't need to change this.
|
112
112
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
@@ -258,7 +258,7 @@ available_node_types:
|
|
258
258
|
# service is required.
|
259
259
|
labels:
|
260
260
|
parent: skypilot
|
261
|
-
# component will be set for the head node pod to be the same as the head node service selector above if a
|
261
|
+
# component will be set for the head node pod to be the same as the head node service selector above if a
|
262
262
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
263
263
|
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
|
264
264
|
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
|
@@ -277,11 +277,8 @@ available_node_types:
|
|
277
277
|
restartPolicy: {{ "Always" if high_availability else "Never" }}
|
278
278
|
|
279
279
|
# Add node selector if GPU/TPUs are requested:
|
280
|
-
{% if (
|
280
|
+
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
|
281
281
|
nodeSelector:
|
282
|
-
{% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
|
283
|
-
{{k8s_acc_label_key}}: {{k8s_acc_label_value}}
|
284
|
-
{% endif %}
|
285
282
|
{% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
|
286
283
|
{{k8s_topology_label_key}}: {{k8s_topology_label_value}}
|
287
284
|
{% endif %}
|
@@ -289,6 +286,19 @@ available_node_types:
|
|
289
286
|
{{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
|
290
287
|
{% endif %}
|
291
288
|
{% endif %}
|
289
|
+
{% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
|
290
|
+
affinity:
|
291
|
+
nodeAffinity:
|
292
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
293
|
+
nodeSelectorTerms:
|
294
|
+
- matchExpressions:
|
295
|
+
- key: {{k8s_acc_label_key}}
|
296
|
+
operator: In
|
297
|
+
values:
|
298
|
+
{% for label_value in k8s_acc_label_values %}
|
299
|
+
- {{label_value}}
|
300
|
+
{% endfor %}
|
301
|
+
{% endif %}
|
292
302
|
|
293
303
|
{% if k8s_spot_label_key is not none %}
|
294
304
|
tolerations:
|
@@ -339,15 +349,15 @@ available_node_types:
|
|
339
349
|
# Do not change this command - it keeps the pod alive until it is
|
340
350
|
# explicitly killed.
|
341
351
|
command: ["/bin/bash", "-c", "--"]
|
342
|
-
args:
|
352
|
+
args:
|
343
353
|
- |
|
344
354
|
# For backwards compatibility, we put a marker file in the pod
|
345
|
-
# to indicate that the pod is running with the changes introduced
|
355
|
+
# to indicate that the pod is running with the changes introduced
|
346
356
|
# in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
|
347
357
|
# TODO: Remove this marker file and it's usage in setup_commands
|
348
358
|
# after v0.10.0 release.
|
349
359
|
touch /tmp/skypilot_is_nimbus
|
350
|
-
|
360
|
+
|
351
361
|
# Helper function to conditionally use sudo
|
352
362
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
353
363
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
@@ -390,7 +400,7 @@ available_node_types:
|
|
390
400
|
fi;
|
391
401
|
# SSH and other packages are not necessary, so we disable set -e
|
392
402
|
set +e
|
393
|
-
|
403
|
+
|
394
404
|
if [ ! -z "$MISSING_PACKAGES" ]; then
|
395
405
|
# Install missing packages individually to avoid failure installation breaks the whole install process,
|
396
406
|
# e.g. fuse3 is not available on some distributions.
|
@@ -443,7 +453,7 @@ available_node_types:
|
|
443
453
|
$(prefix_cmd) rm -f /bin/fusermount-wrapper
|
444
454
|
$(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
|
445
455
|
fi
|
446
|
-
{% endif %}
|
456
|
+
{% endif %}
|
447
457
|
|
448
458
|
$(prefix_cmd) mkdir -p /var/run/sshd;
|
449
459
|
$(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
|
@@ -574,7 +584,7 @@ available_node_types:
|
|
574
584
|
# File is already being monitored
|
575
585
|
continue
|
576
586
|
fi
|
577
|
-
|
587
|
+
|
578
588
|
# Monitor the new file
|
579
589
|
monitor_file $file &
|
580
590
|
already_monitored="${already_monitored} ${file}"
|
@@ -651,7 +661,7 @@ available_node_types:
|
|
651
661
|
{{k8s_resource_key}}: {{accelerator_count}}
|
652
662
|
{% endif %}
|
653
663
|
{% endif %}
|
654
|
-
|
664
|
+
|
655
665
|
{% if high_availability %}
|
656
666
|
pvc_spec:
|
657
667
|
apiVersion: v1
|
@@ -737,6 +747,7 @@ available_node_types:
|
|
737
747
|
mountPath: /mnt/home # Temporary mount point for initialization
|
738
748
|
# should be replaced by pod spec
|
739
749
|
{% endif %}
|
750
|
+
|
740
751
|
setup_commands:
|
741
752
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
742
753
|
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
@@ -756,7 +767,7 @@ setup_commands:
|
|
756
767
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
757
768
|
if [ -f /tmp/skypilot_is_nimbus ]; then
|
758
769
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
759
|
-
[ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
|
770
|
+
[ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
|
760
771
|
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
761
772
|
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
762
773
|
fi
|
@@ -786,9 +797,10 @@ setup_commands:
|
|
786
797
|
# properly written.
|
787
798
|
# TODO(Doyoung): Investigate to see why TPU workload fails to run without
|
788
799
|
# execution permission, such as granting 766 to log file. Check if it's a
|
789
|
-
# must and see if there's a workaround to grant minimum permission.
|
800
|
+
# must and see if there's a workaround to grant minimum permission.
|
790
801
|
sudo chmod 777 /tmp/tpu_logs;
|
791
802
|
{% endif %}
|
803
|
+
{{ ssh_max_sessions_config }}
|
792
804
|
|
793
805
|
# Format: `REMOTE_PATH : LOCAL_PATH`
|
794
806
|
file_mounts: {
|
sky/templates/lambda-ray.yml.j2
CHANGED
@@ -96,6 +96,7 @@ setup_commands:
|
|
96
96
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
97
97
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
98
98
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
99
|
+
{{ ssh_max_sessions_config }}
|
99
100
|
|
100
101
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
101
102
|
# We do not need to list it here anymore.
|
sky/templates/nebius-ray.yml.j2
CHANGED
@@ -10,6 +10,27 @@ provider:
|
|
10
10
|
module: sky.provision.nebius
|
11
11
|
region: "{{region}}"
|
12
12
|
|
13
|
+
{%- if docker_image is not none %}
|
14
|
+
docker:
|
15
|
+
image: {{docker_image}}
|
16
|
+
container_name: {{docker_container_name}}
|
17
|
+
run_options:
|
18
|
+
- --ulimit nofile=1048576:1048576
|
19
|
+
{%- for run_option in docker_run_options %}
|
20
|
+
- {{run_option}}
|
21
|
+
{%- endfor %}
|
22
|
+
{%- if docker_login_config is not none %}
|
23
|
+
docker_login_config:
|
24
|
+
username: |-
|
25
|
+
{{docker_login_config.username}}
|
26
|
+
password: |-
|
27
|
+
{{docker_login_config.password}}
|
28
|
+
server: |-
|
29
|
+
{{docker_login_config.server}}
|
30
|
+
{%- endif %}
|
31
|
+
{%- endif %}
|
32
|
+
|
33
|
+
|
13
34
|
auth:
|
14
35
|
ssh_user: ubuntu
|
15
36
|
ssh_private_key: {{ssh_private_key}}
|
@@ -22,6 +43,48 @@ available_node_types:
|
|
22
43
|
ImageId: {{image_id}}
|
23
44
|
DiskSize: {{disk_size}}
|
24
45
|
UserData: |
|
46
|
+
{%- if docker_image is not none %}
|
47
|
+
runcmd:
|
48
|
+
- sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
|
49
|
+
- systemctl restart sshd
|
50
|
+
{%- endif %}
|
51
|
+
|
52
|
+
{# Two available OS images:
|
53
|
+
1. ubuntu22.04-driverless - requires Docker installation
|
54
|
+
2. ubuntu22.04-cuda12 - comes with Docker pre-installed
|
55
|
+
To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
|
56
|
+
{%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
|
57
|
+
apt:
|
58
|
+
sources:
|
59
|
+
docker.list:
|
60
|
+
source: deb [arch=amd64] https://download.docker.com/linux/ubuntu $RELEASE stable
|
61
|
+
keyid: 9DC858229FC7DD38854AE2D88D81803C0EBFCD88
|
62
|
+
|
63
|
+
packages:
|
64
|
+
- apt-transport-https
|
65
|
+
- ca-certificates
|
66
|
+
- curl
|
67
|
+
- gnupg-agent
|
68
|
+
- software-properties-common
|
69
|
+
- docker-ce
|
70
|
+
- docker-ce-cli
|
71
|
+
- containerd.io
|
72
|
+
|
73
|
+
# Enable ipv4 forwarding, required on CIS hardened machines
|
74
|
+
write_files:
|
75
|
+
- path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
|
76
|
+
content: |
|
77
|
+
net.ipv4.conf.all.forwarding=1
|
78
|
+
|
79
|
+
# create the docker group
|
80
|
+
groups:
|
81
|
+
- docker
|
82
|
+
|
83
|
+
# Add default auto created user to docker group
|
84
|
+
system_info:
|
85
|
+
default_user:
|
86
|
+
groups: [docker]
|
87
|
+
{%- endif %}
|
25
88
|
users:
|
26
89
|
- name: skypilot:ssh_user
|
27
90
|
shell: /bin/bash
|
@@ -77,3 +140,4 @@ setup_commands:
|
|
77
140
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
78
141
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
79
142
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
143
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/oci-ray.yml.j2
CHANGED
@@ -91,7 +91,7 @@ setup_commands:
|
|
91
91
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
92
92
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
93
93
|
sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
|
94
|
+
{{ ssh_max_sessions_config }}
|
94
95
|
|
95
96
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
96
97
|
# We do not need to list it here anymore.
|
97
|
-
|
@@ -91,3 +91,4 @@ setup_commands:
|
|
91
91
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
92
92
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
93
93
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
94
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/runpod-ray.yml.j2
CHANGED
@@ -90,6 +90,7 @@ setup_commands:
|
|
90
90
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
91
91
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
92
92
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
93
|
+
{{ ssh_max_sessions_config }}
|
93
94
|
|
94
95
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
95
96
|
# We do not need to list it here anymore.
|
sky/templates/scp-ray.yml.j2
CHANGED
@@ -77,6 +77,7 @@ setup_commands:
|
|
77
77
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
78
78
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
79
79
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
80
|
+
{{ ssh_max_sessions_config }}
|
80
81
|
|
81
82
|
# Command to start ray on the head node. You don't need to change this.
|
82
83
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
sky/templates/vast-ray.yml.j2
CHANGED
@@ -64,7 +64,7 @@ setup_commands:
|
|
64
64
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
65
65
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
66
66
|
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
67
|
-
|
67
|
+
{{ ssh_max_sessions_config }}
|
68
68
|
|
69
69
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
70
70
|
# We do not need to list it here anymore.
|
sky/templates/vsphere-ray.yml.j2
CHANGED
@@ -71,3 +71,4 @@ setup_commands:
|
|
71
71
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
72
72
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
73
73
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
74
|
+
{{ ssh_max_sessions_config }}
|
File without changes
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
4
|
-
Summary: SkyPilot:
|
3
|
+
Version: 1.0.0.dev20250430
|
4
|
+
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
7
7
|
Project-URL: Homepage, https://github.com/skypilot-org/skypilot
|
@@ -47,7 +47,6 @@ Requires-Dist: python-multipart
|
|
47
47
|
Requires-Dist: aiofiles
|
48
48
|
Requires-Dist: httpx
|
49
49
|
Requires-Dist: setproctitle
|
50
|
-
Requires-Dist: omegaconf<2.5,>=2.4.0dev3
|
51
50
|
Provides-Extra: aws
|
52
51
|
Requires-Dist: urllib3<2; extra == "aws"
|
53
52
|
Requires-Dist: awscli>=1.27.10; extra == "aws"
|
@@ -204,6 +203,7 @@ Dynamic: summary
|
|
204
203
|
|
205
204
|
----
|
206
205
|
:fire: *News* :fire:
|
206
|
+
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
|
207
207
|
- [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
|
208
208
|
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
|
209
209
|
- [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)
|