skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250430__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +2 -0
  4. sky/cli.py +90 -37
  5. sky/client/cli.py +90 -37
  6. sky/client/sdk.py +3 -2
  7. sky/clouds/cloud.py +5 -2
  8. sky/clouds/kubernetes.py +4 -4
  9. sky/clouds/nebius.py +16 -10
  10. sky/clouds/service_catalog/constants.py +1 -1
  11. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  12. sky/core.py +58 -29
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/favicon.ico +0 -0
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/optimizer.py +35 -11
  22. sky/provision/docker_utils.py +22 -16
  23. sky/provision/kubernetes/utils.py +26 -24
  24. sky/resources.py +1 -1
  25. sky/server/common.py +6 -3
  26. sky/server/config.py +184 -0
  27. sky/server/requests/executor.py +17 -156
  28. sky/server/server.py +4 -4
  29. sky/setup_files/dependencies.py +0 -1
  30. sky/setup_files/setup.py +1 -1
  31. sky/skylet/constants.py +18 -0
  32. sky/skypilot_config.py +32 -11
  33. sky/templates/aws-ray.yml.j2 +2 -1
  34. sky/templates/azure-ray.yml.j2 +2 -1
  35. sky/templates/cudo-ray.yml.j2 +1 -0
  36. sky/templates/do-ray.yml.j2 +3 -2
  37. sky/templates/fluidstack-ray.yml.j2 +1 -1
  38. sky/templates/gcp-ray.yml.j2 +1 -1
  39. sky/templates/ibm-ray.yml.j2 +3 -3
  40. sky/templates/kubernetes-ray.yml.j2 +26 -14
  41. sky/templates/lambda-ray.yml.j2 +1 -0
  42. sky/templates/nebius-ray.yml.j2 +64 -0
  43. sky/templates/oci-ray.yml.j2 +1 -1
  44. sky/templates/paperspace-ray.yml.j2 +1 -0
  45. sky/templates/runpod-ray.yml.j2 +1 -0
  46. sky/templates/scp-ray.yml.j2 +1 -0
  47. sky/templates/vast-ray.yml.j2 +1 -1
  48. sky/templates/vsphere-ray.yml.j2 +1 -0
  49. sky/utils/aws/__init__.py +0 -0
  50. sky/utils/aws/get_default_security_group.py +11 -0
  51. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/METADATA +3 -3
  52. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/RECORD +58 -55
  53. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/WHEEL +1 -1
  54. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -35,6 +35,7 @@ from sky.jobs.server import server as jobs_rest
35
35
  from sky.provision.kubernetes import utils as kubernetes_utils
36
36
  from sky.serve.server import server as serve_rest
37
37
  from sky.server import common
38
+ from sky.server import config as server_config
38
39
  from sky.server import constants as server_constants
39
40
  from sky.server import stream_utils
40
41
  from sky.server.requests import executor
@@ -1166,13 +1167,12 @@ if __name__ == '__main__':
1166
1167
  # that it is shown only when the API server is started.
1167
1168
  usage_lib.maybe_show_privacy_policy()
1168
1169
 
1169
- num_workers = 1
1170
- if cmd_args.deploy:
1171
- num_workers = common_utils.get_cpu_count()
1170
+ config = server_config.compute_server_config(cmd_args.deploy)
1171
+ num_workers = config.num_server_workers
1172
1172
 
1173
1173
  sub_procs = []
1174
1174
  try:
1175
- sub_procs = executor.start(deploy=cmd_args.deploy)
1175
+ sub_procs = executor.start(config)
1176
1176
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1177
1177
  # We don't support reload for now, since it may cause leakage of request
1178
1178
  # workers or interrupt running requests.
@@ -53,7 +53,6 @@ install_requires = [
53
53
  'aiofiles',
54
54
  'httpx',
55
55
  'setproctitle',
56
- 'omegaconf>=2.4.0dev3,<2.5',
57
56
  ]
58
57
 
59
58
  local_ray = [
sky/setup_files/setup.py CHANGED
@@ -161,7 +161,7 @@ setuptools.setup(
161
161
  author='SkyPilot Team',
162
162
  license='Apache 2.0',
163
163
  readme='README.md',
164
- description='SkyPilot: An intercloud broker for the clouds',
164
+ description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
165
  long_description=long_description,
166
166
  long_description_content_type='text/markdown',
167
167
  setup_requires=['wheel'],
sky/skylet/constants.py CHANGED
@@ -280,6 +280,24 @@ USER_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER_ID'
280
280
  # runs on a VM launched by SkyPilot will be recognized as the same user.
281
281
  USER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER'
282
282
 
283
+ # SSH configuration to allow more concurrent sessions and connections.
284
+ # Default MaxSessions is 10.
285
+ # Default MaxStartups is 10:30:60, meaning:
286
+ # - Up to 10 unauthenticated connections are allowed without restriction.
287
+ # - From 11 to 60 connections, 30% are randomly dropped.
288
+ # - Above 60 connections, all are dropped.
289
+ # These defaults are too low for submitting many parallel jobs (e.g., 150),
290
+ # which can easily exceed the limits and cause connection failures.
291
+ # The new values (MaxSessions 200, MaxStartups 150:30:200) increase these
292
+ # limits significantly.
293
+ # TODO(zeping): Bake this configuration in SkyPilot default images.
294
+ SET_SSH_MAX_SESSIONS_CONFIG_CMD = (
295
+ 'sudo bash -c \''
296
+ 'echo "MaxSessions 200" >> /etc/ssh/sshd_config; '
297
+ 'echo "MaxStartups 150:30:200" >> /etc/ssh/sshd_config; '
298
+ '(systemctl reload sshd || service ssh reload); '
299
+ '\'')
300
+
283
301
  # Internal: Env var indicating the system is running with a remote API server.
284
302
  # It is used for internal purposes, including the jobs controller to mark
285
303
  # clusters as launched with a remote API server.
sky/skypilot_config.py CHANGED
@@ -56,8 +56,6 @@ import threading
56
56
  import typing
57
57
  from typing import Any, Dict, Iterator, List, Optional, Tuple
58
58
 
59
- from omegaconf import OmegaConf
60
-
61
59
  from sky import exceptions
62
60
  from sky import sky_logging
63
61
  from sky.adaptors import common as adaptors_common
@@ -141,7 +139,7 @@ def get_user_config() -> config_utils.Config:
141
139
 
142
140
  # load the user config file
143
141
  if os.path.exists(user_config_path):
144
- user_config = _parse_config_file(user_config_path)
142
+ user_config = parse_config_file(user_config_path)
145
143
  _validate_config(user_config, user_config_path)
146
144
  else:
147
145
  user_config = config_utils.Config()
@@ -170,7 +168,7 @@ def _get_project_config() -> config_utils.Config:
170
168
 
171
169
  # load the project config file
172
170
  if os.path.exists(project_config_path):
173
- project_config = _parse_config_file(project_config_path)
171
+ project_config = parse_config_file(project_config_path)
174
172
  _validate_config(project_config, project_config_path)
175
173
  else:
176
174
  project_config = config_utils.Config()
@@ -199,7 +197,7 @@ def get_server_config() -> config_utils.Config:
199
197
 
200
198
  # load the server config file
201
199
  if os.path.exists(server_config_path):
202
- server_config = _parse_config_file(server_config_path)
200
+ server_config = parse_config_file(server_config_path)
203
201
  _validate_config(server_config, server_config_path)
204
202
  else:
205
203
  server_config = config_utils.Config()
@@ -304,7 +302,7 @@ def _reload_config() -> None:
304
302
  _reload_config_as_client()
305
303
 
306
304
 
307
- def _parse_config_file(config_path: str) -> config_utils.Config:
305
+ def parse_config_file(config_path: str) -> config_utils.Config:
308
306
  config = config_utils.Config()
309
307
  try:
310
308
  config_dict = common_utils.read_yaml(config_path)
@@ -321,6 +319,31 @@ def _parse_config_file(config_path: str) -> config_utils.Config:
321
319
  return config
322
320
 
323
321
 
322
+ def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
323
+ """Parse a list of 'key=value' overrides into a Config object.
324
+
325
+ Args:
326
+ dotlist: A list of 'key=value' strings; keys may be dot-separated to address nested fields.
327
+
328
+ Returns:
329
+ A config_utils.Config object with the parsed key-value pairs.
330
+ """
331
+ config: config_utils.Config = config_utils.Config()
332
+ for arg in dotlist:
333
+ try:
334
+ key, value = arg.split('=', 1)
335
+ except ValueError as e:
336
+ raise ValueError(f'Invalid config override: {arg}. '
337
+ 'Please use the format: key=value') from e
338
+ if len(key) == 0 or len(value) == 0:
339
+ raise ValueError(f'Invalid config override: {arg}. '
340
+ 'Please use the format: key=value')
341
+ value = yaml.safe_load(value)
342
+ nested_keys = tuple(key.split('.'))
343
+ config.set_nested(nested_keys, value)
344
+ return config
345
+
346
+
324
347
  def _reload_config_from_internal_file(internal_config_path: str) -> None:
325
348
  global _dict, _loaded_config_path
326
349
  # Reset the global variables, to avoid using stale values.
@@ -336,7 +359,7 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
336
359
  'exist. Please double check the path or unset the env var: '
337
360
  f'unset {ENV_VAR_SKYPILOT_CONFIG}')
338
361
  logger.debug(f'Using config path: {config_path}')
339
- _dict = _parse_config_file(config_path)
362
+ _dict = parse_config_file(config_path)
340
363
  _loaded_config_path = config_path
341
364
 
342
365
 
@@ -483,11 +506,9 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
483
506
  'Cannot use multiple --config flags with a config file.')
484
507
  config_source = maybe_config_path
485
508
  # cli_config is a path to a config file
486
- parsed_config = OmegaConf.to_object(
487
- OmegaConf.load(maybe_config_path))
509
+ parsed_config = parse_config_file(maybe_config_path)
488
510
  else: # cli_config is a comma-separated list of key-value pairs
489
- parsed_config = OmegaConf.to_object(
490
- OmegaConf.from_dotlist(cli_config))
511
+ parsed_config = _parse_dotlist(cli_config)
491
512
  _validate_config(parsed_config, config_source)
492
513
  except ValueError as e:
493
514
  raise ValueError(f'Invalid config override: {cli_config}. '
@@ -142,7 +142,7 @@ available_node_types:
142
142
  {%- endfor %}
143
143
  # Use IMDSv2
144
144
  MetadataOptions:
145
- HttpTokens: required
145
+ HttpTokens: required
146
146
 
147
147
  head_node_type: ray.head.default
148
148
 
@@ -192,6 +192,7 @@ setup_commands:
192
192
  {%- if remote_identity != 'LOCAL_CREDENTIALS' %}
193
193
  rm ~/.aws/credentials || true;
194
194
  {%- endif %}
195
+ {{ ssh_max_sessions_config }}
195
196
 
196
197
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
197
198
  # We do not need to list it here anymore.
@@ -68,7 +68,7 @@ available_node_types:
68
68
  imageOffer: {{image_offer}}
69
69
  imageSku: "{{image_sku}}"
70
70
  imageVersion: {{image_version}}
71
- # Community Gallery Image ID
71
+ # Community Gallery Image ID
72
72
  communityGalleryImageId: {{community_gallery_image_id}}
73
73
  osDiskSizeGB: {{disk_size}}
74
74
  osDiskTier: {{disk_tier}}
@@ -130,3 +130,4 @@ setup_commands:
130
130
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
131
131
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
132
132
  sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true;
133
+ {{ ssh_max_sessions_config }}
@@ -73,3 +73,4 @@ setup_commands:
73
73
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
74
74
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
75
75
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
76
+ {{ ssh_max_sessions_config }}
@@ -93,6 +93,7 @@ setup_commands:
93
93
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
94
94
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
95
95
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
96
-
97
- # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
96
+ {{ ssh_max_sessions_config }}
97
+
98
+ # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
98
99
  # We do not need to list it here anymore.
@@ -74,4 +74,4 @@ setup_commands:
74
74
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
75
75
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
76
76
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
77
-
77
+ {{ ssh_max_sessions_config }}
@@ -1,4 +1,3 @@
1
-
2
1
  cluster_name: {{cluster_name_on_cloud}}
3
2
 
4
3
  # The maximum number of worker nodes to launch in addition to the head node.
@@ -225,6 +224,7 @@ setup_commands:
225
224
  {%- endif %}
226
225
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
227
226
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
227
+ {{ ssh_max_sessions_config }}
228
228
 
229
229
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
230
230
  # We do not need to list it here anymore.
@@ -32,7 +32,7 @@ available_node_types:
32
32
  ray_head_default:
33
33
  resources: {{instance_resources}}
34
34
  node_config:
35
- image_id: {{image_id}}
35
+ image_id: {{image_id}}
36
36
  boot_volume_capacity: {{disk_capacity}}
37
37
  volume_tier_name: general-purpose
38
38
  instance_profile_name: {{instance_type}}
@@ -48,7 +48,7 @@ available_node_types:
48
48
  max_workers: {{num_nodes - 1}}
49
49
  resources: {{worker_instance_resources}}
50
50
  node_config:
51
- image_id: {{image_id}}
51
+ image_id: {{image_id}}
52
52
  boot_volume_capacity: {{disk_capacity}}
53
53
  volume_tier_name: general-purpose
54
54
  instance_profile_name: {{worker_instance_type}}
@@ -106,7 +106,7 @@ setup_commands:
106
106
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
107
107
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
108
108
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
109
-
109
+ {{ ssh_max_sessions_config }}
110
110
 
111
111
  # Command to start ray on the head node. You don't need to change this.
112
112
  # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
@@ -258,7 +258,7 @@ available_node_types:
258
258
  # service is required.
259
259
  labels:
260
260
  parent: skypilot
261
- # component will be set for the head node pod to be the same as the head node service selector above if a
261
+ # component will be set for the head node pod to be the same as the head node service selector above if a
262
262
  skypilot-cluster: {{cluster_name_on_cloud}}
263
263
  # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
264
264
  skypilot-ssh-jump: {{k8s_ssh_jump_name}}
@@ -277,11 +277,8 @@ available_node_types:
277
277
  restartPolicy: {{ "Always" if high_availability else "Never" }}
278
278
 
279
279
  # Add node selector if GPU/TPUs are requested:
280
- {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
280
+ {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
281
281
  nodeSelector:
282
- {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
283
- {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
284
- {% endif %}
285
282
  {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
286
283
  {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
287
284
  {% endif %}
@@ -289,6 +286,19 @@ available_node_types:
289
286
  {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
290
287
  {% endif %}
291
288
  {% endif %}
289
+ {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
290
+ affinity:
291
+ nodeAffinity:
292
+ requiredDuringSchedulingIgnoredDuringExecution:
293
+ nodeSelectorTerms:
294
+ - matchExpressions:
295
+ - key: {{k8s_acc_label_key}}
296
+ operator: In
297
+ values:
298
+ {% for label_value in k8s_acc_label_values %}
299
+ - {{label_value}}
300
+ {% endfor %}
301
+ {% endif %}
292
302
 
293
303
  {% if k8s_spot_label_key is not none %}
294
304
  tolerations:
@@ -339,15 +349,15 @@ available_node_types:
339
349
  # Do not change this command - it keeps the pod alive until it is
340
350
  # explicitly killed.
341
351
  command: ["/bin/bash", "-c", "--"]
342
- args:
352
+ args:
343
353
  - |
344
354
  # For backwards compatibility, we put a marker file in the pod
345
- # to indicate that the pod is running with the changes introduced
355
+ # to indicate that the pod is running with the changes introduced
346
356
  # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
347
357
  # TODO: Remove this marker file and its usage in setup_commands
348
358
  # after v0.10.0 release.
349
359
  touch /tmp/skypilot_is_nimbus
350
-
360
+
351
361
  # Helper function to conditionally use sudo
352
362
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
353
363
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -390,7 +400,7 @@ available_node_types:
390
400
  fi;
391
401
  # SSH and other packages are not necessary, so we disable set -e
392
402
  set +e
393
-
403
+
394
404
  if [ ! -z "$MISSING_PACKAGES" ]; then
395
405
  # Install missing packages individually so that one failed installation does not break the whole install process,
396
406
  # e.g. fuse3 is not available on some distributions.
@@ -443,7 +453,7 @@ available_node_types:
443
453
  $(prefix_cmd) rm -f /bin/fusermount-wrapper
444
454
  $(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
445
455
  fi
446
- {% endif %}
456
+ {% endif %}
447
457
 
448
458
  $(prefix_cmd) mkdir -p /var/run/sshd;
449
459
  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
@@ -574,7 +584,7 @@ available_node_types:
574
584
  # File is already being monitored
575
585
  continue
576
586
  fi
577
-
587
+
578
588
  # Monitor the new file
579
589
  monitor_file $file &
580
590
  already_monitored="${already_monitored} ${file}"
@@ -651,7 +661,7 @@ available_node_types:
651
661
  {{k8s_resource_key}}: {{accelerator_count}}
652
662
  {% endif %}
653
663
  {% endif %}
654
-
664
+
655
665
  {% if high_availability %}
656
666
  pvc_spec:
657
667
  apiVersion: v1
@@ -737,6 +747,7 @@ available_node_types:
737
747
  mountPath: /mnt/home # Temporary mount point for initialization
738
748
  # should be replaced by pod spec
739
749
  {% endif %}
750
+
740
751
  setup_commands:
741
752
  # Disable `unattended-upgrades` to prevent apt-get from hanging. This should run at the very beginning, before the process starts, to avoid being blocked. (This is a temporary fix.)
742
753
  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
@@ -756,7 +767,7 @@ setup_commands:
756
767
  echo "=== Logs for asynchronous ray and skypilot installation ===";
757
768
  if [ -f /tmp/skypilot_is_nimbus ]; then
758
769
  echo "=== Logs for asynchronous ray and skypilot installation ===";
759
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
770
+ [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
760
771
  { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
761
772
  [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
762
773
  fi
@@ -786,9 +797,10 @@ setup_commands:
786
797
  # properly written.
787
798
  # TODO(Doyoung): Investigate to see why TPU workload fails to run without
788
799
  # execution permission, such as granting 766 to log file. Check if it's a
789
- # must and see if there's a workaround to grant minimum permission.
800
+ # must and see if there's a workaround to grant minimum permission.
790
801
  sudo chmod 777 /tmp/tpu_logs;
791
802
  {% endif %}
803
+ {{ ssh_max_sessions_config }}
792
804
 
793
805
  # Format: `REMOTE_PATH : LOCAL_PATH`
794
806
  file_mounts: {
@@ -96,6 +96,7 @@ setup_commands:
96
96
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
97
97
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
98
98
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
99
+ {{ ssh_max_sessions_config }}
99
100
 
100
101
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
101
102
  # We do not need to list it here anymore.
@@ -10,6 +10,27 @@ provider:
10
10
  module: sky.provision.nebius
11
11
  region: "{{region}}"
12
12
 
13
+ {%- if docker_image is not none %}
14
+ docker:
15
+ image: {{docker_image}}
16
+ container_name: {{docker_container_name}}
17
+ run_options:
18
+ - --ulimit nofile=1048576:1048576
19
+ {%- for run_option in docker_run_options %}
20
+ - {{run_option}}
21
+ {%- endfor %}
22
+ {%- if docker_login_config is not none %}
23
+ docker_login_config:
24
+ username: |-
25
+ {{docker_login_config.username}}
26
+ password: |-
27
+ {{docker_login_config.password}}
28
+ server: |-
29
+ {{docker_login_config.server}}
30
+ {%- endif %}
31
+ {%- endif %}
32
+
33
+
13
34
  auth:
14
35
  ssh_user: ubuntu
15
36
  ssh_private_key: {{ssh_private_key}}
@@ -22,6 +43,48 @@ available_node_types:
22
43
  ImageId: {{image_id}}
23
44
  DiskSize: {{disk_size}}
24
45
  UserData: |
46
+ {%- if docker_image is not none %}
47
+ runcmd:
48
+ - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
49
+ - systemctl restart sshd
50
+ {%- endif %}
51
+
52
+ {# Two available OS images:
53
+ 1. ubuntu22.04-driverless - requires Docker installation
54
+ 2. ubuntu22.04-cuda12 - comes with Docker pre-installed
55
+ To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
56
+ {%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
57
+ apt:
58
+ sources:
59
+ docker.list:
60
+ source: deb [arch=amd64] https://download.docker.com/linux/ubuntu $RELEASE stable
61
+ keyid: 9DC858229FC7DD38854AE2D88D81803C0EBFCD88
62
+
63
+ packages:
64
+ - apt-transport-https
65
+ - ca-certificates
66
+ - curl
67
+ - gnupg-agent
68
+ - software-properties-common
69
+ - docker-ce
70
+ - docker-ce-cli
71
+ - containerd.io
72
+
73
+ # Enable ipv4 forwarding, required on CIS hardened machines
74
+ write_files:
75
+ - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
76
+ content: |
77
+ net.ipv4.conf.all.forwarding=1
78
+
79
+ # create the docker group
80
+ groups:
81
+ - docker
82
+
83
+ # Add default auto created user to docker group
84
+ system_info:
85
+ default_user:
86
+ groups: [docker]
87
+ {%- endif %}
25
88
  users:
26
89
  - name: skypilot:ssh_user
27
90
  shell: /bin/bash
@@ -77,3 +140,4 @@ setup_commands:
77
140
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
78
141
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
79
142
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
143
+ {{ ssh_max_sessions_config }}
@@ -91,7 +91,7 @@ setup_commands:
91
91
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
92
92
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
93
93
  sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
94
+ {{ ssh_max_sessions_config }}
94
95
 
95
96
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
96
97
  # We do not need to list it here anymore.
97
-
@@ -91,3 +91,4 @@ setup_commands:
91
91
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
92
92
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
93
93
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
94
+ {{ ssh_max_sessions_config }}
@@ -90,6 +90,7 @@ setup_commands:
90
90
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
91
91
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
92
92
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
93
+ {{ ssh_max_sessions_config }}
93
94
 
94
95
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
95
96
  # We do not need to list it here anymore.
@@ -77,6 +77,7 @@ setup_commands:
77
77
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
78
78
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
79
79
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
80
+ {{ ssh_max_sessions_config }}
80
81
 
81
82
  # Command to start ray on the head node. You don't need to change this.
82
83
  # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
@@ -64,7 +64,7 @@ setup_commands:
64
64
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
65
65
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
66
66
  (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
67
-
67
+ {{ ssh_max_sessions_config }}
68
68
 
69
69
  # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
70
70
  # We do not need to list it here anymore.
@@ -71,3 +71,4 @@ setup_commands:
71
71
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
72
72
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
73
73
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
74
+ {{ ssh_max_sessions_config }}
File without changes
@@ -0,0 +1,11 @@
1
+ """Script to get the default security group"""
2
+ from sky.clouds import aws
3
+
4
+
5
+ def main():
6
+ default_security_group = aws.DEFAULT_SECURITY_GROUP_NAME
7
+ print(f'{default_security_group}')
8
+
9
+
10
+ if __name__ == '__main__':
11
+ main()
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250428
4
- Summary: SkyPilot: An intercloud broker for the clouds
3
+ Version: 1.0.0.dev20250430
4
+ Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
7
7
  Project-URL: Homepage, https://github.com/skypilot-org/skypilot
@@ -47,7 +47,6 @@ Requires-Dist: python-multipart
47
47
  Requires-Dist: aiofiles
48
48
  Requires-Dist: httpx
49
49
  Requires-Dist: setproctitle
50
- Requires-Dist: omegaconf<2.5,>=2.4.0dev3
51
50
  Provides-Extra: aws
52
51
  Requires-Dist: urllib3<2; extra == "aws"
53
52
  Requires-Dist: awscli>=1.27.10; extra == "aws"
@@ -204,6 +203,7 @@ Dynamic: summary
204
203
 
205
204
  ----
206
205
  :fire: *News* :fire:
206
+ - [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
207
207
  - [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
208
208
  - [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
209
209
  - [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)