skypilot-nightly 1.0.0.dev20241120__py3-none-any.whl → 1.0.0.dev20241122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -15
  3. sky/backends/cloud_vm_ray_backend.py +21 -3
  4. sky/clouds/aws.py +1 -0
  5. sky/clouds/azure.py +1 -0
  6. sky/clouds/cloud.py +1 -0
  7. sky/clouds/cudo.py +1 -0
  8. sky/clouds/fluidstack.py +1 -0
  9. sky/clouds/gcp.py +1 -0
  10. sky/clouds/ibm.py +1 -0
  11. sky/clouds/kubernetes.py +45 -3
  12. sky/clouds/lambda_cloud.py +1 -0
  13. sky/clouds/oci.py +1 -0
  14. sky/clouds/paperspace.py +1 -0
  15. sky/clouds/runpod.py +1 -0
  16. sky/clouds/scp.py +1 -0
  17. sky/clouds/vsphere.py +1 -0
  18. sky/provision/instance_setup.py +80 -83
  19. sky/provision/kubernetes/instance.py +108 -76
  20. sky/provision/kubernetes/utils.py +2 -0
  21. sky/provision/oci/instance.py +4 -2
  22. sky/provision/provisioner.py +95 -19
  23. sky/resources.py +2 -1
  24. sky/skylet/constants.py +31 -21
  25. sky/templates/kubernetes-ray.yml.j2 +169 -39
  26. sky/utils/subprocess_utils.py +49 -4
  27. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/METADATA +65 -55
  28. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/RECORD +32 -32
  29. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/WHEEL +1 -1
  30. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/LICENSE +0 -0
  31. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/entry_points.txt +0 -0
  32. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '789a9ea6fc54104219ca20aa785ccf69e1d30294'
+_SKYPILOT_COMMIT_SHA = '204d979fedece9b7b789dcd2610d1ebdbc8d1fc5'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241120'
+__version__ = '1.0.0.dev20241122'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py CHANGED
@@ -683,7 +683,7 @@ def write_cluster_config(
             resources_utils.ClusterName(
                 cluster_name,
                 cluster_name_on_cloud,
-            ), region, zones, dryrun)
+            ), region, zones, num_nodes, dryrun)
     config_dict = {}
 
     specific_reservations = set(
@@ -844,7 +844,11 @@ write_cluster_config(
                     '{sky_wheel_hash}',
                     wheel_hash).replace('{cloud}',
                                         str(cloud).lower())),
-
+        'skypilot_wheel_installation_commands':
+            constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
+                '{sky_wheel_hash}',
+                wheel_hash).replace('{cloud}',
+                                    str(cloud).lower()),
         # Port of Ray (GCS server).
         # Ray's default port 6379 is conflicted with Redis.
         'ray_port': constants.SKY_REMOTE_RAY_PORT,
@@ -1191,18 +1195,18 @@ def ssh_credential_from_yaml(
 
 
 def parallel_data_transfer_to_nodes(
-    runners: List[command_runner.CommandRunner],
-    source: Optional[str],
-    target: str,
-    cmd: Optional[str],
-    run_rsync: bool,
-    *,
-    action_message: str,
-    # Advanced options.
-    log_path: str = os.devnull,
-    stream_logs: bool = False,
-    source_bashrc: bool = False,
-):
+        runners: List[command_runner.CommandRunner],
+        source: Optional[str],
+        target: str,
+        cmd: Optional[str],
+        run_rsync: bool,
+        *,
+        action_message: str,
+        # Advanced options.
+        log_path: str = os.devnull,
+        stream_logs: bool = False,
+        source_bashrc: bool = False,
+        num_threads: Optional[int] = None):
     """Runs a command on all nodes and optionally runs rsync from src->dst.
 
     Args:
@@ -1214,6 +1218,7 @@ def parallel_data_transfer_to_nodes(
         log_path: str; Path to the log file
         stream_logs: bool; Whether to stream logs to stdout
         source_bashrc: bool; Source bashrc before running the command.
+        num_threads: Optional[int]; Number of threads to use.
     """
     style = colorama.Style
 
@@ -1254,7 +1259,7 @@ def parallel_data_transfer_to_nodes(
     message = (f' {style.DIM}{action_message} (to {num_nodes} node{plural})'
               f': {origin_source} -> {target}{style.RESET_ALL}')
    logger.info(message)
-   subprocess_utils.run_in_parallel(_sync_node, runners)
+   subprocess_utils.run_in_parallel(_sync_node, runners, num_threads)
 
 
 def check_local_gpus() -> bool:
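
Note: the num_threads value added above is ultimately consumed by subprocess_utils.run_in_parallel, whose own diff (sky/utils/subprocess_utils.py, +49 -4) is not expanded in this listing. Based only on the call sites shown here, a minimal sketch of the expected shape — an illustration, not the released code — would be:

    from concurrent import futures
    from typing import Callable, Iterable, List, Optional

    def run_in_parallel(func: Callable,
                        args: Iterable,
                        num_threads: Optional[int] = None) -> List:
        """Runs func over args with a bounded thread pool (illustrative only)."""
        args = list(args)
        if not args:
            return []
        if num_threads is None:
            num_threads = min(len(args), 32)  # assumption: some default cap
        with futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
            return list(pool.map(func, args))

Callers that omit num_threads keep the previous behavior, so the change is backward compatible at the call sites shown above.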
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -269,6 +269,13 @@ class RayCodeGen:
        import time
        from typing import Dict, List, Optional, Tuple, Union
 
+       # Set the environment variables to avoid deduplicating logs and
+       # scheduler events. This should be set in driver code, since we are
+       # not using `ray job submit` anymore, and the environment variables
+       # from the ray cluster is not inherited.
+       os.environ['RAY_DEDUP_LOGS'] = '0'
+       os.environ['RAY_SCHEDULER_EVENTS'] = '0'
+
        import ray
        import ray.util as ray_util
 
@@ -1528,7 +1535,7 @@ class RetryingVmProvisioner(object):
                to_provision,
                resources_utils.ClusterName(
                    cluster_name, handle.cluster_name_on_cloud),
-               region, zones))
+               region, zones, num_nodes))
        config_dict['provision_record'] = provision_record
        config_dict['resources_vars'] = resources_vars
        config_dict['handle'] = handle
@@ -3086,9 +3093,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
        os.system(f'touch {log_path}')
+       num_threads = subprocess_utils.get_parallel_threads(
+           str(handle.launched_resources.cloud))
        with rich_utils.safe_status(
                ux_utils.spinner_message('Syncing workdir', log_path)):
-           subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
+           subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
+                                            num_threads)
        logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
 
    def _sync_file_mounts(
@@ -4416,6 +4426,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        start = time.time()
        runners = handle.get_command_runners()
        log_path = os.path.join(self.log_dir, 'file_mounts.log')
+       num_threads = subprocess_utils.get_max_workers_for_file_mounts(
+           file_mounts, str(handle.launched_resources.cloud))
 
        # Check the files and warn
        for dst, src in file_mounts.items():
@@ -4477,6 +4489,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                action_message='Syncing',
                log_path=log_path,
                stream_logs=False,
+               num_threads=num_threads,
            )
            continue
 
@@ -4513,6 +4526,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # Need to source bashrc, as the cloud specific CLI or SDK may
            # require PATH in bashrc.
            source_bashrc=True,
+           num_threads=num_threads,
        )
        # (2) Run the commands to create symlinks on all the nodes.
        symlink_command = ' && '.join(symlink_commands)
@@ -4531,7 +4545,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                'Failed to create symlinks. The target destination '
                f'may already exist. Log: {log_path}')
 
-       subprocess_utils.run_in_parallel(_symlink_node, runners)
+       subprocess_utils.run_in_parallel(_symlink_node, runners,
+                                        num_threads)
        end = time.time()
        logger.debug(f'File mount sync took {end - start} seconds.')
        logger.info(ux_utils.finishing_message('Files synced.', log_path))
@@ -4560,6 +4575,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            return
        start = time.time()
        runners = handle.get_command_runners()
+       num_threads = subprocess_utils.get_parallel_threads(
+           str(handle.launched_resources.cloud))
        log_path = os.path.join(self.log_dir, 'storage_mounts.log')
 
        plural = 's' if len(storage_mounts) > 1 else ''
@@ -4598,6 +4615,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                # Need to source bashrc, as the cloud specific CLI or SDK
                # may require PATH in bashrc.
                source_bashrc=True,
+               num_threads=num_threads,
            )
        except exceptions.CommandError as e:
            if e.returncode == exceptions.MOUNT_PATH_NON_EMPTY_CODE:
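
Note: the backend now passes the launched cloud's name (str(handle.launched_resources.cloud)) into subprocess_utils.get_parallel_threads and get_max_workers_for_file_mounts, so the degree of parallelism can differ per provider. The subprocess_utils.py changes are not expanded in this listing; the following is only a guess at the general shape, with the provider set and caps being assumptions for illustration:

    import multiprocessing
    from typing import Optional

    _DEFAULT_MAX_THREADS = 32                  # assumption: generic upper bound
    _LOW_PARALLELISM_CLOUDS = {'kubernetes'}   # assumption: illustrative set

    def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
        """Thread count for parallel per-node operations (sketch only)."""
        max_threads = _DEFAULT_MAX_THREADS
        if cloud_str is not None and cloud_str.lower() in _LOW_PARALLELISM_CLOUDS:
            # Assumption: API-server-backed providers get a smaller cap to avoid
            # flooding the control plane with concurrent exec/rsync sessions.
            max_threads = 4
        return max(min(multiprocessing.cpu_count(), max_threads), 1)

The intent suggested by the call sites is simply that a single global constant no longer fits every provider.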
sky/clouds/aws.py CHANGED
@@ -401,6 +401,7 @@ class AWS(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Any]:
         del dryrun  # unused
         assert zones is not None, (region, zones)
sky/clouds/azure.py CHANGED
@@ -302,6 +302,7 @@ class Azure(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Any]:
         assert zones is None, ('Azure does not support zones', zones)
 
sky/clouds/cloud.py CHANGED
@@ -283,6 +283,7 @@ class Cloud:
         cluster_name: resources_utils.ClusterName,
         region: 'Region',
         zones: Optional[List['Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to cloud-specific resource variables.
sky/clouds/cudo.py CHANGED
@@ -196,6 +196,7 @@ class Cudo(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         del zones, cluster_name  # unused
sky/clouds/fluidstack.py CHANGED
@@ -176,6 +176,7 @@ class Fluidstack(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: clouds.Region,
         zones: Optional[List[clouds.Zone]],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
 
sky/clouds/gcp.py CHANGED
@@ -417,6 +417,7 @@ class GCP(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         assert zones is not None, (region, zones)
 
sky/clouds/ibm.py CHANGED
@@ -170,6 +170,7 @@ class IBM(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to cloud-specific resource variables.
sky/clouds/kubernetes.py CHANGED
@@ -10,8 +10,10 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
 from sky.clouds import service_catalog
+from sky.provision import instance_setup
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils
 from sky.utils import schemas
@@ -311,12 +313,34 @@ class Kubernetes(clouds.Cloud):
         # we don't have a notion of disk size in Kubernetes.
         return 0
 
+    @staticmethod
+    def _calculate_provision_timeout(num_nodes: int) -> int:
+        """Calculate provision timeout based on number of nodes.
+
+        The timeout scales linearly with the number of nodes to account for
+        scheduling overhead, but is capped to avoid excessive waiting.
+
+        Args:
+            num_nodes: Number of nodes being provisioned
+
+        Returns:
+            Timeout in seconds
+        """
+        base_timeout = 10  # Base timeout for single node
+        per_node_timeout = 0.2  # Additional seconds per node
+        max_timeout = 60  # Cap at 1 minute
+
+        return int(
+            min(base_timeout + (per_node_timeout * (num_nodes - 1)),
+                max_timeout))
+
     def make_deploy_resources_variables(
             self,
             resources: 'resources_lib.Resources',
             cluster_name: resources_utils.ClusterName,
             region: Optional['clouds.Region'],
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, zones, dryrun  # Unused.
         if region is None:
@@ -413,12 +437,24 @@
         # Larger timeout may be required for autoscaling clusters, since
         # autoscaler may take some time to provision new nodes.
         # Note that this timeout includes time taken by the Kubernetes scheduler
-        # itself, which can be upto 2-3 seconds.
-        # For non-autoscaling clusters, we conservatively set this to 10s.
+        # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
+        # scheduling 100s of pods.
+        # We use a linear scaling formula to determine the timeout based on the
+        # number of nodes.
+
+        timeout = self._calculate_provision_timeout(num_nodes)
         timeout = skypilot_config.get_nested(
             ('kubernetes', 'provision_timeout'),
-            10,
+            timeout,
             override_configs=resources.cluster_config_overrides)
+        # We specify object-store-memory to be 500MB to avoid taking up too
+        # much memory on the head node. 'num-cpus' should be set to limit
+        # the CPU usage on the head pod, otherwise the ray cluster will use the
+        # CPU resources on the node instead within the pod.
+        custom_ray_options = {
+            'object-store-memory': 500000000,
+            'num-cpus': str(int(cpus)),
+        }
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
@@ -445,6 +481,12 @@ class Kubernetes(clouds.Cloud):
             'k8s_topology_label_value': k8s_topology_label_value,
             'k8s_resource_key': k8s_resource_key,
             'image_id': image_id,
+            'ray_installation_commands': constants.RAY_INSTALLATION_COMMANDS,
+            'ray_head_start_command': instance_setup.ray_head_start_command(
+                custom_resources, custom_ray_options),
+            'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
+            'ray_worker_start_command': instance_setup.ray_worker_start_command(
+                custom_resources, custom_ray_options, no_restart=False),
         }
 
         # Add kubecontext if it is set. It may be None if SkyPilot is running
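
Note: with the constants shown in _calculate_provision_timeout (base 10 s, 0.2 s per extra node, 60 s cap, truncated to an int), the default timeout is 10 s for a single pod, 11 s for 10 pods, 29 s for 100 pods, and hits the cap at just over 250 pods; users can still override the value through the kubernetes.provision_timeout config key, as the diff shows. A standalone check of the arithmetic, mirroring the diffed formula:

    def calculate_provision_timeout(num_nodes: int) -> int:
        # Same constants as the diffed Kubernetes._calculate_provision_timeout.
        base_timeout = 10       # seconds for a single node
        per_node_timeout = 0.2  # additional seconds per extra node
        max_timeout = 60        # hard cap
        return int(min(base_timeout + per_node_timeout * (num_nodes - 1),
                       max_timeout))

    for n in (1, 10, 100, 300):
        print(n, calculate_provision_timeout(n))  # 10, 11, 29, 60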
sky/clouds/lambda_cloud.py CHANGED
@@ -157,6 +157,7 @@ class Lambda(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert zones is None, 'Lambda does not support zones.'
sky/clouds/oci.py CHANGED
@@ -208,6 +208,7 @@ class OCI(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: Optional['clouds.Region'],
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert region is not None, resources
sky/clouds/paperspace.py CHANGED
@@ -175,6 +175,7 @@ class Paperspace(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del zones, dryrun, cluster_name
 
sky/clouds/runpod.py CHANGED
@@ -160,6 +160,7 @@ class RunPod(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del zones, dryrun, cluster_name  # unused
 
sky/clouds/scp.py CHANGED
@@ -181,6 +181,7 @@ class SCP(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert zones is None, 'SCP does not support zones.'
sky/clouds/vsphere.py CHANGED
@@ -173,6 +173,7 @@ class Vsphere(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         # TODO get image id here.
sky/provision/instance_setup.py CHANGED
@@ -4,7 +4,6 @@ import functools
 import hashlib
 import json
 import os
-import resource
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
@@ -20,6 +19,7 @@ from sky.utils import accelerator_registry
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 logger = sky_logging.init_logger(__name__)
@@ -115,7 +115,8 @@ def _parallel_ssh_with_cache(func,
     if max_workers is None:
         # Not using the default value of `max_workers` in ThreadPoolExecutor,
         # as 32 is too large for some machines.
-        max_workers = subprocess_utils.get_parallel_threads()
+        max_workers = subprocess_utils.get_parallel_threads(
+            cluster_info.provider_name)
     with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
         results = []
         runners = provision.get_command_runners(cluster_info.provider_name,
@@ -170,6 +171,7 @@ def initialize_docker(cluster_name: str, docker_config: Dict[str, Any],
 
 
 @common.log_function_start_end
+@timeline.event
 def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str],
                              cluster_info: common.ClusterInfo,
                              ssh_credentials: Dict[str, Any]) -> None:
@@ -245,20 +247,9 @@ def _ray_gpu_options(custom_resource: str) -> str:
     return f' --num-gpus={acc_count}'
 
 
-@common.log_function_start_end
-@_auto_retry()
-def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
-                           cluster_info: common.ClusterInfo,
-                           ssh_credentials: Dict[str, Any]) -> None:
-    """Start Ray on the head node."""
-    runners = provision.get_command_runners(cluster_info.provider_name,
-                                            cluster_info, **ssh_credentials)
-    head_runner = runners[0]
-    assert cluster_info.head_instance_id is not None, (cluster_name,
-                                                       cluster_info)
-
-    # Log the head node's output to the provision.log
-    log_path_abs = str(provision_logging.get_log_path())
+def ray_head_start_command(custom_resource: Optional[str],
+                           custom_ray_options: Optional[Dict[str, Any]]) -> str:
+    """Returns the command to start Ray on the head node."""
     ray_options = (
         # --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
         f'--disable-usage-stats '
@@ -270,23 +261,14 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
     if custom_resource:
         ray_options += f' --resources=\'{custom_resource}\''
         ray_options += _ray_gpu_options(custom_resource)
-
-    if cluster_info.custom_ray_options:
-        if 'use_external_ip' in cluster_info.custom_ray_options:
-            cluster_info.custom_ray_options.pop('use_external_ip')
-        for key, value in cluster_info.custom_ray_options.items():
+    if custom_ray_options:
+        if 'use_external_ip' in custom_ray_options:
+            custom_ray_options.pop('use_external_ip')
+        for key, value in custom_ray_options.items():
            ray_options += f' --{key}={value}'
 
-    # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY to avoid using credentials
-    # from environment variables set by user. SkyPilot's ray cluster should use
-    # the `~/.aws/` credentials, as that is the one used to create the cluster,
-    # and the autoscaler module started by the `ray start` command should use
-    # the same credentials. Otherwise, `ray status` will fail to fetch the
-    # available nodes.
-    # Reference: https://github.com/skypilot-org/skypilot/issues/2441
     cmd = (
         f'{constants.SKY_RAY_CMD} stop; '
-        'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
         'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
         # worker_maximum_startup_concurrency controls the maximum number of
         # workers that can be started concurrently. However, it also controls
@@ -305,6 +287,62 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
         'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
         f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
         _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
+    return cmd
+
+
+def ray_worker_start_command(custom_resource: Optional[str],
+                             custom_ray_options: Optional[Dict[str, Any]],
+                             no_restart: bool) -> str:
+    """Returns the command to start Ray on the worker node."""
+    # We need to use the ray port in the env variable, because the head node
+    # determines the port to be used for the worker node.
+    ray_options = ('--address=${SKYPILOT_RAY_HEAD_IP}:${SKYPILOT_RAY_PORT} '
+                   '--object-manager-port=8076')
+
+    if custom_resource:
+        ray_options += f' --resources=\'{custom_resource}\''
+        ray_options += _ray_gpu_options(custom_resource)
+
+    if custom_ray_options:
+        for key, value in custom_ray_options.items():
+            ray_options += f' --{key}={value}'
+
+    cmd = (
+        'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
+        f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || '
+        'exit 1;' + _RAY_PRLIMIT)
+    if no_restart:
+        # We do not use ray status to check whether ray is running, because
+        # on worker node, if the user started their own ray cluster, ray status
+        # will return 0, i.e., we don't know skypilot's ray cluster is running.
+        # Instead, we check whether the raylet process is running on gcs address
+        # that is connected to the head with the correct port.
+        cmd = (
+            f'ps aux | grep "ray/raylet/raylet" | '
+            'grep "gcs-address=${SKYPILOT_RAY_HEAD_IP}:${SKYPILOT_RAY_PORT}" '
+            f'|| {{ {cmd} }}')
+    else:
+        cmd = f'{constants.SKY_RAY_CMD} stop; ' + cmd
+    return cmd
+
+
+@common.log_function_start_end
+@_auto_retry()
+@timeline.event
+def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
+                           cluster_info: common.ClusterInfo,
+                           ssh_credentials: Dict[str, Any]) -> None:
+    """Start Ray on the head node."""
+    runners = provision.get_command_runners(cluster_info.provider_name,
+                                            cluster_info, **ssh_credentials)
+    head_runner = runners[0]
+    assert cluster_info.head_instance_id is not None, (cluster_name,
+                                                       cluster_info)
+
+    # Log the head node's output to the provision.log
+    log_path_abs = str(provision_logging.get_log_path())
+    cmd = ray_head_start_command(custom_resource,
+                                 cluster_info.custom_ray_options)
     logger.info(f'Running command on head node: {cmd}')
     # TODO(zhwu): add the output to log files.
     returncode, stdout, stderr = head_runner.run(
@@ -324,6 +362,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
 
 @common.log_function_start_end
 @_auto_retry()
+@timeline.event
 def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
                               custom_resource: Optional[str], ray_port: int,
                               cluster_info: common.ClusterInfo,
@@ -358,43 +397,17 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
     head_ip = (head_instance.internal_ip
                if not use_external_ip else head_instance.external_ip)
 
-    ray_options = (f'--address={head_ip}:{constants.SKY_REMOTE_RAY_PORT} '
-                   f'--object-manager-port=8076')
-
-    if custom_resource:
-        ray_options += f' --resources=\'{custom_resource}\''
-        ray_options += _ray_gpu_options(custom_resource)
-
-    if cluster_info.custom_ray_options:
-        for key, value in cluster_info.custom_ray_options.items():
-            ray_options += f' --{key}={value}'
+    ray_cmd = ray_worker_start_command(custom_resource,
+                                       cluster_info.custom_ray_options,
+                                       no_restart)
 
-    # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY, see the comment in
-    # `start_ray_on_head_node`.
-    cmd = (
-        f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
-        'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
-        f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || '
-        'exit 1;' + _RAY_PRLIMIT)
-    if no_restart:
-        # We do not use ray status to check whether ray is running, because
-        # on worker node, if the user started their own ray cluster, ray status
-        # will return 0, i.e., we don't know skypilot's ray cluster is running.
-        # Instead, we check whether the raylet process is running on gcs address
-        # that is connected to the head with the correct port.
-        cmd = (f'RAY_PORT={ray_port}; ps aux | grep "ray/raylet/raylet" | '
-               f'grep "gcs-address={head_ip}:${{RAY_PORT}}" || '
-               f'{{ {cmd} }}')
-    else:
-        cmd = f'{constants.SKY_RAY_CMD} stop; ' + cmd
+    cmd = (f'export SKYPILOT_RAY_HEAD_IP="{head_ip}"; '
+           f'export SKYPILOT_RAY_PORT={ray_port}; ' + ray_cmd)
 
     logger.info(f'Running command on worker nodes: {cmd}')
 
     def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner,
                                                str]):
-        # for cmd in config_from_yaml['worker_start_ray_commands']:
-        #     cmd = cmd.replace('$RAY_HEAD_IP', ip_list[0][0])
-        #     runner.run(cmd)
         runner, instance_id = runner_and_id
         log_dir = metadata_utils.get_instance_log_dir(cluster_name, instance_id)
         log_path_abs = str(log_dir / ('ray_cluster' + '.log'))
@@ -407,8 +420,10 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
         # by ray will have the correct PATH.
         source_bashrc=True)
 
+    num_threads = subprocess_utils.get_parallel_threads(
+        cluster_info.provider_name)
     results = subprocess_utils.run_in_parallel(
-        _setup_ray_worker, list(zip(worker_runners, cache_ids)))
+        _setup_ray_worker, list(zip(worker_runners, cache_ids)), num_threads)
     for returncode, stdout, stderr in results:
         if returncode:
             with ux_utils.print_exception_no_traceback():
@@ -421,6 +436,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 
 @common.log_function_start_end
 @_auto_retry()
+@timeline.event
 def start_skylet_on_head_node(cluster_name: str,
                               cluster_info: common.ClusterInfo,
                               ssh_credentials: Dict[str, Any]) -> None:
@@ -482,28 +498,8 @@ def _internal_file_mounts(file_mounts: Dict,
     )
 
 
-def _max_workers_for_file_mounts(common_file_mounts: Dict[str, str]) -> int:
-    fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
-
-    fd_per_rsync = 5
-    for src in common_file_mounts.values():
-        if os.path.isdir(src):
-            # Assume that each file/folder under src takes 5 file descriptors
-            # on average.
-            fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)
-
-    # Reserve some file descriptors for the system and other processes
-    fd_reserve = 100
-
-    max_workers = (fd_limit - fd_reserve) // fd_per_rsync
-    # At least 1 worker, and avoid too many workers overloading the system.
-    max_workers = min(max(max_workers, 1),
-                      subprocess_utils.get_parallel_threads())
-    logger.debug(f'Using {max_workers} workers for file mounts.')
-    return max_workers
-
-
 @common.log_function_start_end
+@timeline.event
 def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
                          cluster_info: common.ClusterInfo,
                          ssh_credentials: Dict[str, str]) -> None:
@@ -524,4 +520,5 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
         digest=None,
         cluster_info=cluster_info,
         ssh_credentials=ssh_credentials,
-        max_workers=_max_workers_for_file_mounts(common_file_mounts))
+        max_workers=subprocess_utils.get_max_workers_for_file_mounts(
+            common_file_mounts, cluster_info.provider_name))
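
Note: the _max_workers_for_file_mounts helper deleted above survives, judging by the new call site, as subprocess_utils.get_max_workers_for_file_mounts, now also taking the provider name so the final cap can defer to the per-cloud get_parallel_threads. A sketch of the relocated helper, reconstructed from the deleted body shown in this diff (the extra cloud_str plumbing is the assumed part, not the verbatim released code):

    import os
    import resource
    from typing import Dict

    def get_max_workers_for_file_mounts(common_file_mounts: Dict[str, str],
                                        cloud_str: str) -> int:
        """Bounds rsync parallelism by the file-descriptor limit (sketch)."""
        fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)

        fd_per_rsync = 5
        for src in common_file_mounts.values():
            if os.path.isdir(src):
                # Assume each entry under src costs ~5 file descriptors.
                fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)

        fd_reserve = 100  # headroom for the system and other processes
        max_workers = (fd_limit - fd_reserve) // fd_per_rsync
        # At least 1 worker, capped by the per-cloud thread count
        # (get_parallel_threads is the companion helper in the same module).
        return min(max(max_workers, 1), get_parallel_threads(cloud_str))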