skypilot-nightly 1.0.0.dev20241120__py3-none-any.whl → 1.0.0.dev20241122__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +20 -15
- sky/backends/cloud_vm_ray_backend.py +21 -3
- sky/clouds/aws.py +1 -0
- sky/clouds/azure.py +1 -0
- sky/clouds/cloud.py +1 -0
- sky/clouds/cudo.py +1 -0
- sky/clouds/fluidstack.py +1 -0
- sky/clouds/gcp.py +1 -0
- sky/clouds/ibm.py +1 -0
- sky/clouds/kubernetes.py +45 -3
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/oci.py +1 -0
- sky/clouds/paperspace.py +1 -0
- sky/clouds/runpod.py +1 -0
- sky/clouds/scp.py +1 -0
- sky/clouds/vsphere.py +1 -0
- sky/provision/instance_setup.py +80 -83
- sky/provision/kubernetes/instance.py +108 -76
- sky/provision/kubernetes/utils.py +2 -0
- sky/provision/oci/instance.py +4 -2
- sky/provision/provisioner.py +95 -19
- sky/resources.py +2 -1
- sky/skylet/constants.py +31 -21
- sky/templates/kubernetes-ray.yml.j2 +169 -39
- sky/utils/subprocess_utils.py +49 -4
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/METADATA +65 -55
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/RECORD +32 -32
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '204d979fedece9b7b789dcd2610d1ebdbc8d1fc5'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20241122'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py
CHANGED
@@ -683,7 +683,7 @@ def write_cluster_config(
         resources_utils.ClusterName(
             cluster_name,
             cluster_name_on_cloud,
-        ), region, zones, dryrun)
+        ), region, zones, num_nodes, dryrun)
     config_dict = {}
 
     specific_reservations = set(
@@ -844,7 +844,11 @@ def write_cluster_config(
                     '{sky_wheel_hash}',
                     wheel_hash).replace('{cloud}',
                                         str(cloud).lower())),
-
+            'skypilot_wheel_installation_commands':
+                constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
+                    '{sky_wheel_hash}',
+                    wheel_hash).replace('{cloud}',
+                                        str(cloud).lower()),
             # Port of Ray (GCS server).
             # Ray's default port 6379 is conflicted with Redis.
             'ray_port': constants.SKY_REMOTE_RAY_PORT,
@@ -1191,18 +1195,18 @@ def ssh_credential_from_yaml(
 
 
 def parallel_data_transfer_to_nodes(
-        runners: List[command_runner.CommandRunner],
-        source: Optional[str],
-        target: str,
-        cmd: Optional[str],
-        run_rsync: bool,
-        *,
-        action_message: str,
-        # Advanced options.
-        log_path: str = os.devnull,
-        stream_logs: bool = False,
-        source_bashrc: bool = False,
-):
+        runners: List[command_runner.CommandRunner],
+        source: Optional[str],
+        target: str,
+        cmd: Optional[str],
+        run_rsync: bool,
+        *,
+        action_message: str,
+        # Advanced options.
+        log_path: str = os.devnull,
+        stream_logs: bool = False,
+        source_bashrc: bool = False,
+        num_threads: Optional[int] = None):
     """Runs a command on all nodes and optionally runs rsync from src->dst.
 
     Args:
@@ -1214,6 +1218,7 @@ def parallel_data_transfer_to_nodes(
         log_path: str; Path to the log file
         stream_logs: bool; Whether to stream logs to stdout
         source_bashrc: bool; Source bashrc before running the command.
+        num_threads: Optional[int]; Number of threads to use.
     """
     style = colorama.Style
 
@@ -1254,7 +1259,7 @@ def parallel_data_transfer_to_nodes(
     message = (f' {style.DIM}{action_message} (to {num_nodes} node{plural})'
               f': {origin_source} -> {target}{style.RESET_ALL}')
     logger.info(message)
-    subprocess_utils.run_in_parallel(_sync_node, runners)
+    subprocess_utils.run_in_parallel(_sync_node, runners, num_threads)
 
 
 def check_local_gpus() -> bool:
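The call sites above (and several below in cloud_vm_ray_backend.py and sky/provision/instance_setup.py) now pass an explicit thread count into subprocess_utils.run_in_parallel, usually derived from subprocess_utils.get_parallel_threads(cloud). The sky/utils/subprocess_utils.py changes themselves (+49 -4) are not part of this section, so the following is only a sketch of how an optional cap can be layered onto a thread-pool helper; the defaults and the per-cloud multiplier are assumptions, not the actual SkyPilot implementation.

```python
# Minimal sketch only; the real helpers live in sky/utils/subprocess_utils.py
# (changed in this release but not shown in this diff). Defaults and the
# Kubernetes-specific multiplier below are illustrative assumptions.
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, List, Optional, TypeVar

T = TypeVar('T')
R = TypeVar('R')


def get_parallel_threads(cloud: Optional[str] = None) -> int:
    """Pick a thread count; the cloud name lets a mostly I/O-bound control
    plane (e.g. Kubernetes with many pods) use a larger cap."""
    base = max(4, multiprocessing.cpu_count() - 1)
    if cloud is not None and cloud.lower() == 'kubernetes':
        return base * 4  # assumed multiplier, for illustration only
    return base


def run_in_parallel(func: Callable[[T], R],
                    args: Iterable[T],
                    num_threads: Optional[int] = None) -> List[R]:
    """Run func over args in a thread pool, optionally capping the pool size."""
    args = list(args)
    workers = num_threads if num_threads is not None else get_parallel_threads()
    workers = max(1, min(workers, len(args) or 1))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(func, args))
```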
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -269,6 +269,13 @@ class RayCodeGen:
        import time
        from typing import Dict, List, Optional, Tuple, Union
 
+       # Set the environment variables to avoid deduplicating logs and
+       # scheduler events. This should be set in driver code, since we are
+       # not using `ray job submit` anymore, and the environment variables
+       # from the ray cluster is not inherited.
+       os.environ['RAY_DEDUP_LOGS'] = '0'
+       os.environ['RAY_SCHEDULER_EVENTS'] = '0'
+
        import ray
        import ray.util as ray_util
 
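Because the generated driver is now executed directly rather than through `ray job submit`, it no longer inherits these settings from the Ray cluster, so they have to be exported in the driver process before Ray is imported. A minimal standalone illustration (not SkyPilot code):

```python
import os

# Must be set before Ray is imported/initialized in the driver process; they
# are not inherited from the cluster when the driver is launched directly.
os.environ['RAY_DEDUP_LOGS'] = '0'        # do not collapse repeated log lines
os.environ['RAY_SCHEDULER_EVENTS'] = '0'  # hide scheduler event messages

import ray  # noqa: E402  (intentionally imported after setting the env vars)

ray.init(address='auto')  # connect to an existing cluster, as the driver does
```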
@@ -1528,7 +1535,7 @@ class RetryingVmProvisioner(object):
                 to_provision,
                 resources_utils.ClusterName(
                     cluster_name, handle.cluster_name_on_cloud),
-                region, zones))
+                region, zones, num_nodes))
         config_dict['provision_record'] = provision_record
         config_dict['resources_vars'] = resources_vars
         config_dict['handle'] = handle
@@ -3086,9 +3093,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
         os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
         os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
         with rich_utils.safe_status(
                 ux_utils.spinner_message('Syncing workdir', log_path)):
-            subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
+            subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
+                                             num_threads)
         logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
 
     def _sync_file_mounts(
@@ -4416,6 +4426,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         start = time.time()
         runners = handle.get_command_runners()
         log_path = os.path.join(self.log_dir, 'file_mounts.log')
+        num_threads = subprocess_utils.get_max_workers_for_file_mounts(
+            file_mounts, str(handle.launched_resources.cloud))
 
         # Check the files and warn
         for dst, src in file_mounts.items():
@@ -4477,6 +4489,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     action_message='Syncing',
                     log_path=log_path,
                     stream_logs=False,
+                    num_threads=num_threads,
                 )
                 continue
 
@@ -4513,6 +4526,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Need to source bashrc, as the cloud specific CLI or SDK may
                 # require PATH in bashrc.
                 source_bashrc=True,
+                num_threads=num_threads,
             )
             # (2) Run the commands to create symlinks on all the nodes.
             symlink_command = ' && '.join(symlink_commands)
@@ -4531,7 +4545,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         'Failed to create symlinks. The target destination '
                         f'may already exist. Log: {log_path}')
 
-            subprocess_utils.run_in_parallel(_symlink_node, runners)
+            subprocess_utils.run_in_parallel(_symlink_node, runners,
+                                             num_threads)
         end = time.time()
         logger.debug(f'File mount sync took {end - start} seconds.')
         logger.info(ux_utils.finishing_message('Files synced.', log_path))
@@ -4560,6 +4575,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return
         start = time.time()
         runners = handle.get_command_runners()
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
         log_path = os.path.join(self.log_dir, 'storage_mounts.log')
 
         plural = 's' if len(storage_mounts) > 1 else ''
@@ -4598,6 +4615,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # Need to source bashrc, as the cloud specific CLI or SDK
                     # may require PATH in bashrc.
                     source_bashrc=True,
+                    num_threads=num_threads,
                 )
             except exceptions.CommandError as e:
                 if e.returncode == exceptions.MOUNT_PATH_NON_EMPTY_CODE:
sky/clouds/aws.py
CHANGED
@@ -401,6 +401,7 @@ class AWS(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Any]:
         del dryrun  # unused
         assert zones is not None, (region, zones)
sky/clouds/azure.py
CHANGED
@@ -302,6 +302,7 @@ class Azure(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Any]:
         assert zones is None, ('Azure does not support zones', zones)
 
sky/clouds/cloud.py
CHANGED
@@ -283,6 +283,7 @@ class Cloud:
         cluster_name: resources_utils.ClusterName,
         region: 'Region',
         zones: Optional[List['Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to cloud-specific resource variables.
sky/clouds/cudo.py
CHANGED
@@ -196,6 +196,7 @@ class Cudo(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         del zones, cluster_name  # unused
sky/clouds/fluidstack.py
CHANGED
sky/clouds/gcp.py
CHANGED
@@ -417,6 +417,7 @@ class GCP(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         assert zones is not None, (region, zones)
 
sky/clouds/ibm.py
CHANGED
@@ -170,6 +170,7 @@ class IBM(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to cloud-specific resource variables.
sky/clouds/kubernetes.py
CHANGED
@@ -10,8 +10,10 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
 from sky.clouds import service_catalog
+from sky.provision import instance_setup
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils
 from sky.utils import schemas
@@ -311,12 +313,34 @@ class Kubernetes(clouds.Cloud):
         # we don't have a notion of disk size in Kubernetes.
         return 0
 
+    @staticmethod
+    def _calculate_provision_timeout(num_nodes: int) -> int:
+        """Calculate provision timeout based on number of nodes.
+
+        The timeout scales linearly with the number of nodes to account for
+        scheduling overhead, but is capped to avoid excessive waiting.
+
+        Args:
+            num_nodes: Number of nodes being provisioned
+
+        Returns:
+            Timeout in seconds
+        """
+        base_timeout = 10  # Base timeout for single node
+        per_node_timeout = 0.2  # Additional seconds per node
+        max_timeout = 60  # Cap at 1 minute
+
+        return int(
+            min(base_timeout + (per_node_timeout * (num_nodes - 1)),
+                max_timeout))
+
     def make_deploy_resources_variables(
             self,
             resources: 'resources_lib.Resources',
             cluster_name: resources_utils.ClusterName,
             region: Optional['clouds.Region'],
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, zones, dryrun  # Unused.
         if region is None:
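With the constants above, the default timeout works out to 10 s for a single pod, just under 30 s for 100 pods, and reaches the 60 s cap at 251 pods. A standalone check of the formula (copied from the hunk above):

```python
# Standalone check of the provision-timeout formula introduced above.
def calculate_provision_timeout(num_nodes: int) -> int:
    base_timeout = 10       # seconds for a single node
    per_node_timeout = 0.2  # additional seconds per extra node
    max_timeout = 60        # capped at 1 minute
    return int(min(base_timeout + per_node_timeout * (num_nodes - 1),
                   max_timeout))

for n in (1, 10, 100, 251, 500):
    print(n, calculate_provision_timeout(n))
# 1 -> 10, 10 -> 11, 100 -> 29, 251 -> 60, 500 -> 60
```

This computed value is only the default; as the next hunk shows, it is passed to skypilot_config.get_nested, so a user-set kubernetes.provision_timeout still overrides it.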
@@ -413,12 +437,24 @@ class Kubernetes(clouds.Cloud):
         # Larger timeout may be required for autoscaling clusters, since
         # autoscaler may take some time to provision new nodes.
         # Note that this timeout includes time taken by the Kubernetes scheduler
-        # itself, which can be upto 2-3 seconds
-        #
+        # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
+        # scheduling 100s of pods.
+        # We use a linear scaling formula to determine the timeout based on the
+        # number of nodes.
+
+        timeout = self._calculate_provision_timeout(num_nodes)
         timeout = skypilot_config.get_nested(
             ('kubernetes', 'provision_timeout'),
-
+            timeout,
             override_configs=resources.cluster_config_overrides)
+        # We specify object-store-memory to be 500MB to avoid taking up too
+        # much memory on the head node. 'num-cpus' should be set to limit
+        # the CPU usage on the head pod, otherwise the ray cluster will use the
+        # CPU resources on the node instead within the pod.
+        custom_ray_options = {
+            'object-store-memory': 500000000,
+            'num-cpus': str(int(cpus)),
+        }
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
@@ -445,6 +481,12 @@ class Kubernetes(clouds.Cloud):
             'k8s_topology_label_value': k8s_topology_label_value,
             'k8s_resource_key': k8s_resource_key,
             'image_id': image_id,
+            'ray_installation_commands': constants.RAY_INSTALLATION_COMMANDS,
+            'ray_head_start_command': instance_setup.ray_head_start_command(
+                custom_resources, custom_ray_options),
+            'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
+            'ray_worker_start_command': instance_setup.ray_worker_start_command(
+                custom_resources, custom_ray_options, no_restart=False),
         }
 
         # Add kubecontext if it is set. It may be None if SkyPilot is running
sky/clouds/lambda_cloud.py
CHANGED
@@ -157,6 +157,7 @@ class Lambda(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert zones is None, 'Lambda does not support zones.'
sky/clouds/oci.py
CHANGED
@@ -208,6 +208,7 @@ class OCI(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: Optional['clouds.Region'],
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert region is not None, resources
sky/clouds/paperspace.py
CHANGED
@@ -175,6 +175,7 @@ class Paperspace(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del zones, dryrun, cluster_name
 
sky/clouds/runpod.py
CHANGED
@@ -160,6 +160,7 @@ class RunPod(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del zones, dryrun, cluster_name  # unused
 
sky/clouds/scp.py
CHANGED
@@ -181,6 +181,7 @@ class SCP(clouds.Cloud):
             cluster_name: resources_utils.ClusterName,
             region: 'clouds.Region',
             zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
             dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, dryrun  # Unused.
         assert zones is None, 'SCP does not support zones.'
|
sky/clouds/vsphere.py
CHANGED
@@ -173,6 +173,7 @@ class Vsphere(clouds.Cloud):
         cluster_name: resources_utils.ClusterName,
         region: 'clouds.Region',
         zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         # TODO get image id here.
sky/provision/instance_setup.py
CHANGED
@@ -4,7 +4,6 @@ import functools
 import hashlib
 import json
 import os
-import resource
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
@@ -20,6 +19,7 @@ from sky.utils import accelerator_registry
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 logger = sky_logging.init_logger(__name__)
@@ -115,7 +115,8 @@ def _parallel_ssh_with_cache(func,
     if max_workers is None:
         # Not using the default value of `max_workers` in ThreadPoolExecutor,
         # as 32 is too large for some machines.
-        max_workers = subprocess_utils.get_parallel_threads(
+        max_workers = subprocess_utils.get_parallel_threads(
+            cluster_info.provider_name)
     with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
         results = []
         runners = provision.get_command_runners(cluster_info.provider_name,
@@ -170,6 +171,7 @@ def initialize_docker(cluster_name: str, docker_config: Dict[str, Any],
 
 
 @common.log_function_start_end
+@timeline.event
 def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str],
                              cluster_info: common.ClusterInfo,
                              ssh_credentials: Dict[str, Any]) -> None:
@@ -245,20 +247,9 @@ def _ray_gpu_options(custom_resource: str) -> str:
     return f' --num-gpus={acc_count}'
 
 
-@common.log_function_start_end
-@_auto_retry()
-def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
-                           cluster_info: common.ClusterInfo,
-                           ssh_credentials: Dict[str, Any]) -> None:
-    """Start Ray on the head node."""
-    runners = provision.get_command_runners(cluster_info.provider_name,
-                                            cluster_info, **ssh_credentials)
-    head_runner = runners[0]
-    assert cluster_info.head_instance_id is not None, (cluster_name,
-                                                       cluster_info)
-
-    # Log the head node's output to the provision.log
-    log_path_abs = str(provision_logging.get_log_path())
+def ray_head_start_command(custom_resource: Optional[str],
+                           custom_ray_options: Optional[Dict[str, Any]]) -> str:
+    """Returns the command to start Ray on the head node."""
     ray_options = (
         # --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
         f'--disable-usage-stats '
@@ -270,23 +261,14 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
     if custom_resource:
         ray_options += f' --resources=\'{custom_resource}\''
         ray_options += _ray_gpu_options(custom_resource)
-
-    if cluster_info.custom_ray_options:
-        if 'use_external_ip' in cluster_info.custom_ray_options:
-            cluster_info.custom_ray_options.pop('use_external_ip')
-        for key, value in cluster_info.custom_ray_options.items():
+    if custom_ray_options:
+        if 'use_external_ip' in custom_ray_options:
+            custom_ray_options.pop('use_external_ip')
+        for key, value in custom_ray_options.items():
             ray_options += f' --{key}={value}'
 
-    # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY to avoid using credentials
-    # from environment variables set by user. SkyPilot's ray cluster should use
-    # the `~/.aws/` credentials, as that is the one used to create the cluster,
-    # and the autoscaler module started by the `ray start` command should use
-    # the same credentials. Otherwise, `ray status` will fail to fetch the
-    # available nodes.
-    # Reference: https://github.com/skypilot-org/skypilot/issues/2441
     cmd = (
         f'{constants.SKY_RAY_CMD} stop; '
-        'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
         'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
         # worker_maximum_startup_concurrency controls the maximum number of
         # workers that can be started concurrently. However, it also controls
@@ -305,6 +287,62 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
         'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
         f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
         _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
+    return cmd
+
+
+def ray_worker_start_command(custom_resource: Optional[str],
+                             custom_ray_options: Optional[Dict[str, Any]],
+                             no_restart: bool) -> str:
+    """Returns the command to start Ray on the worker node."""
+    # We need to use the ray port in the env variable, because the head node
+    # determines the port to be used for the worker node.
+    ray_options = ('--address=${SKYPILOT_RAY_HEAD_IP}:${SKYPILOT_RAY_PORT} '
+                   '--object-manager-port=8076')
+
+    if custom_resource:
+        ray_options += f' --resources=\'{custom_resource}\''
+        ray_options += _ray_gpu_options(custom_resource)
+
+    if custom_ray_options:
+        for key, value in custom_ray_options.items():
+            ray_options += f' --{key}={value}'
+
+    cmd = (
+        'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
+        f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || '
+        'exit 1;' + _RAY_PRLIMIT)
+    if no_restart:
+        # We do not use ray status to check whether ray is running, because
+        # on worker node, if the user started their own ray cluster, ray status
+        # will return 0, i.e., we don't know skypilot's ray cluster is running.
+        # Instead, we check whether the raylet process is running on gcs address
+        # that is connected to the head with the correct port.
+        cmd = (
+            f'ps aux | grep "ray/raylet/raylet" | '
+            'grep "gcs-address=${SKYPILOT_RAY_HEAD_IP}:${SKYPILOT_RAY_PORT}" '
+            f'|| {{ {cmd} }}')
+    else:
+        cmd = f'{constants.SKY_RAY_CMD} stop; ' + cmd
+    return cmd
+
+
+@common.log_function_start_end
+@_auto_retry()
+@timeline.event
+def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
+                           cluster_info: common.ClusterInfo,
+                           ssh_credentials: Dict[str, Any]) -> None:
+    """Start Ray on the head node."""
+    runners = provision.get_command_runners(cluster_info.provider_name,
+                                            cluster_info, **ssh_credentials)
+    head_runner = runners[0]
+    assert cluster_info.head_instance_id is not None, (cluster_name,
+                                                       cluster_info)
+
+    # Log the head node's output to the provision.log
+    log_path_abs = str(provision_logging.get_log_path())
+    cmd = ray_head_start_command(custom_resource,
+                                 cluster_info.custom_ray_options)
     logger.info(f'Running command on head node: {cmd}')
     # TODO(zhwu): add the output to log files.
     returncode, stdout, stderr = head_runner.run(
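The refactor turns the head/worker `ray start` command construction into plain functions that return strings, which is what lets sky/clouds/kubernetes.py (above) embed the same commands directly in the pod template variables. A usage sketch; the resource dict and option values below are illustrative, not taken from this diff, while the function names and signatures are the ones added here:

```python
# Illustrative only: argument values are made up; the functions are the ones
# added in sky/provision/instance_setup.py in this diff.
from sky.provision import instance_setup

custom_resources = '{"A100": 2}'  # hypothetical accelerator resource string
custom_ray_options = {
    'object-store-memory': 500000000,
    'num-cpus': '8',
}

head_cmd = instance_setup.ray_head_start_command(custom_resources,
                                                 custom_ray_options)
# The worker command addresses the head via $SKYPILOT_RAY_HEAD_IP and
# $SKYPILOT_RAY_PORT, which the caller is expected to export beforehand.
worker_cmd = instance_setup.ray_worker_start_command(custom_resources,
                                                     custom_ray_options,
                                                     no_restart=False)
print(head_cmd)
print(worker_cmd)
```

The same two commands are also rendered into the reworked sky/templates/kubernetes-ray.yml.j2 via the new `ray_head_start_command` and `ray_worker_start_command` entries in `deploy_vars`.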
@@ -324,6 +362,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 
 @common.log_function_start_end
 @_auto_retry()
+@timeline.event
 def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
                               custom_resource: Optional[str], ray_port: int,
                               cluster_info: common.ClusterInfo,
@@ -358,43 +397,17 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
     head_ip = (head_instance.internal_ip
                if not use_external_ip else head_instance.external_ip)
 
-
-
-
-    if custom_resource:
-        ray_options += f' --resources=\'{custom_resource}\''
-        ray_options += _ray_gpu_options(custom_resource)
-
-    if cluster_info.custom_ray_options:
-        for key, value in cluster_info.custom_ray_options.items():
-            ray_options += f' --{key}={value}'
+    ray_cmd = ray_worker_start_command(custom_resource,
+                                       cluster_info.custom_ray_options,
+                                       no_restart)
 
-
-
-    cmd = (
-        f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
-        'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
-        f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || '
-        'exit 1;' + _RAY_PRLIMIT)
-    if no_restart:
-        # We do not use ray status to check whether ray is running, because
-        # on worker node, if the user started their own ray cluster, ray status
-        # will return 0, i.e., we don't know skypilot's ray cluster is running.
-        # Instead, we check whether the raylet process is running on gcs address
-        # that is connected to the head with the correct port.
-        cmd = (f'RAY_PORT={ray_port}; ps aux | grep "ray/raylet/raylet" | '
-               f'grep "gcs-address={head_ip}:${{RAY_PORT}}" || '
-               f'{{ {cmd} }}')
-    else:
-        cmd = f'{constants.SKY_RAY_CMD} stop; ' + cmd
+    cmd = (f'export SKYPILOT_RAY_HEAD_IP="{head_ip}"; '
+           f'export SKYPILOT_RAY_PORT={ray_port}; ' + ray_cmd)
 
     logger.info(f'Running command on worker nodes: {cmd}')
 
     def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner,
                                                str]):
-        # for cmd in config_from_yaml['worker_start_ray_commands']:
-        #     cmd = cmd.replace('$RAY_HEAD_IP', ip_list[0][0])
-        #     runner.run(cmd)
         runner, instance_id = runner_and_id
         log_dir = metadata_utils.get_instance_log_dir(cluster_name, instance_id)
         log_path_abs = str(log_dir / ('ray_cluster' + '.log'))
@@ -407,8 +420,10 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
             # by ray will have the correct PATH.
             source_bashrc=True)
 
+    num_threads = subprocess_utils.get_parallel_threads(
+        cluster_info.provider_name)
     results = subprocess_utils.run_in_parallel(
-        _setup_ray_worker, list(zip(worker_runners, cache_ids)))
+        _setup_ray_worker, list(zip(worker_runners, cache_ids)), num_threads)
     for returncode, stdout, stderr in results:
         if returncode:
             with ux_utils.print_exception_no_traceback():
@@ -421,6 +436,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 
 @common.log_function_start_end
 @_auto_retry()
+@timeline.event
 def start_skylet_on_head_node(cluster_name: str,
                               cluster_info: common.ClusterInfo,
                               ssh_credentials: Dict[str, Any]) -> None:
@@ -482,28 +498,8 @@ def _internal_file_mounts(file_mounts: Dict,
     )
 
 
-def _max_workers_for_file_mounts(common_file_mounts: Dict[str, str]) -> int:
-    fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
-
-    fd_per_rsync = 5
-    for src in common_file_mounts.values():
-        if os.path.isdir(src):
-            # Assume that each file/folder under src takes 5 file descriptors
-            # on average.
-            fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)
-
-    # Reserve some file descriptors for the system and other processes
-    fd_reserve = 100
-
-    max_workers = (fd_limit - fd_reserve) // fd_per_rsync
-    # At least 1 worker, and avoid too many workers overloading the system.
-    max_workers = min(max(max_workers, 1),
-                      subprocess_utils.get_parallel_threads())
-    logger.debug(f'Using {max_workers} workers for file mounts.')
-    return max_workers
-
-
 @common.log_function_start_end
+@timeline.event
 def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
                          cluster_info: common.ClusterInfo,
                          ssh_credentials: Dict[str, str]) -> None:
@@ -524,4 +520,5 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
         digest=None,
         cluster_info=cluster_info,
         ssh_credentials=ssh_credentials,
-        max_workers=
+        max_workers=subprocess_utils.get_max_workers_for_file_mounts(
+            common_file_mounts, cluster_info.provider_name))
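The deleted _max_workers_for_file_mounts helper above now lives in sky/utils/subprocess_utils.py as get_max_workers_for_file_mounts, with the provider name threaded through so the per-cloud thread cap can be applied. That file's changes are not included in this section, so the following is only a sketch reconstructed from the deleted code; the cloud handling is an assumption.

```python
# Sketch reconstructed from the removed _max_workers_for_file_mounts above; the
# actual relocated implementation in sky/utils/subprocess_utils.py is not shown
# in this diff, so treat the cloud-aware parts as assumptions.
import os
import resource
from typing import Dict, Optional


def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
    # Placeholder standing in for the real helper in subprocess_utils.
    return max(4, (os.cpu_count() or 1) - 1)


def get_max_workers_for_file_mounts(common_file_mounts: Dict[str, str],
                                    cloud_str: Optional[str] = None) -> int:
    fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)

    # Assume each file/folder under a source directory costs ~5 file
    # descriptors per concurrent rsync.
    fd_per_rsync = 5
    for src in common_file_mounts.values():
        if os.path.isdir(src):
            fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)

    # Reserve some file descriptors for the system and other processes.
    fd_reserve = 100

    max_workers = (fd_limit - fd_reserve) // fd_per_rsync
    # At least 1 worker, and avoid overloading the system.
    return min(max(max_workers, 1), get_parallel_threads(cloud_str))
```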