skypilot-nightly 1.0.0.dev20250220__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +85 -0
- sky/backends/backend_utils.py +8 -0
- sky/backends/cloud_vm_ray_backend.py +10 -2
- sky/client/sdk.py +8 -3
- sky/clouds/__init__.py +2 -0
- sky/clouds/nebius.py +294 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/jobs/controller.py +17 -0
- sky/jobs/server/core.py +31 -3
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/instance.py +5 -1
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +310 -0
- sky/setup_files/dependencies.py +9 -1
- sky/skylet/constants.py +3 -6
- sky/task.py +6 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/utils/controller_utils.py +66 -2
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +30 -22
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/provision/nebius/utils.py
ADDED
@@ -0,0 +1,310 @@
+"""Nebius library wrapper for SkyPilot."""
+import time
+from typing import Any, Dict
+import uuid
+
+from sky import sky_logging
+from sky.adaptors import nebius
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+POLL_INTERVAL = 5
+
+
+def retry(func):
+    """Decorator to retry a function."""
+
+    def wrapper(*args, **kwargs):
+        """Wrapper for retrying a function."""
+        cnt = 0
+        while True:
+            try:
+                return func(*args, **kwargs)
+            except nebius.nebius.error.QueryError as e:
+                if cnt >= 3:
+                    raise
+                logger.warning('Retrying for exception: '
+                               f'{common_utils.format_exception(e)}.')
+                time.sleep(POLL_INTERVAL)
+
+    return wrapper
+
+
+def get_project_by_region(region: str) -> str:
+    service = nebius.iam().ProjectServiceClient(nebius.sdk())
+    projects = service.list(nebius.iam().ListProjectsRequest(
+        parent_id=nebius.get_tenant_id())).wait()
+    # To find a project in a specific region, we rely on the project ID to
+    # deduce the region, since there is currently no method to retrieve region
+    # information directly from the project. Additionally, there is only one
+    # project per region, and projects cannot be created at this time.
+    # The region is determined from the project ID using a region-specific
+    # identifier embedded in it.
+    # Project id looks like project-e00xxxxxxxxxxxxxx where
+    # e00 - id of region 'eu-north1'
+    # e01 - id of region 'eu-west1'
+    # TODO(SalikovAlex): fix when info about region will be in projects list
+    # Currently, Nebius cloud supports 2 regions. We manually enumerate
+    # them here. Reference: https://docs.nebius.com/overview/regions
+    for project in projects.items:
+        if region == 'eu-north1' and project.metadata.id[8:11] == 'e00':
+            return project.metadata.id
+        if region == 'eu-west1' and project.metadata.id[8:11] == 'e01':
+            return project.metadata.id
+    raise Exception(f'No project found for region "{region}".')
+
+
+def get_or_create_gpu_cluster(name: str, region: str) -> str:
+    """Creates a GPU cluster.
+    When creating a GPU cluster, select an InfiniBand fabric for it:
+
+    fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
+    fabric-5 for projects in the eu-west1 region.
+
+    https://docs.nebius.com/compute/clusters/gpu
+    """
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+    except nebius.request_error() as no_cluster_found_error:
+        if region == 'eu-north1':
+            fabric = 'fabric-4'
+        elif region == 'eu-west1':
+            fabric = 'fabric-5'
+        else:
+            raise RuntimeError(
+                f'Unsupported region {region}.') from no_cluster_found_error
+        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=name,
+            ),
+            spec=nebius.compute().GpuClusterSpec(
+                infiniband_fabric=fabric))).wait()
+        cluster_id = cluster.resource_id
+    return cluster_id
+
+
+def delete_cluster(name: str, region: str) -> None:
+    """Delete a GPU cluster."""
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+        logger.debug(f'Found GPU Cluster : {cluster_id}.')
+        service.delete(
+            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
+    except nebius.request_error():
+        logger.debug('GPU Cluster does not exist.')
+
+
+def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.list(
+        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+
+    instances = result
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances.items:
+        info = {}
+        info['status'] = instance.status.state.name
+        info['name'] = instance.metadata.name
+        if instance.status.network_interfaces:
+            info['external_ip'] = instance.status.network_interfaces[
+                0].public_ip_address.address.split('/')[0]
+            info['internal_ip'] = instance.status.network_interfaces[
+                0].ip_address.address.split('/')[0]
+        instance_dict[instance.metadata.id] = info
+
+    return instance_dict
+
+
+def stop(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'STOPPED':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} stopping.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_STOP * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be stopped.')
+
+
+def start(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'RUNNING':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} starting.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_START:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_START * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be ready.')
+
+
+def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
+           preset: str, region: str, image_family: str, disk_size: int,
+           user_data: str) -> str:
+    # Each node must have a unique name to avoid conflicts between
+    # multiple worker VMs. To ensure uniqueness, a UUID is appended
+    # to the node name.
+    instance_name = (f'{cluster_name_on_cloud}-'
+                     f'{uuid.uuid4().hex[:4]}-{node_type}')
+    logger.debug(f'Launching instance: {instance_name}')
+
+    disk_name = 'disk-' + instance_name
+    cluster_id = None
+    # 8 GPU virtual machines can be grouped into a GPU cluster.
+    # The GPU clusters are built with InfiniBand secure high-speed networking.
+    # https://docs.nebius.com/compute/clusters/gpu
+    if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
+        if preset == '8gpu-128vcpu-1600gb':
+            cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
+                                                   region)
+
+    project_id = get_project_by_region(region)
+    service = nebius.compute().DiskServiceClient(nebius.sdk())
+    disk = service.create(nebius.compute().CreateDiskRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=disk_name,
+        ),
+        spec=nebius.compute().DiskSpec(
+            source_image_family=nebius.compute().SourceImageFamily(
+                image_family=image_family),
+            size_gibibytes=disk_size,
+            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+        ))).wait()
+    disk_id = disk.resource_id
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
+        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=disk_name,
+        )).wait()
+        if disk.status.state.name == 'READY':
+            break
+        logger.debug(f'Waiting for disk {disk_name} to be ready.')
+        time.sleep(POLL_INTERVAL)
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_CREATE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_CREATE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_name}'
+            f' to be ready.')
+
+    service = nebius.vpc().SubnetServiceClient(nebius.sdk())
+    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
+        parent_id=project_id,)).wait()
+
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.create(nebius.compute().CreateInstanceRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=instance_name,
+        ),
+        spec=nebius.compute().InstanceSpec(
+            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
+            if cluster_id is not None else None,
+            boot_disk=nebius.compute().AttachedDiskSpec(
+                attach_mode=nebius.compute(
+                ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+            cloud_init_user_data=user_data,
+            resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                     preset=preset),
+            network_interfaces=[
+                nebius.compute().NetworkInterfaceSpec(
+                    subnet_id=sub_net.items[0].metadata.id,
+                    ip_address=nebius.compute().IPAddress(),
+                    name='network-interface-0',
+                    public_ip_address=nebius.compute().PublicIPAddress())
+            ]))).wait()
+    instance_id = ''
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=instance_name,
+        )).wait()
+        if instance.status.state.name == 'STARTING':
+            instance_id = instance.metadata.id
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_name} start running.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_name}'
+            f' to be ready.')
+    return instance_id
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.get(
+        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    disk_id = result.spec.boot_disk.existing_disk.id
+    service.delete(
+        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    # The instance begins deleting and attempts to delete the disk.
+    # Must wait until the disk is unlocked and becomes deletable.
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
+        try:
+            service = nebius.compute().DiskServiceClient(nebius.sdk())
+            service.delete(
+                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            break
+        except nebius.request_error():
+            logger.debug('Waiting for disk deletion.')
+            time.sleep(POLL_INTERVAL)
+            retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_DELETE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_DELETE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_id}'
+            f' to be deleted.')
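A minimal sketch (not part of this diff) of how a provisioner such as sky/provision/nebius/instance.py might drive the helpers above: launch a node, then poll its state via the project-scoped listing. The node_type label, image family, and disk size below are hypothetical placeholders; the platform, preset, and region values are taken from the code above.

```python
from sky.provision.nebius import utils as nebius_utils


def provision_single_node(cluster_name_on_cloud: str) -> None:
    region = 'eu-north1'  # One of the two regions enumerated above.
    instance_id = nebius_utils.launch(
        cluster_name_on_cloud=cluster_name_on_cloud,
        node_type='head',                   # Hypothetical node-type label.
        platform='gpu-h100-sxm',
        preset='8gpu-128vcpu-1600gb',       # Triggers GPU-cluster creation.
        region=region,
        image_family='ubuntu22.04-cuda12',  # Hypothetical image family.
        disk_size=256,                      # Hypothetical disk size in GiB.
        user_data='')
    # Poll the instance state through the project-scoped listing helper.
    project_id = nebius_utils.get_project_by_region(region)
    info = nebius_utils.list_instances(project_id).get(instance_id, {})
    print(instance_id, info.get('status'), info.get('external_ip'))
```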
sky/setup_files/dependencies.py
CHANGED
@@ -5,6 +5,7 @@ This file is imported by setup.py, so:
   correct.
 - It should not import any dependencies, as they may not be installed yet.
 """
+import sys
 from typing import Dict, List
 
 install_requires = [
@@ -146,6 +147,13 @@ extras_require: Dict[str, List[str]] = {
         # docs instead.
         # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
     ],
+    'nebius': ['nebius>=0.2.0',]
 }
 
-
+# Nebius needs python3.10. If python 3.9 [all] will not install nebius
+if sys.version_info < (3, 10):
+    filtered_keys = [k for k in extras_require if k != 'nebius']
+    extras_require['all'] = sum(
+        [v for k, v in extras_require.items() if k != 'nebius'], [])
+else:
+    extras_require['all'] = sum(extras_require.values(), [])
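An illustrative check of the version gate above (assumed usage, not part of the diff): on Python 3.10+ the new 'nebius' extra is folded into 'all', while on Python 3.9 it is kept out of 'all' but still exists as an explicit extra.

```python
from sky.setup_files import dependencies

# The extra is always declared.
print('nebius' in dependencies.extras_require)  # True
# Whether it is part of [all] depends on the interpreter version:
print(any('nebius' in dep for dep in dependencies.extras_require['all']))
# -> True on Python >= 3.10, False on Python 3.9
```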
sky/skylet/constants.py
CHANGED
@@ -281,12 +281,9 @@ FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
 # linking. E.g., in our API server deployment on k8s, ~/.sky/ is mounted from a
 # persistent volume, so any contents in ~/.sky/ cannot be hard linked elsewhere.
 FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
-
-#
-
-FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
-FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
-FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
+# Base path for two-hop file mounts translation. See
+# controller_utils.translate_local_file_mounts_to_two_hop().
+FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
 
 # Used when an managed jobs are created and
 # files are synced up to the cloud.
sky/task.py
CHANGED
@@ -1132,6 +1132,12 @@ class Task:
                     raise ValueError(f'Storage Type {store_type} '
                                      'does not exist!')
 
+        # TODO: Delete from storage_mounts, now that the storage is
+        # translated into file_mounts. Note: as is, this will break
+        # controller_utils.
+        # _maybe_translate_local_file_mounts_and_sync_up(), which still
+        # needs the storage, but not the file_mounts.
+
     def get_local_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
         """Returns file mounts of the form (dst=VM path, src=local path).
 
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -10,6 +10,9 @@ file_mounts:
 {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
   {{remote_catalog_path}}: {{local_catalog_path}}
 {%- endfor %}
+{%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
+  {{controller_file_mount_path}}: {{local_file_mount_path}}
+{%- endfor %}
 
 setup: |
   {{ sky_activate_python_env }}
sky/templates/nebius-ray.yml.j2
ADDED
@@ -0,0 +1,79 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.nebius
+  region: "{{region}}"
+
+auth:
+  ssh_user: ubuntu
+  ssh_private_key: {{ssh_private_key}}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      ImageId: {{image_id}}
+      DiskSize: {{disk_size}}
+      UserData: |
+        users:
+          - name: skypilot:ssh_user
+            shell: /bin/bash
+            sudo: ALL=(ALL) NOPASSWD:ALL
+            ssh_authorized_keys:
+              - |-
+                skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
+    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
+    sudo pkill -9 apt-get;
+    sudo pkill -9 dpkg;
+    sudo dpkg --configure -a;
+    mkdir -p ~/.ssh; touch ~/.ssh/config;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
sky/utils/controller_utils.py
CHANGED
@@ -662,6 +662,66 @@ def replace_skypilot_config_path_in_file_mounts(
                 f'with the real path in file mounts: {file_mounts}')
 
 
+def _generate_run_uuid() -> str:
+    """Generates a unique run id for the job."""
+    return common_utils.base36_encode(uuid.uuid4().hex)[:8]
+
+
+def translate_local_file_mounts_to_two_hop(
+        task: 'task_lib.Task') -> Dict[str, str]:
+    """Translates local->VM mounts into two-hop file mounts.
+
+    This strategy will upload the local files to the controller first, using a
+    normal rsync as part of sky.launch() for the controller. Then, when the
+    controller launches the task, it will also use local file_mounts from the
+    destination path of the first hop.
+
+    Local machine/API server   Controller                Job cluster
+    ------------------------   -----------------------   --------------------
+    |      local path  ----|---|->  controller path --|--|->  job dst path  |
+    ------------------------   -----------------------   --------------------
+
+    Returns:
+        A dict mapping from controller file mount path to local file mount path
+        for the first hop. The task is updated in-place to do the second hop.
+    """
+    first_hop_file_mounts = {}
+    second_hop_file_mounts = {}
+
+    run_id = _generate_run_uuid()
+    base_tmp_dir = os.path.join(constants.FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH,
+                                run_id)
+
+    # Use a simple counter to create unique paths within the base_tmp_dir for
+    # each mount.
+    file_mount_id = 0
+
+    file_mounts_to_translate = task.file_mounts or {}
+    if task.workdir is not None:
+        file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
+        task.workdir = None
+
+    for job_cluster_path, local_path in file_mounts_to_translate.items():
+        if data_utils.is_cloud_store_url(
+                local_path) or data_utils.is_cloud_store_url(job_cluster_path):
+            raise exceptions.NotSupportedError(
+                'Cloud-based file_mounts are specified, but no cloud storage '
+                'is available. Please specify local file_mounts only.')
+
+        controller_path = os.path.join(base_tmp_dir, f'{file_mount_id}')
+        file_mount_id += 1
+        first_hop_file_mounts[controller_path] = local_path
+        second_hop_file_mounts[job_cluster_path] = controller_path
+
+    # Use set_file_mounts to override existing file mounts, if they exist.
+    task.set_file_mounts(second_hop_file_mounts)
+
+    # Return the first hop info so that it can be added to the jobs-controller
+    # YAML.
+    return first_hop_file_mounts
+
+
+# (maybe translate local file mounts) and (sync up)
 def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
                                                   task_type: str) -> None:
     """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
@@ -695,7 +755,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
     # We should not use common_utils.get_usage_run_id() here, because when
     # Python API is used, the run id will be the same across multiple
     # jobs.launch/serve.up calls after the sky is imported.
-    run_id =
+    run_id = _generate_run_uuid()
     user_hash = common_utils.get_user_hash()
     original_file_mounts = task.file_mounts if task.file_mounts else {}
     original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
@@ -854,7 +914,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
     # Step 4: Upload storage from sources
     # Upload the local source to a bucket. The task will not be executed
     # locally, so we need to upload the files/folders to the bucket manually
-    # here before sending the task to the remote jobs controller.
+    # here before sending the task to the remote jobs controller. This will
+    # also upload any storage mounts that are not translated. After
+    # sync_storage_mounts, we will also have file_mounts in the task, but
+    # these aren't used since the storage_mounts for the same paths take
+    # precedence.
     if task.storage_mounts:
         # There may be existing (non-translated) storage mounts, so log this
         # whenever task.storage_mounts is non-empty.
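A hedged usage sketch (assumed, not taken from this diff) of the two-hop helper: the returned mapping is what jobs-controller.yaml.j2 renders via the new `local_to_controller_file_mounts` loop, while the task is rewritten in place so the controller can complete the second hop.

```python
import sky
from sky.utils import controller_utils

task = sky.Task(run='python train.py', workdir='./project')
task.set_file_mounts({'/remote/config.yaml': './config.yaml'})

# First hop: controller-side path -> local path (uploaded alongside the
# controller launch and fed into the jobs-controller YAML).
local_to_controller_file_mounts = (
    controller_utils.translate_local_file_mounts_to_two_hop(task))

# Second hop: task.file_mounts now maps job-cluster paths to paths under
# ~/.sky/tmp/controller/<run_id>/<i> on the controller; workdir is cleared.
print(local_to_controller_file_mounts)
print(task.file_mounts)
```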
{skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250220
+Version: 1.0.0.dev20250221
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -107,6 +107,8 @@ Provides-Extra: vast
 Requires-Dist: vastai-sdk>=0.1.12; extra == "vast"
 Provides-Extra: vsphere
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
+Provides-Extra: nebius
+Requires-Dist: nebius>=0.2.0; extra == "nebius"
 Provides-Extra: all
 Requires-Dist: urllib3<2; extra == "all"
 Requires-Dist: awscli>=1.27.10; extra == "all"
@@ -150,6 +152,7 @@ Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-common; extra == "all"
 Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
+Requires-Dist: nebius>=0.2.0; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -224,15 +227,16 @@ SkyPilot supports your existing GPU, TPU, and CPU workloads, with no code changes
 Install with pip:
 ```bash
 # Choose your clouds:
-pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]"
+pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,nebius]"
 ```
 To get the latest features and fixes, use the nightly build or [install from source](https://docs.skypilot.co/en/latest/getting-started/installation.html):
 ```bash
 # Choose your clouds:
-pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]"
+pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,nebius]"
 ```
 
-
+
+[Current supported infra](https://docs.skypilot.co/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Digital Ocean, Paperspace, Cloudflare, Samsung, IBM, Vast.ai, VMware vSphere, Nebius):
 <p align="center">
   <img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/cloud-logos-light.png" width=85%>
 </p>