skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -0
- sky/backends/backend_utils.py +10 -133
- sky/backends/cloud_vm_ray_backend.py +17 -105
- sky/clouds/azure.py +10 -1
- sky/execution.py +5 -4
- sky/jobs/controller.py +38 -22
- sky/jobs/recovery_strategy.py +30 -5
- sky/jobs/state.py +33 -5
- sky/jobs/utils.py +28 -4
- sky/optimizer.py +11 -7
- sky/provision/azure/azure-config-template.json +7 -1
- sky/provision/azure/config.py +65 -45
- sky/provision/azure/instance.py +275 -70
- sky/provision/constants.py +7 -0
- sky/provision/gcp/instance.py +0 -7
- sky/resources.py +25 -8
- sky/serve/core.py +0 -2
- sky/serve/serve_state.py +3 -7
- sky/serve/serve_utils.py +2 -14
- sky/serve/service_spec.py +0 -28
- sky/setup_files/setup.py +4 -3
- sky/skylet/job_lib.py +37 -53
- sky/skylet/log_lib.py +5 -14
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/utils/dag_utils.py +14 -4
- sky/utils/schemas.py +25 -15
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
# TODO(zhwu): maybe use file based status instead of database, so
|
3
3
|
# that we can easily switch to a s3-based storage.
|
4
4
|
import enum
|
5
|
+
import json
|
5
6
|
import pathlib
|
6
7
|
import sqlite3
|
7
8
|
import time
|
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
|
|
65
66
|
failure_reason TEXT,
|
66
67
|
spot_job_id INTEGER,
|
67
68
|
task_id INTEGER DEFAULT 0,
|
68
|
-
task_name TEXT
|
69
|
+
task_name TEXT,
|
70
|
+
specs TEXT)""")
|
69
71
|
_CONN.commit()
|
70
72
|
|
71
73
|
db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
|
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
|
|
92
94
|
'TEXT',
|
93
95
|
copy_from='job_name')
|
94
96
|
|
97
|
+
# Specs is some useful information about the task, e.g., the
|
98
|
+
# max_restarts_on_errors value. It is stored in JSON format.
|
99
|
+
db_utils.add_column_to_table(_CURSOR,
|
100
|
+
_CONN,
|
101
|
+
'spot',
|
102
|
+
'specs',
|
103
|
+
'TEXT',
|
104
|
+
value_to_replace_existing_entries=json.dumps({
|
105
|
+
'max_restarts_on_errors': 0,
|
106
|
+
}))
|
107
|
+
|
95
108
|
# `job_info` contains the mapping from job_id to the job_name.
|
96
109
|
# In the future, it may contain more information about each job.
|
97
110
|
_CURSOR.execute("""\
|
@@ -130,7 +143,8 @@ columns = [
|
|
130
143
|
'task_name',
|
131
144
|
# columns from the job_info table
|
132
145
|
'_job_info_job_id', # This should be the same as job_id
|
133
|
-
'job_name'
|
146
|
+
'job_name',
|
147
|
+
'specs',
|
134
148
|
]
|
135
149
|
|
136
150
|
|
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
283
297
|
|
284
298
|
def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
285
299
|
submit_time: float, resources_str: str,
|
286
|
-
|
300
|
+
specs: Dict[str, Union[str,
|
301
|
+
int]], callback_func: CallbackType):
|
287
302
|
"""Set the task to submitted.
|
288
303
|
|
289
304
|
Args:
|
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
293
308
|
determine the log directory of the managed task.
|
294
309
|
submit_time: The time when the managed task is submitted.
|
295
310
|
resources_str: The resources string of the managed task.
|
311
|
+
specs: The specs of the managed task.
|
312
|
+
callback_func: The callback function.
|
296
313
|
"""
|
297
314
|
# Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
|
298
315
|
# the log directory and submission time align with each other, so as to
|
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
306
323
|
resources=(?),
|
307
324
|
submitted_at=(?),
|
308
325
|
status=(?),
|
309
|
-
run_timestamp=(?)
|
326
|
+
run_timestamp=(?),
|
327
|
+
specs=(?)
|
310
328
|
WHERE spot_job_id=(?) AND
|
311
329
|
task_id=(?)""",
|
312
330
|
(resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
|
313
|
-
run_timestamp, job_id, task_id))
|
331
|
+
run_timestamp, json.dumps(specs), job_id, task_id))
|
314
332
|
callback_func('SUBMITTED')
|
315
333
|
|
316
334
|
|
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
|
|
619
637
|
for (job_id,) in rows:
|
620
638
|
return job_id
|
621
639
|
return None
|
640
|
+
|
641
|
+
|
642
|
+
def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
|
643
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
644
|
+
task_specs = cursor.execute(
|
645
|
+
"""\
|
646
|
+
SELECT specs FROM spot
|
647
|
+
WHERE spot_job_id=(?) AND task_id=(?)""",
|
648
|
+
(job_id, task_id)).fetchone()
|
649
|
+
return json.loads(task_specs[0])
|
sky/jobs/utils.py
CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
|
|
70
70
|
# state, after the job finished. This is a safeguard to avoid the case where
|
71
71
|
# the managed job status fails to be updated and keep the `sky jobs logs`
|
72
72
|
# blocking for a long time.
|
73
|
-
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
|
73
|
+
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
|
74
74
|
|
75
75
|
|
76
76
|
class UserSignal(enum.Enum):
|
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
392
392
|
f'INFO: Log for the current task ({task_id}) '
|
393
393
|
'is finished. Waiting for the next task\'s log '
|
394
394
|
'to be started.')
|
395
|
-
|
396
|
-
|
395
|
+
# Add a newline to avoid the status display below
|
396
|
+
# removing the last line of the task output.
|
397
|
+
print()
|
398
|
+
status_display.update(
|
399
|
+
ux_utils.spinner_message(
|
400
|
+
f'Waiting for the next task: {task_id + 1}'))
|
397
401
|
status_display.start()
|
398
402
|
original_task_id = task_id
|
399
403
|
while True:
|
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
405
409
|
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
406
410
|
continue
|
407
411
|
else:
|
408
|
-
break
|
412
|
+
task_specs = managed_job_state.get_task_specs(
|
413
|
+
job_id, task_id)
|
414
|
+
if task_specs.get('max_restarts_on_errors', 0) == 0:
|
415
|
+
# We don't need to wait for the managed job status
|
416
|
+
# update, as the job is guaranteed to be in terminal
|
417
|
+
# state afterwards.
|
418
|
+
break
|
419
|
+
print()
|
420
|
+
status_display.update(
|
421
|
+
ux_utils.spinner_message(
|
422
|
+
'Waiting for next restart for the failed task'))
|
423
|
+
status_display.start()
|
424
|
+
while True:
|
425
|
+
_, managed_job_status = (
|
426
|
+
managed_job_state.get_latest_task_id_status(
|
427
|
+
job_id))
|
428
|
+
if (managed_job_status !=
|
429
|
+
managed_job_state.ManagedJobStatus.RUNNING):
|
430
|
+
break
|
431
|
+
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
432
|
+
continue
|
409
433
|
# The job can be cancelled by the user or the controller (when
|
410
434
|
# the cluster is partially preempted).
|
411
435
|
logger.debug(
|
sky/optimizer.py
CHANGED
@@ -831,13 +831,17 @@ class Optimizer:
|
|
831
831
|
return row
|
832
832
|
|
833
833
|
def _get_resource_group_hash(resources: 'resources_lib.Resources'):
|
834
|
-
|
835
|
-
{
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
834
|
+
resource_key_dict = {
|
835
|
+
'cloud': f'{resources.cloud}',
|
836
|
+
'accelerators': f'{resources.accelerators}',
|
837
|
+
'use_spot': resources.use_spot
|
838
|
+
}
|
839
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
840
|
+
# Region for Kubernetes is the context name, i.e. different
|
841
|
+
# Kubernetes clusters. We add region to the key to show all the
|
842
|
+
# Kubernetes clusters in the optimizer table for better UX.
|
843
|
+
resource_key_dict['region'] = resources.region
|
844
|
+
return json.dumps(resource_key_dict, sort_keys=True)
|
841
845
|
|
842
846
|
# Print the list of resouces that the optimizer considered.
|
843
847
|
resource_fields = [
|
@@ -14,6 +14,12 @@
|
|
14
14
|
"description": "Subnet parameters."
|
15
15
|
}
|
16
16
|
},
|
17
|
+
"location": {
|
18
|
+
"type": "string",
|
19
|
+
"metadata": {
|
20
|
+
"description": "Location of where the resources are allocated."
|
21
|
+
}
|
22
|
+
},
|
17
23
|
"nsgName": {
|
18
24
|
"type": "string",
|
19
25
|
"metadata": {
|
@@ -23,7 +29,7 @@
|
|
23
29
|
},
|
24
30
|
"variables": {
|
25
31
|
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
|
26
|
-
"location": "[resourceGroup().location]",
|
32
|
+
"location": "[parameters('location')]",
|
27
33
|
"msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
|
28
34
|
"roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
|
29
35
|
"nsgName": "[parameters('nsgName')]",
|
sky/provision/azure/config.py
CHANGED
@@ -14,13 +14,12 @@ from sky import exceptions
|
|
14
14
|
from sky import sky_logging
|
15
15
|
from sky.adaptors import azure
|
16
16
|
from sky.provision import common
|
17
|
+
from sky.provision import constants
|
17
18
|
from sky.utils import common_utils
|
18
19
|
|
19
20
|
logger = sky_logging.init_logger(__name__)
|
20
21
|
|
21
22
|
UNIQUE_ID_LEN = 4
|
22
|
-
_DEPLOYMENT_NAME = 'skypilot-config'
|
23
|
-
_LEGACY_DEPLOYMENT_NAME = 'ray-config'
|
24
23
|
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
|
25
24
|
_CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
|
26
25
|
|
@@ -82,46 +81,55 @@ def bootstrap_instances(
|
|
82
81
|
in provider_config), 'Provider config must include location field'
|
83
82
|
params = {'location': provider_config['location']}
|
84
83
|
|
84
|
+
assert ('use_external_resource_group'
|
85
|
+
in provider_config), ('Provider config must include '
|
86
|
+
'use_external_resource_group field')
|
87
|
+
use_external_resource_group = provider_config['use_external_resource_group']
|
88
|
+
|
85
89
|
if 'tags' in provider_config:
|
86
90
|
params['tags'] = provider_config['tags']
|
87
91
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
92
|
+
# When resource group is user specified, it already exists in certain
|
93
|
+
# region.
|
94
|
+
if not use_external_resource_group:
|
95
|
+
logger.info(f'Creating/Updating resource group: {resource_group}')
|
96
|
+
rg_create_or_update = get_azure_sdk_function(
|
97
|
+
client=resource_client.resource_groups,
|
98
|
+
function_name='create_or_update')
|
99
|
+
rg_creation_start = time.time()
|
100
|
+
retry = 0
|
101
|
+
while (time.time() - rg_creation_start <
|
102
|
+
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
|
103
|
+
try:
|
104
|
+
rg_create_or_update(resource_group_name=resource_group,
|
105
|
+
parameters=params)
|
106
|
+
break
|
107
|
+
except azure.exceptions().ResourceExistsError as e:
|
108
|
+
if 'ResourceGroupBeingDeleted' in str(e):
|
109
|
+
if retry % 5 == 0:
|
110
|
+
logger.info(
|
111
|
+
f'Azure resource group {resource_group} of a '
|
112
|
+
'recent terminated cluster '
|
113
|
+
f'{cluster_name_on_cloud} is being deleted. It can'
|
114
|
+
' only be provisioned after it is fully deleted. '
|
115
|
+
'Waiting...')
|
116
|
+
time.sleep(1)
|
117
|
+
retry += 1
|
118
|
+
continue
|
119
|
+
raise
|
120
|
+
except azure.exceptions().ClientAuthenticationError as e:
|
121
|
+
message = (
|
122
|
+
'Failed to authenticate with Azure. Please check your '
|
123
|
+
'Azure credentials. Error: '
|
124
|
+
f'{common_utils.format_exception(e)}').replace('\n', ' ')
|
125
|
+
logger.error(message)
|
126
|
+
raise exceptions.NoClusterLaunchedError(message) from e
|
127
|
+
else:
|
113
128
|
message = (
|
114
|
-
'Failed to authenticate with Azure. Please check your '
|
115
|
-
|
116
|
-
).replace('\n', ' ')
|
129
|
+
f'Timed out waiting for resource group {resource_group} to be '
|
130
|
+
'deleted.')
|
117
131
|
logger.error(message)
|
118
|
-
raise
|
119
|
-
else:
|
120
|
-
message = (
|
121
|
-
f'Timed out waiting for resource group {resource_group} to be '
|
122
|
-
'deleted.')
|
123
|
-
logger.error(message)
|
124
|
-
raise TimeoutError(message)
|
132
|
+
raise TimeoutError(message)
|
125
133
|
|
126
134
|
# load the template file
|
127
135
|
current_path = Path(__file__).parent
|
@@ -155,6 +163,9 @@ def bootstrap_instances(
|
|
155
163
|
'nsgName': {
|
156
164
|
'value': nsg_name
|
157
165
|
},
|
166
|
+
'location': {
|
167
|
+
'value': params['location']
|
168
|
+
}
|
158
169
|
},
|
159
170
|
}
|
160
171
|
}
|
@@ -164,11 +175,22 @@ def bootstrap_instances(
|
|
164
175
|
get_deployment = get_azure_sdk_function(client=resource_client.deployments,
|
165
176
|
function_name='get')
|
166
177
|
deployment_exists = False
|
167
|
-
for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
|
178
|
+
if use_external_resource_group:
|
179
|
+
deployment_name = (
|
180
|
+
constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
|
181
|
+
cluster_name_on_cloud=cluster_name_on_cloud))
|
182
|
+
deployment_list = [deployment_name]
|
183
|
+
else:
|
184
|
+
deployment_name = constants.DEPLOYMENT_NAME
|
185
|
+
deployment_list = [
|
186
|
+
constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
|
187
|
+
]
|
188
|
+
|
189
|
+
for deploy_name in deployment_list:
|
168
190
|
try:
|
169
191
|
deployment = get_deployment(resource_group_name=resource_group,
|
170
|
-
deployment_name=deployment_name)
|
171
|
-
logger.info(f'Deployment {deployment_name!r} already exists. '
|
192
|
+
deployment_name=deploy_name)
|
193
|
+
logger.info(f'Deployment {deploy_name!r} already exists. '
|
172
194
|
'Skipping deployment creation.')
|
173
195
|
|
174
196
|
outputs = deployment.properties.outputs
|
@@ -179,22 +201,20 @@ def bootstrap_instances(
|
|
179
201
|
deployment_exists = False
|
180
202
|
|
181
203
|
if not deployment_exists:
|
182
|
-
logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
|
204
|
+
logger.info(f'Creating/Updating deployment: {deployment_name}')
|
183
205
|
create_or_update = get_azure_sdk_function(
|
184
206
|
client=resource_client.deployments,
|
185
207
|
function_name='create_or_update')
|
186
208
|
# TODO (skypilot): this takes a long time (> 40 seconds) to run.
|
187
209
|
outputs = create_or_update(
|
188
210
|
resource_group_name=resource_group,
|
189
|
-
deployment_name=_DEPLOYMENT_NAME,
|
211
|
+
deployment_name=deployment_name,
|
190
212
|
parameters=parameters,
|
191
213
|
).result().properties.outputs
|
192
214
|
|
193
|
-
nsg_id = outputs['nsg']['value']
|
194
|
-
|
195
215
|
# append output resource ids to be used with vm creation
|
196
216
|
provider_config['msi'] = outputs['msi']['value']
|
197
|
-
provider_config['nsg'] = nsg_id
|
217
|
+
provider_config['nsg'] = outputs['nsg']['value']
|
198
218
|
provider_config['subnet'] = outputs['subnet']['value']
|
199
219
|
|
200
220
|
return config
|