skypilot-nightly 1.0.0.dev20241030__py3-none-any.whl → 1.0.0.dev20241101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '9d50f192b262d5f6cc74b5b6644f3a9e3ea31f2f'
8
+ _SKYPILOT_COMMIT_SHA = 'bf17e87d2f7887ee7d741a298a7094d6d25656cb'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241030'
38
+ __version__ = '1.0.0.dev20241101'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/backends/backend_utils.py CHANGED
@@ -2498,10 +2498,12 @@ def get_task_resources_str(task: 'task_lib.Task',
2498
2498
  the accelerator demands (if any). Otherwise, the CPU demand is shown.
2499
2499
  """
2500
2500
  spot_str = ''
2501
+ is_controller_task = task.is_controller_task()
2501
2502
  task_cpu_demand = (str(constants.CONTROLLER_PROCESS_CPU_DEMAND)
2502
- if task.is_controller_task() else
2503
- str(DEFAULT_TASK_CPU_DEMAND))
2504
- if task.best_resources is not None:
2503
+ if is_controller_task else str(DEFAULT_TASK_CPU_DEMAND))
2504
+ if is_controller_task:
2505
+ resources_str = f'CPU:{task_cpu_demand}'
2506
+ elif task.best_resources is not None:
2505
2507
  accelerator_dict = task.best_resources.accelerators
2506
2508
  if is_managed_job:
2507
2509
  if task.best_resources.use_spot:
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1950,17 +1950,8 @@ class RetryingVmProvisioner(object):
1950
1950
 
1951
1951
  failover_history: List[Exception] = list()
1952
1952
 
1953
- style = colorama.Style
1954
- fore = colorama.Fore
1955
1953
  # Retrying launchable resources.
1956
1954
  while True:
1957
- if (isinstance(to_provision.cloud, clouds.Azure) and
1958
- to_provision.accelerators is not None and
1959
- 'A10' in to_provision.accelerators and prev_handle is None):
1960
- logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
1961
- 'an A10 cluster on Azure. This may take ~20 '
1962
- 'minutes due to driver installation.'
1963
- f'{style.RESET_ALL}')
1964
1955
  try:
1965
1956
  # Recheck cluster name as the 'except:' block below may
1966
1957
  # change the cloud assignment.
@@ -2123,8 +2114,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2123
2114
  self._version = self._VERSION
2124
2115
  self.cluster_name = cluster_name
2125
2116
  self.cluster_name_on_cloud = cluster_name_on_cloud
2126
- self._cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~',
2127
- 1)
2117
+ # Replace the home directory with ~ for better robustness across systems
2118
+ # with different home directories.
2119
+ if cluster_yaml.startswith(os.path.expanduser('~')):
2120
+ cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
2121
+ self._cluster_yaml = cluster_yaml
2128
2122
  # List of (internal_ip, feasible_ip) tuples for all the nodes in the
2129
2123
  # cluster, sorted by the feasible ips. The feasible ips can be either
2130
2124
  # internal or external ips, depending on the use_internal_ips flag.
@@ -2476,7 +2470,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2476
2470
  """Returns number of IPs per node in the cluster, handling TPU Pod."""
2477
2471
  is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
2478
2472
  if is_tpu_vm_pod:
2479
- num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources)
2473
+ num_ips = len(self.internal_ips())
2480
2474
  else:
2481
2475
  num_ips = 1
2482
2476
  return num_ips
@@ -2714,7 +2708,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2714
2708
  (e.g., cluster name invalid) or a region/zone throwing
2715
2709
  resource unavailability.
2716
2710
  exceptions.CommandError: any ssh command error.
2717
- RuntimeErorr: raised when 'rsync' is not installed.
2711
+ RuntimeError: raised when 'rsync' is not installed.
2718
2712
  # TODO(zhwu): complete the list of exceptions.
2719
2713
  """
2720
2714
  # FIXME: ray up for Azure with different cluster_names will overwrite
sky/cli.py CHANGED
@@ -555,6 +555,7 @@ def _launch_with_confirm(
555
555
  retry_until_up: bool = False,
556
556
  no_setup: bool = False,
557
557
  clone_disk_from: Optional[str] = None,
558
+ fast: bool = False,
558
559
  ):
559
560
  """Launch a cluster with a Task."""
560
561
  if cluster is None:
@@ -619,6 +620,7 @@ def _launch_with_confirm(
619
620
  retry_until_up=retry_until_up,
620
621
  no_setup=no_setup,
621
622
  clone_disk_from=clone_disk_from,
623
+ fast=fast,
622
624
  )
623
625
 
624
626
 
@@ -1040,6 +1042,13 @@ def cli():
1040
1042
  help=('[Experimental] Clone disk from an existing cluster to launch '
1041
1043
  'a new one. This is useful when the new cluster needs to have '
1042
1044
  'the same data on the boot disk as an existing cluster.'))
1045
+ @click.option(
1046
+ '--fast',
1047
+ is_flag=True,
1048
+ default=False,
1049
+ required=False,
1050
+ help=('[Experimental] If the cluster is already up and available, skip '
1051
+ 'provisioning and setup steps.'))
1043
1052
  @usage_lib.entrypoint
1044
1053
  def launch(
1045
1054
  entrypoint: Tuple[str, ...],
@@ -1071,6 +1080,7 @@ def launch(
1071
1080
  yes: bool,
1072
1081
  no_setup: bool,
1073
1082
  clone_disk_from: Optional[str],
1083
+ fast: bool,
1074
1084
  ):
1075
1085
  """Launch a cluster or task.
1076
1086
 
@@ -1139,7 +1149,8 @@ def launch(
1139
1149
  down=down,
1140
1150
  retry_until_up=retry_until_up,
1141
1151
  no_setup=no_setup,
1142
- clone_disk_from=clone_disk_from)
1152
+ clone_disk_from=clone_disk_from,
1153
+ fast=fast)
1143
1154
 
1144
1155
 
1145
1156
  @cli.command(cls=_DocumentedCodeCommand)
@@ -3549,6 +3560,15 @@ def jobs():
3549
3560
  default=False,
3550
3561
  required=False,
3551
3562
  help='Skip confirmation prompt.')
3563
+ # TODO(cooperc): remove this flag once --fast can robustly detect cluster
3564
+ # yaml config changes
3565
+ @click.option('--fast',
3566
+ default=False,
3567
+ is_flag=True,
3568
+ help='[Experimental] Launch the job faster by skipping '
3569
+ 'controller initialization steps. If you update SkyPilot or '
3570
+ 'your local cloud credentials, they will not be reflected until '
3571
+ 'you run `sky jobs launch` at least once without this flag.')
3552
3572
  @timeline.event
3553
3573
  @usage_lib.entrypoint
3554
3574
  def jobs_launch(
@@ -3575,6 +3595,7 @@ def jobs_launch(
3575
3595
  detach_run: bool,
3576
3596
  retry_until_up: bool,
3577
3597
  yes: bool,
3598
+ fast: bool,
3578
3599
  ):
3579
3600
  """Launch a managed job from a YAML or a command.
3580
3601
 
@@ -3658,7 +3679,8 @@ def jobs_launch(
3658
3679
  managed_jobs.launch(dag,
3659
3680
  name,
3660
3681
  detach_run=detach_run,
3661
- retry_until_up=retry_until_up)
3682
+ retry_until_up=retry_until_up,
3683
+ fast=fast)
3662
3684
 
3663
3685
 
3664
3686
  @jobs.command('queue', cls=_DocumentedCodeCommand)
sky/clouds/azure.py CHANGED
@@ -44,6 +44,8 @@ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
44
44
  _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
45
45
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
46
46
  _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
47
+ # This is used by Azure GPU VMs that use grid drivers (e.g. A10).
48
+ _DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
47
49
 
48
50
  _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
49
51
 
@@ -220,6 +222,8 @@ class Azure(clouds.Cloud):
220
222
  acc_name = list(acc.keys())[0]
221
223
  if acc_name == 'K80':
222
224
  return _DEFAULT_GPU_K80_IMAGE_ID
225
+ if acc_name == 'A10':
226
+ return _DEFAULT_GPU_GRID_IMAGE_ID
223
227
  # About Gen V1 vs V2:
224
228
  # In Azure, all instances with K80 (Standard_NC series), some
225
229
  # instances with M60 (Standard_NV series) and some cpu instances
@@ -350,10 +354,6 @@ class Azure(clouds.Cloud):
350
354
  'image_version': version,
351
355
  }
352
356
 
353
- # Setup the A10 nvidia driver.
354
- need_nvidia_driver_extension = (acc_dict is not None and
355
- 'A10' in acc_dict)
356
-
357
357
  # Determine resource group for deploying the instance.
358
358
  resource_group_name = skypilot_config.get_nested(
359
359
  ('azure', 'resource_group_vm'), None)
@@ -413,7 +413,6 @@ class Azure(clouds.Cloud):
413
413
  # Azure does not support specific zones.
414
414
  'zones': None,
415
415
  **image_config,
416
- 'need_nvidia_driver_extension': need_nvidia_driver_extension,
417
416
  'disk_tier': Azure._get_disk_type(disk_tier),
418
417
  'cloud_init_setup_commands': cloud_init_setup_commands,
419
418
  'azure_subscription_id': self.get_project_id(dryrun),
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -47,6 +47,10 @@ TPU_RETRY_CNT = 3
47
47
  TPU_V4_ZONES = ['us-central2-b']
48
48
  # TPU v3 pods are available in us-east1-d, but hidden in the skus.
49
49
  # We assume the TPU prices are the same as us-central1.
50
+ # TPU v6e's pricing info is not available on the SKUs. However, in
51
+ # https://cloud.google.com/tpu/pricing, it listed the price for 4 regions:
52
+ # us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
53
+ # and filtered out the other regions (us-central{1,2}, us-south1).
50
54
  HIDDEN_TPU_DF = pd.read_csv(
51
55
  io.StringIO(
52
56
  textwrap.dedent("""\
@@ -58,8 +62,50 @@ HIDDEN_TPU_DF = pd.read_csv(
58
62
  ,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
59
63
  ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
60
64
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
65
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
66
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
67
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
68
+ ,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
69
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
70
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
71
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
72
+ ,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
73
+ ,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
74
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
75
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
76
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
77
+ ,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
78
+ ,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
79
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
80
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
81
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
82
+ ,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
83
+ ,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
84
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
85
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
86
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
87
+ ,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
88
+ ,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
89
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
90
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
91
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
92
+ ,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
93
+ ,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
94
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
95
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
96
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
97
+ ,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
98
+ ,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
99
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
100
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
101
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
102
+ ,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
103
+ ,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
104
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
61
105
  """)))
62
106
 
107
+ TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1']
108
+
63
109
  # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
64
110
  # NOTE(dev): Keep the zones and the df in sync.
65
111
  TPU_V5_MISSING_ZONES_DF = {
@@ -683,11 +729,13 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
683
729
  'not found in SKUs or hidden TPU price DF.')
684
730
  # TODO(tian): Hack. Should investigate how to retrieve the price
685
731
  # for TPU-v6e.
686
- if not tpu_name.startswith('tpu-v6e'):
732
+ if (tpu_name.startswith('tpu-v6e') and
733
+ tpu_region in TPU_V6E_MISSING_REGIONS):
734
+ if not spot:
735
+ tpu_price = 0.0
736
+ else:
687
737
  assert spot or tpu_price is not None, (row, hidden_tpu,
688
738
  HIDDEN_TPU_DF)
689
- else:
690
- tpu_price = 0.0
691
739
  return tpu_price
692
740
 
693
741
  df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
sky/clouds/utils/gcp_utils.py CHANGED
@@ -49,14 +49,6 @@ def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool:
49
49
  return not acc.endswith('-8')
50
50
 
51
51
 
52
- def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int:
53
- if resources is None or not is_tpu(resources):
54
- raise ValueError('resources must be a valid TPU resource.')
55
- acc, _ = list(resources.accelerators.items())[0]
56
- num_tpu_devices = int(int(acc.split('-')[2]) / 8)
57
- return num_tpu_devices
58
-
59
-
60
52
  @dataclasses.dataclass
61
53
  class SpecificReservation:
62
54
  count: int
sky/data/storage.py CHANGED
@@ -1082,16 +1082,31 @@ class S3Store(AbstractStore):
1082
1082
  for S3 buckets.
1083
1083
  """
1084
1084
 
1085
+ _DEFAULT_REGION = 'us-east-1'
1085
1086
  _ACCESS_DENIED_MESSAGE = 'Access Denied'
1087
+ _CUSTOM_ENDPOINT_REGIONS = [
1088
+ 'ap-east-1', 'me-south-1', 'af-south-1', 'eu-south-1', 'eu-south-2',
1089
+ 'ap-south-2', 'ap-southeast-3', 'ap-southeast-4', 'me-central-1',
1090
+ 'il-central-1'
1091
+ ]
1086
1092
 
1087
1093
  def __init__(self,
1088
1094
  name: str,
1089
1095
  source: str,
1090
- region: Optional[str] = 'us-east-2',
1096
+ region: Optional[str] = _DEFAULT_REGION,
1091
1097
  is_sky_managed: Optional[bool] = None,
1092
1098
  sync_on_reconstruction: bool = True):
1093
1099
  self.client: 'boto3.client.Client'
1094
1100
  self.bucket: 'StorageHandle'
1101
+ # TODO(romilb): This is purely a stopgap fix for
1102
+ # https://github.com/skypilot-org/skypilot/issues/3405
1103
+ # We should eventually make all opt-in regions also work for S3 by
1104
+ # passing the right endpoint flags.
1105
+ if region in self._CUSTOM_ENDPOINT_REGIONS:
1106
+ logger.warning('AWS opt-in regions are not supported for S3. '
1107
+ f'Falling back to default region '
1108
+ f'{self._DEFAULT_REGION} for bucket {name!r}.')
1109
+ region = self._DEFAULT_REGION
1095
1110
  super().__init__(name, source, region, is_sky_managed,
1096
1111
  sync_on_reconstruction)
1097
1112
 
@@ -1424,7 +1439,7 @@ class S3Store(AbstractStore):
1424
1439
 
1425
1440
  def _create_s3_bucket(self,
1426
1441
  bucket_name: str,
1427
- region='us-east-2') -> StorageHandle:
1442
+ region=_DEFAULT_REGION) -> StorageHandle:
1428
1443
  """Creates S3 bucket with specific name in specific region
1429
1444
 
1430
1445
  Args:
sky/execution.py CHANGED
@@ -11,6 +11,7 @@ import sky
11
11
  from sky import admin_policy
12
12
  from sky import backends
13
13
  from sky import clouds
14
+ from sky import exceptions
14
15
  from sky import global_user_state
15
16
  from sky import optimizer
16
17
  from sky import sky_logging
@@ -216,7 +217,8 @@ def _execute(
216
217
  '(after all jobs finish).'
217
218
  f'{colorama.Style.RESET_ALL}')
218
219
  idle_minutes_to_autostop = 1
219
- stages.remove(Stage.DOWN)
220
+ if Stage.DOWN in stages:
221
+ stages.remove(Stage.DOWN)
220
222
  if idle_minutes_to_autostop >= 0:
221
223
  requested_features.add(
222
224
  clouds.CloudImplementationFeatures.AUTO_TERMINATE)
@@ -355,6 +357,7 @@ def launch(
355
357
  detach_run: bool = False,
356
358
  no_setup: bool = False,
357
359
  clone_disk_from: Optional[str] = None,
360
+ fast: bool = False,
358
361
  # Internal only:
359
362
  # pylint: disable=invalid-name
360
363
  _is_launched_by_jobs_controller: bool = False,
@@ -409,6 +412,8 @@ def launch(
409
412
  clone_disk_from: [Experimental] if set, clone the disk from the
410
413
  specified cluster. This is useful to migrate the cluster to a
411
414
  different availability zone or region.
415
+ fast: [Experimental] If the cluster is already up and available,
416
+ skip provisioning and setup steps.
412
417
 
413
418
  Example:
414
419
  .. code-block:: python
@@ -452,15 +457,43 @@ def launch(
452
457
  controller_utils.check_cluster_name_not_controller(
453
458
  cluster_name, operation_str='sky.launch')
454
459
 
460
+ handle = None
461
+ stages = None
462
+ # Check if cluster exists and we are doing fast provisioning
463
+ if fast and cluster_name is not None:
464
+ maybe_handle = global_user_state.get_handle_from_cluster_name(
465
+ cluster_name)
466
+ if maybe_handle is not None:
467
+ try:
468
+ # This will throw if the cluster is not available
469
+ backend_utils.check_cluster_available(
470
+ cluster_name,
471
+ operation='executing tasks',
472
+ check_cloud_vm_ray_backend=False,
473
+ dryrun=dryrun)
474
+ handle = maybe_handle
475
+ # Get all stages
476
+ stages = [
477
+ Stage.SYNC_WORKDIR,
478
+ Stage.SYNC_FILE_MOUNTS,
479
+ Stage.PRE_EXEC,
480
+ Stage.EXEC,
481
+ Stage.DOWN,
482
+ ]
483
+ except exceptions.ClusterNotUpError:
484
+ # Proceed with normal provisioning
485
+ pass
486
+
455
487
  return _execute(
456
488
  entrypoint=entrypoint,
457
489
  dryrun=dryrun,
458
490
  down=down,
459
491
  stream_logs=stream_logs,
460
- handle=None,
492
+ handle=handle,
461
493
  backend=backend,
462
494
  retry_until_up=retry_until_up,
463
495
  optimize_target=optimize_target,
496
+ stages=stages,
464
497
  cluster_name=cluster_name,
465
498
  detach_setup=detach_setup,
466
499
  detach_run=detach_run,
sky/jobs/core.py CHANGED
@@ -36,6 +36,7 @@ def launch(
36
36
  stream_logs: bool = True,
37
37
  detach_run: bool = False,
38
38
  retry_until_up: bool = False,
39
+ fast: bool = False,
39
40
  ) -> None:
40
41
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
41
42
  """Launch a managed job.
@@ -47,6 +48,9 @@ def launch(
47
48
  managed job.
48
49
  name: Name of the managed job.
49
50
  detach_run: Whether to detach the run.
51
+ fast: Whether to use sky.launch(fast=True) for the jobs controller. If
52
+ True, the SkyPilot wheel and the cloud credentials may not be updated
53
+ on the jobs controller.
50
54
 
51
55
  Raises:
52
56
  ValueError: cluster does not exist. Or, the entrypoint is not a valid
@@ -138,6 +142,7 @@ def launch(
138
142
  idle_minutes_to_autostop=skylet_constants.
139
143
  CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
140
144
  retry_until_up=True,
145
+ fast=fast,
141
146
  _disable_controller_check=True)
142
147
 
143
148
 
sky/jobs/state.py CHANGED
@@ -141,10 +141,10 @@ columns = [
141
141
  'job_id',
142
142
  'task_id',
143
143
  'task_name',
144
+ 'specs',
144
145
  # columns from the job_info table
145
146
  '_job_info_job_id', # This should be the same as job_id
146
147
  'job_name',
147
- 'specs',
148
148
  ]
149
149
 
150
150
 
sky/provision/azure/instance.py CHANGED
@@ -311,30 +311,10 @@ def _create_vm(
311
311
  vm_name=vm_name,
312
312
  parameters=vm_instance,
313
313
  )
314
- # poller.result() will block on async operation until it's done.
315
- logger.info(f'Created VM {vm_poller.result().name}.')
316
- # Configure driver extension for A10 GPUs. A10 GPUs requires a
317
- # special type of drivers which is available at Microsoft HPC
318
- # extension. Reference:
319
- # https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
320
- # This can take more than 20mins for setting up the A10 GPUs
321
- if node_config.get('need_nvidia_driver_extension', False):
322
- ext_poller = compute_client.virtual_machine_extensions.\
323
- begin_create_or_update(
324
- resource_group_name=provider_config['resource_group'],
325
- vm_name=vm_name,
326
- vm_extension_name='NvidiaGpuDriverLinux',
327
- extension_parameters=compute.VirtualMachineExtension(
328
- location=provider_config['location'],
329
- publisher='Microsoft.HpcCompute',
330
- type_properties_type='NvidiaGpuDriverLinux',
331
- type_handler_version='1.9',
332
- auto_upgrade_minor_version=True,
333
- settings='{}'))
334
- logger.info(
335
- f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
336
- )
337
- return vm_poller.result()
314
+ # This line will block until the VM is created or the operation times out.
315
+ vm = vm_poller.result()
316
+ logger.info(f'Created VM {vm.name}.')
317
+ return vm
338
318
 
339
319
 
340
320
  def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
sky/resources.py CHANGED
@@ -602,6 +602,9 @@ class Resources:
602
602
  # TPU V5 requires a newer runtime version.
603
603
  if acc.startswith('tpu-v5'):
604
604
  return 'v2-alpha-tpuv5'
605
+ # TPU V6e requires a newer runtime version.
606
+ if acc.startswith('tpu-v6e'):
607
+ return 'v2-alpha-tpuv6e'
605
608
  return 'tpu-vm-base'
606
609
 
607
610
  accelerator_args['runtime_version'] = (
sky/templates/azure-ray.yml.j2 CHANGED
@@ -83,7 +83,6 @@ available_node_types:
83
83
  {%- for cmd in cloud_init_setup_commands %}
84
84
  {{ cmd }}
85
85
  {%- endfor %}
86
- need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
87
86
  {%- if disk_performance_tier is not none %}
88
87
  disk_performance_tier: {{disk_performance_tier}}
89
88
  {%- endif %}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241030
3
+ Version: 1.0.0.dev20241101
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,16 +1,16 @@
1
- sky/__init__.py,sha256=WwnJbF2ubaAJEJkUGPJ7jK5mh3QD1r487evpncErtC8,5882
1
+ sky/__init__.py,sha256=luv3b5qtKG5RjX-jN3EqZKJ-0v975DsXfGfSEUeU4AU,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
5
- sky/cli.py,sha256=VoPwWKGeNZZcFNLvw3VPR_F0WpKnM5EvfffNS8kcKc0,210360
5
+ sky/cli.py,sha256=6umPcFovU5sHIUdC0B9lfOstzWLA0DPS5x6dg1EOkeQ,211193
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
9
  sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
10
- sky/execution.py,sha256=tDK6JhF_405cjqxRpbdLbHZyxrKTD5oa0UkKDvPJ_9Q,24751
10
+ sky/execution.py,sha256=HF76sz-gCEZPGkuL48jJaLOTqjuHg0KysgKaPw-hn84,25997
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
13
- sky/resources.py,sha256=7kVpLRfy3DFFgmEji0_Xz6FbrvBDUSXC6K0bsRIK3hA,68290
13
+ sky/resources.py,sha256=Zt8mCCmdvZ5ZCqY-l3KXlx_lkUesAopRtaEcEsrRFZo,68465
14
14
  sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
15
15
  sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
16
16
  sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -30,8 +30,8 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
30
30
  sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
31
31
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
32
32
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
33
- sky/backends/backend_utils.py,sha256=LmLsaLiPuuUyGebOXykdvwZpUY-8sB7n4o2AnmwNmdQ,121714
34
- sky/backends/cloud_vm_ray_backend.py,sha256=ZWAzdmKzSf3qalDoKfmLGaO3PywjLtIA5Q3AeeHhvHA,233158
33
+ sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
34
+ sky/backends/cloud_vm_ray_backend.py,sha256=jdG17FDAOUoHjXib2P73Hhdl9yXoDJxPTY5Dyqvp6j4,232757
35
35
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
36
36
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
37
37
  sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -41,7 +41,7 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
41
41
  sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
42
42
  sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
43
43
  sky/clouds/aws.py,sha256=dVZ8auaa2z2Ifl9iiRT06IeEFaNtZhANKtHVLT6Gcno,49474
44
- sky/clouds/azure.py,sha256=-K-VPV2sYJJAfJbDcPAiNhNVhMQYkRBuYHQRb3-MGIQ,30598
44
+ sky/clouds/azure.py,sha256=ixw5jCnnMxDLj0hpArljVzq88EKOrqRxk9xm5N9u-mc,30576
45
45
  sky/clouds/cloud.py,sha256=A5F4a71ciPyljWEs6vT-4RmdGT-AE9NkhS8gJ4Vgi_I,35165
46
46
  sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
47
47
  sky/clouds/cudo.py,sha256=UiY273Sln7VOYDYx93yWiWH_RLlOKZ2cm7mA31ld4A8,13094
@@ -78,27 +78,27 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFy
78
78
  sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzEPBBNE9XOZM0K0FIXbBUMj9h0MQ,12803
79
79
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
80
80
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
81
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=5CbgU90ldiKVgaagQTnYBJVsgVGE3cMwtF7KpBiTtvU,29873
81
+ sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
82
82
  sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
83
83
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
84
84
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
86
86
  sky/clouds/utils/azure_utils.py,sha256=NToRBnhEyuUvb-nBnsKTxjhOBRkMcrelL8LK4w6s4t8,3555
87
- sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
87
+ sky/clouds/utils/gcp_utils.py,sha256=QejfgXOIVRv5-fv3Soi96VeVNVyquwVwy3M58N3YfNs,6633
88
88
  sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
89
89
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
90
90
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
91
91
  sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
92
92
  sky/data/data_utils.py,sha256=-P5GsDH_m4slrCz4vHdgiFezIys8ufzvhEKePJwfjFc,28597
93
93
  sky/data/mounting_utils.py,sha256=44YkYIIgArEkyvxCtfmXXumybrU8bmn1TfLXWv_eldI,11480
94
- sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
94
+ sky/data/storage.py,sha256=OQ_kznF-P50Jq0feO5FBqm97QGhfbsZ2dX-Ar3sVWr4,163903
95
95
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
96
96
  sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
97
97
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
98
98
  sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
99
- sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
99
+ sky/jobs/core.py,sha256=w7PancHi8_-afLKZQ3HHMD1sEDoepm1vEMxyDlXdo64,17155
100
100
  sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
101
- sky/jobs/state.py,sha256=TV1G12vEMQJRgwWXsAjb3lmkJqkZmAOUUOja2QQPrg8,24307
101
+ sky/jobs/state.py,sha256=exN6BdJlLBzFTccJCSHN4dNjVeYFgTgqgxOaHwLw2IQ,24307
102
102
  sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
103
103
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
104
104
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
@@ -118,7 +118,7 @@ sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,32
118
118
  sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
119
119
  sky/provision/azure/azure-config-template.json,sha256=jrjAgOtpe0e6FSg3vsVqHKQqJe0w-HeWOFT1HuwzS2c,4712
120
120
  sky/provision/azure/config.py,sha256=V5-0Zelt4Xo0vcqnD6PpsnaCS7vc3xosDelILDAKSW4,8885
121
- sky/provision/azure/instance.py,sha256=dq67O6gwvNN0jrBklgJ8AnrNj784aqyLl7PHeB5xVQA,50088
121
+ sky/provision/azure/instance.py,sha256=Xd1paLWVc6eVHzphOjZB4_BeXZNX7GYgPV9kH3GWvsc,48983
122
122
  sky/provision/cudo/__init__.py,sha256=KAEl26MVPsk7IoP9Gg-MOJJRIV6-X9B0fbyHdyJWdLo,741
123
123
  sky/provision/cudo/config.py,sha256=RYOVkV0MoUqVBJRZiKhBZhjFygeyFs7eUdVMdPg1vds,327
124
124
  sky/provision/cudo/cudo_machine_type.py,sha256=_VNXWPELmlFXbtdcnPvkuLuyE9CZ923BUCdiac-ClDY,696
@@ -219,7 +219,7 @@ sky/skylet/ray_patches/resource_demand_scheduler.py.patch,sha256=AVV-Hw-Rxw16aFm
219
219
  sky/skylet/ray_patches/updater.py.patch,sha256=ZNMGVYICPBB44jLbEx2KvCgIY7BWYdDv3-2b2HJWmAQ,289
220
220
  sky/skylet/ray_patches/worker.py.patch,sha256=_OBhibdr3xOy5Qje6Tt8D1eQVm_msi50TJbCJmOTxVU,565
221
221
  sky/templates/aws-ray.yml.j2,sha256=K0rAuyf1XC_GPFp1BR9df42-Be12A6T2UF0BllVSpYg,8005
222
- sky/templates/azure-ray.yml.j2,sha256=l8zBUVfMPNRlKpn3l7_D3yXpdrUoSeykUuZRy0UoCLQ,6308
222
+ sky/templates/azure-ray.yml.j2,sha256=uUneIfT5vTLUCvrZXiv2dsd3gFqLH2FK632oBruOO_k,6237
223
223
  sky/templates/cudo-ray.yml.j2,sha256=SEHVY57iBauCOE2HYJtYVFEKlriAkdwQu_p86a1n_bA,3548
224
224
  sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
225
225
  sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
276
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241030.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241030.dist-info/METADATA,sha256=bwgfsg4Zzl63yZYrUfZIBNeMitC8bOcgqKucALPDnbk,19708
279
- skypilot_nightly-1.0.0.dev20241030.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
- skypilot_nightly-1.0.0.dev20241030.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241030.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241030.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241101.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241101.dist-info/METADATA,sha256=eZ2YLqsd-uqiw7BkwG8nZPtPw70Yzs5ZPTLrBVNH1-k,19708
279
+ skypilot_nightly-1.0.0.dev20241101.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
+ skypilot_nightly-1.0.0.dev20241101.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241101.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241101.dist-info/RECORD,,