skypilot-nightly 1.0.0.dev20250319__py3-none-any.whl → 1.0.0.dev20250321__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (55)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/cloudflare.py +19 -3
  3. sky/adaptors/kubernetes.py +2 -1
  4. sky/adaptors/nebius.py +128 -6
  5. sky/backends/cloud_vm_ray_backend.py +3 -1
  6. sky/benchmark/benchmark_utils.py +3 -2
  7. sky/check.py +89 -55
  8. sky/cloud_stores.py +66 -0
  9. sky/clouds/aws.py +14 -2
  10. sky/clouds/azure.py +13 -1
  11. sky/clouds/cloud.py +37 -2
  12. sky/clouds/cudo.py +3 -2
  13. sky/clouds/do.py +3 -2
  14. sky/clouds/fluidstack.py +3 -2
  15. sky/clouds/gcp.py +55 -34
  16. sky/clouds/ibm.py +15 -1
  17. sky/clouds/kubernetes.py +3 -1
  18. sky/clouds/lambda_cloud.py +3 -1
  19. sky/clouds/nebius.py +7 -3
  20. sky/clouds/oci.py +15 -1
  21. sky/clouds/paperspace.py +3 -2
  22. sky/clouds/runpod.py +7 -1
  23. sky/clouds/scp.py +3 -1
  24. sky/clouds/service_catalog/kubernetes_catalog.py +3 -1
  25. sky/clouds/utils/gcp_utils.py +11 -1
  26. sky/clouds/vast.py +3 -2
  27. sky/clouds/vsphere.py +3 -2
  28. sky/core.py +6 -2
  29. sky/data/data_transfer.py +75 -0
  30. sky/data/data_utils.py +34 -0
  31. sky/data/mounting_utils.py +18 -0
  32. sky/data/storage.py +542 -16
  33. sky/data/storage_utils.py +102 -84
  34. sky/exceptions.py +2 -0
  35. sky/global_user_state.py +15 -6
  36. sky/jobs/server/core.py +1 -1
  37. sky/jobs/utils.py +5 -0
  38. sky/optimizer.py +8 -2
  39. sky/provision/gcp/config.py +3 -3
  40. sky/provision/gcp/constants.py +16 -2
  41. sky/provision/gcp/instance.py +4 -1
  42. sky/provision/kubernetes/utils.py +26 -21
  43. sky/resources.py +6 -1
  44. sky/serve/replica_managers.py +10 -1
  45. sky/setup_files/dependencies.py +3 -1
  46. sky/task.py +16 -5
  47. sky/utils/command_runner.py +2 -0
  48. sky/utils/controller_utils.py +13 -4
  49. sky/utils/kubernetes/kubernetes_deploy_utils.py +4 -1
  50. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/METADATA +13 -2
  51. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/RECORD +55 -55
  52. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/WHEEL +1 -1
  53. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/entry_points.txt +0 -0
  54. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info/licenses}/LICENSE +0 -0
  55. {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py CHANGED
@@ -24,6 +24,9 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
     'to the cloud storage for {path!r}'
     'due to the following error: {error_msg!r}')
 
+_USE_SKYIGNORE_HINT = (
+    'To avoid using .gitignore, you can create a .skyignore file instead.')
+
 _LAST_USE_TRUNC_LENGTH = 25
 
 
@@ -109,10 +112,9 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
 def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     """ Lists files and patterns ignored by git in the source directory
 
-    Runs `git status --ignored` which returns a list of excluded files and
+    Runs `git ls-files --ignored ...` which returns a list of excluded files and
     patterns read from .gitignore and .git/info/exclude using git.
-    `git init` is run if SRC_DIR_PATH is not a git repository and removed
-    after obtaining excluded list.
+    This will also be run for all submodules under the src_dir_path.
 
     Returns:
         List[str] containing files and patterns to be ignored. Some of the
@@ -120,91 +122,107 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     """
     expand_src_dir_path = os.path.expanduser(src_dir_path)
 
-    git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
-    gitignore_path = os.path.join(expand_src_dir_path,
-                                  constants.GIT_IGNORE_FILE)
+    # We will use `git ls-files` to list files that we should ignore, but
+    # `ls-files` will not recurse into subdirectories. So, we need to manually
+    # list the submodules and run `ls-files` within the root and each
+    # submodule. Print the submodule paths relative to expand_src_dir_path,
+    # separated by null chars.
+    submodules_cmd = (f'git -C {shlex.quote(expand_src_dir_path)} '
+                      'submodule foreach -q "printf \\$displaypath\\\\\\0"')
+
+    try:
+        submodules_output = subprocess.run(submodules_cmd,
+                                           shell=True,
+                                           stdout=subprocess.PIPE,
+                                           stderr=subprocess.PIPE,
+                                           check=True,
+                                           text=True)
+    except subprocess.CalledProcessError as e:
+        gitignore_path = os.path.join(expand_src_dir_path,
+                                      constants.GIT_IGNORE_FILE)
+
+        if (e.returncode == exceptions.GIT_FATAL_EXIT_CODE and
+                'not a git repository' in e.stderr):
+            # If git failed because we aren't in a git repository, but there is
+            # a .gitignore, warn the user that it will be ignored.
+            if os.path.exists(gitignore_path):
+                logger.warning('Detected a .gitignore file, but '
+                               f'{src_dir_path} is not a git repository. The '
+                               '.gitignore file will be ignored. '
+                               f'{_USE_SKYIGNORE_HINT}')
+            # Otherwise, this is fine and we can exit early.
+            return []
+
+        if e.returncode == exceptions.COMMAND_NOT_FOUND_EXIT_CODE:
+            # Git is not installed. This is fine, skip the check.
+            # If .gitignore is present, warn the user.
+            if os.path.exists(gitignore_path):
+                logger.warning(f'Detected a .gitignore file in {src_dir_path}, '
+                               'but git is not installed. The .gitignore file '
+                               f'will be ignored. {_USE_SKYIGNORE_HINT}')
+            return []
+
+        # Pretty much any other error is unexpected, so re-raise.
+        raise
 
-    git_exclude_exists = os.path.isfile(git_exclude_path)
-    gitignore_exists = os.path.isfile(gitignore_path)
+    # submodules_output will contain each submodule path (relative to
+    # src_dir_path), each ending with a null character.
+    # .split will have an empty string at the end because of the final null
+    # char, so trim it.
+    submodules = submodules_output.stdout.split('\0')[:-1]
+
+    # The empty string is the relative reference to the src_dir_path.
+    all_git_repos = ['.'] + [
+        # We only care about submodules that are a subdirectory of
+        # src_dir_path.
+        submodule for submodule in submodules if not submodule.startswith('../')
+    ]
 
-    # This command outputs a list to be excluded according to .gitignore
-    # and .git/info/exclude
-    filter_cmd = (f'git -C {shlex.quote(expand_src_dir_path)} '
-                  'status --ignored --porcelain=v1')
     excluded_list: List[str] = []
+    for repo in all_git_repos:
+        # repo is the path relative to src_dir_path. Get the full path.
+        repo_path = os.path.join(expand_src_dir_path, repo)
+        # This command outputs a list to be excluded according to .gitignore,
+        # .git/info/exclude, and global exclude config.
+        # -z: filenames terminated by \0 instead of \n
+        # --others: show untracked files
+        # --ignore: out of untracked files, only show ignored files
+        # --exclude-standard: use standard exclude rules (required for --ignore)
+        # --directory: if an entire directory is ignored, collapse to a single
+        #   entry rather than listing every single file
+        # Since we are using --others instead of --cached, this will not show
+        # files that are tracked but also present in .gitignore.
+        filter_cmd = (f'git -C {shlex.quote(repo_path)} ls-files -z '
+                      '--others --ignore --exclude-standard --directory')
+        output = subprocess.run(filter_cmd,
+                                shell=True,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                check=True,
+                                text=True)
+        # Don't catch any errors. We would only expect to see errors during the
+        # first git invocation - so if we see any here, crash.
+
+        output_list = output.stdout.split('\0')
+        # trim the empty string at the end
+        output_list = output_list[:-1]
+
+        for item in output_list:
+
+            if repo == '.' and item == './':
+                logger.warning(f'{src_dir_path} is within a git repo, but the '
+                               'entire directory is ignored by git. We will '
+                               'ignore all git exclusions. '
+                               f'{_USE_SKYIGNORE_HINT}')
+                return []
+
+            to_be_excluded = os.path.join(repo, item)
+            if item.endswith('/'):
+                # aws s3 sync and gsutil rsync require * to exclude
                # files/dirs under the specified directory.
+                to_be_excluded += '*'
+
+            excluded_list.append(to_be_excluded)
 
-    if git_exclude_exists or gitignore_exists:
-        try:
-            output = subprocess.run(filter_cmd,
-                                    shell=True,
-                                    stdout=subprocess.PIPE,
-                                    stderr=subprocess.PIPE,
-                                    check=True,
-                                    text=True)
-        except subprocess.CalledProcessError as e:
-            # when the SRC_DIR_PATH is not a git repo and .git
-            # does not exist in it
-            if e.returncode == exceptions.GIT_FATAL_EXIT_CODE:
-                if 'not a git repository' in e.stderr:
-                    # Check if the user has 'write' permission to
-                    # SRC_DIR_PATH
-                    if not os.access(expand_src_dir_path, os.W_OK):
-                        error_msg = 'Write permission denial'
-                        logger.warning(
-                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
-                                path=src_dir_path, error_msg=error_msg))
-                        return excluded_list
-                    init_cmd = f'git -C {expand_src_dir_path} init'
-                    try:
-                        subprocess.run(init_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-                        output = subprocess.run(filter_cmd,
-                                                shell=True,
-                                                stdout=subprocess.PIPE,
-                                                stderr=subprocess.PIPE,
-                                                check=True,
-                                                text=True)
-                    except subprocess.CalledProcessError as init_e:
-                        logger.warning(
-                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
-                                path=src_dir_path, error_msg=init_e.stderr))
-                        return excluded_list
-                    if git_exclude_exists:
-                        # removes all the files/dirs created with 'git init'
-                        # under .git/ except .git/info/exclude
-                        remove_files_cmd = (f'find {expand_src_dir_path}' \
-                                            f'/.git -path {git_exclude_path}' \
-                                            ' -prune -o -type f -exec rm -f ' \
-                                            '{} +')
-                        remove_dirs_cmd = (f'find {expand_src_dir_path}' \
-                                           f'/.git -path {git_exclude_path}' \
-                                           ' -o -type d -empty -delete')
-                        subprocess.run(remove_files_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-                        subprocess.run(remove_dirs_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-
-        output_list = output.stdout.split('\n')
-        for line in output_list:
-            # FILTER_CMD outputs items preceded by '!!'
-            # to specify excluded files/dirs
-            # e.g., '!! mydir/' or '!! mydir/myfile.txt'
-            if line.startswith('!!'):
-                to_be_excluded = line[3:]
-                if line.endswith('/'):
-                    # aws s3 sync and gsutil rsync require * to exclude
-                    # files/dirs under the specified directory.
-                    to_be_excluded += '*'
-                excluded_list.append(to_be_excluded)
 
     return excluded_list
 
 
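The core of the new exclusion flow is parsing null-terminated `git ls-files` output. Below is a minimal, self-contained sketch of the same pattern; `list_git_ignored` is a hypothetical helper for illustration, not the SkyPilot function.

# Sketch: list git-ignored paths in one repository, assuming `git` is
# installed and `repo_path` is inside a git repository.
import shlex
import subprocess
from typing import List

def list_git_ignored(repo_path: str) -> List[str]:
    cmd = (f'git -C {shlex.quote(repo_path)} ls-files -z '
           '--others --ignore --exclude-standard --directory')
    out = subprocess.run(cmd,
                         shell=True,
                         capture_output=True,
                         check=True,
                         text=True)
    # The -z output is null-terminated, so splitting on '\0' leaves one
    # trailing empty string; drop it.
    entries = out.stdout.split('\0')[:-1]
    # Ignored directories come back with a trailing '/'; append '*' so
    # tools like `aws s3 sync --exclude` skip everything under them.
    return [e + '*' if e.endswith('/') else e for e in entries]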
sky/exceptions.py CHANGED
@@ -24,6 +24,8 @@ MOUNT_PATH_NON_EMPTY_CODE = 42
 INSUFFICIENT_PRIVILEGES_CODE = 52
 # Return code when git command is ran in a dir that is not git repo
 GIT_FATAL_EXIT_CODE = 128
+# Return code from bash when a command is not found
+COMMAND_NOT_FOUND_EXIT_CODE = 127
 # Architecture, such as arm64, not supported by the dependency
 ARCH_NOT_SUPPORTED_EXIT_CODE = 133
 
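The new constant pairs with GIT_FATAL_EXIT_CODE to tell apart the two expected failure modes of a shelled-out git call, as used in storage_utils.py above. A hedged sketch of that dispatch (the handler messages are illustrative):

# Sketch: classifying a failed `git` subprocess by exit code. 127 is the
# shell's "command not found"; 128 is git's fatal error, e.g. when run
# outside a repository.
import subprocess

GIT_FATAL_EXIT_CODE = 128
COMMAND_NOT_FOUND_EXIT_CODE = 127

try:
    subprocess.run('git status',
                   shell=True,
                   capture_output=True,
                   check=True,
                   text=True)
except subprocess.CalledProcessError as e:
    if e.returncode == COMMAND_NOT_FOUND_EXIT_CODE:
        print('git is not installed; skipping gitignore handling.')
    elif (e.returncode == GIT_FATAL_EXIT_CODE and
          'not a git repository' in e.stderr):
        print('not a git repository; nothing to exclude.')
    else:
        raise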
sky/global_user_state.py CHANGED
@@ -26,11 +26,12 @@ from sky.utils import status_lib
 if typing.TYPE_CHECKING:
     from sky import backends
     from sky import clouds
+    from sky.clouds import cloud
     from sky.data import Storage
 
 logger = sky_logging.init_logger(__name__)
 
-_ENABLED_CLOUDS_KEY = 'enabled_clouds'
+_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 
 _DB_PATH = os.path.expanduser('~/.sky/state.db')
 pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
@@ -795,9 +796,11 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
     return [row[0] for row in rows]
 
 
-def get_cached_enabled_clouds() -> List['clouds.Cloud']:
+def get_cached_enabled_clouds(
+        cloud_capability: 'cloud.CloudCapability') -> List['clouds.Cloud']:
+
     rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
-                              (_ENABLED_CLOUDS_KEY,))
+                              (_get_capability_key(cloud_capability),))
     ret = []
     for (value,) in rows:
         ret = json.loads(value)
@@ -817,12 +820,18 @@ def get_cached_enabled_clouds() -> List['clouds.Cloud']:
     return enabled_clouds
 
 
-def set_enabled_clouds(enabled_clouds: List[str]) -> None:
-    _DB.cursor.execute('INSERT OR REPLACE INTO config VALUES (?, ?)',
-                       (_ENABLED_CLOUDS_KEY, json.dumps(enabled_clouds)))
+def set_enabled_clouds(enabled_clouds: List[str],
+                       cloud_capability: 'cloud.CloudCapability') -> None:
+    _DB.cursor.execute(
+        'INSERT OR REPLACE INTO config VALUES (?, ?)',
+        (_get_capability_key(cloud_capability), json.dumps(enabled_clouds)))
     _DB.conn.commit()
 
 
+def _get_capability_key(cloud_capability: 'cloud.CloudCapability') -> str:
+    return _ENABLED_CLOUDS_KEY_PREFIX + cloud_capability.value
+
+
 def add_or_update_storage(storage_name: str,
                           storage_handle: 'Storage.StorageMetadata',
                           storage_status: status_lib.StorageStatus):
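With the prefix change, each capability gets its own row in the config table, so compute and storage checks are cached independently. A sketch of the resulting keys, assuming CloudCapability is a string-valued enum with COMPUTE and STORAGE members (inferred from the `.value` usage above):

# Sketch of the capability-keyed cache entries. The enum values
# 'compute'/'storage' are assumptions inferred from this diff.
import enum

_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'

class CloudCapability(str, enum.Enum):
    COMPUTE = 'compute'
    STORAGE = 'storage'

def _get_capability_key(cloud_capability: CloudCapability) -> str:
    return _ENABLED_CLOUDS_KEY_PREFIX + cloud_capability.value

assert _get_capability_key(CloudCapability.COMPUTE) == 'enabled_clouds_compute'
assert _get_capability_key(CloudCapability.STORAGE) == 'enabled_clouds_storage'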
sky/jobs/server/core.py CHANGED
@@ -105,7 +105,7 @@ def launch(
 
     local_to_controller_file_mounts = {}
 
-    if storage_lib.get_cached_enabled_storage_clouds_or_refresh():
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
sky/jobs/utils.py CHANGED
@@ -125,6 +125,11 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
         FAILED_SETUP or CANCELLED.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        # This can happen if the cluster was preempted and background status
+        # refresh already noticed and cleaned it up.
+        logger.info(f'Cluster {cluster_name} not found.')
+        return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     status = None
     try:
sky/optimizer.py CHANGED
@@ -16,6 +16,7 @@ from sky import resources as resources_lib
 from sky import sky_logging
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
+from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
@@ -368,7 +369,8 @@ class Optimizer:
                 # mention "kubernetes cluster" and/instead of "catalog"
                 # in the error message.
                 enabled_clouds = (
-                    sky_check.get_cached_enabled_clouds_or_refresh())
+                    sky_check.get_cached_enabled_clouds_or_refresh(
+                        sky_cloud.CloudCapability.COMPUTE))
                 if clouds.cloud_in_iterable(clouds.Kubernetes(),
                                             enabled_clouds):
                     if any(orig_resources.cloud is None
@@ -1206,6 +1208,7 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         dag: The DAG specified by a user.
     """
     enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        capability=sky_cloud.CloudCapability.COMPUTE,
         raise_if_no_cloud_access=True)
 
     global_disabled_clouds: Set[str] = set()
@@ -1225,8 +1228,10 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
             # Explicitly check again to update the enabled cloud list.
             sky_check.check(quiet=True,
                             clouds=list(clouds_need_recheck -
-                                        global_disabled_clouds))
+                                        global_disabled_clouds),
+                            capability=sky_cloud.CloudCapability.COMPUTE)
             enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+                capability=sky_cloud.CloudCapability.COMPUTE,
                 raise_if_no_cloud_access=True)
             disabled_clouds = (clouds_need_recheck -
                                {str(c) for c in enabled_clouds})
@@ -1268,6 +1273,7 @@ def _fill_in_launchable_resources(
             a cloud that is not enabled.
     """
     enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        capability=sky_cloud.CloudCapability.COMPUTE,
         raise_if_no_cloud_access=True)
 
     launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
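The pattern repeated through this file (and in sky/resources.py below) is that every enabled-clouds lookup is now scoped to a capability. A sketch of the calling convention, assuming the import aliases used in this diff:

# Sketch: capability-scoped lookup after this release. A cloud enabled
# only for storage (e.g. configured just for buckets) will not appear in
# the compute result.
from sky import check as sky_check
from sky.clouds import cloud as sky_cloud

compute_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
    capability=sky_cloud.CloudCapability.COMPUTE,
    raise_if_no_cloud_access=True)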
sky/provision/gcp/config.py CHANGED
@@ -297,8 +297,8 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
 def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     """Setup a gcp service account with IAM roles.
 
-    Creates a gcp service acconut and binds IAM roles which allow it to control
-    control storage/compute services. Specifically, the head node needs to have
+    Creates a gcp service account and binds IAM roles which allow it to control
+    storage/compute services. Specifically, the head node needs to have
     an IAM role that allows it to create further gce instances and store items
     in google cloud storage.
 
@@ -311,7 +311,7 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     )
     service_account = _get_service_account(email, project_id, iam)
 
-    permissions = gcp_utils.get_minimal_permissions()
+    permissions = gcp_utils.get_minimal_compute_permissions()
     roles = constants.DEFAULT_SERVICE_ACCOUNT_ROLES
     if config.provider_config.get(constants.HAS_TPU_PROVIDER_FIELD, False):
         roles = (constants.DEFAULT_SERVICE_ACCOUNT_ROLES +
sky/provision/gcp/constants.py CHANGED
@@ -141,6 +141,11 @@ FIREWALL_RULES_TEMPLATE = [
     },
 ]
 
+GCP_MINIMAL_PERMISSIONS = [
+    'serviceusage.services.enable',
+    'serviceusage.services.list',
+]
+
 # A list of permissions required to run SkyPilot on GCP.
 # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
 VM_MINIMAL_PERMISSIONS = [
@@ -170,13 +175,22 @@ VM_MINIMAL_PERMISSIONS = [
     # Check: sky.provision.gcp.config::_is_permission_satisfied
     # 'iam.serviceAccounts.actAs',
     'iam.serviceAccounts.get',
-    'serviceusage.services.enable',
-    'serviceusage.services.list',
     'serviceusage.services.use',
     'resourcemanager.projects.get',
     'resourcemanager.projects.getIamPolicy',
 ]
 
+STORAGE_MINIMAL_PERMISSIONS = [
+    'storage.buckets.create',
+    'storage.buckets.get',
+    'storage.buckets.delete',
+    'storage.objects.create',
+    'storage.objects.update',
+    'storage.objects.delete',
+    'storage.objects.get',
+    'storage.objects.list',
+]
+
 # Permissions implied by GCP built-in roles. We hardcode these here, as we
 # cannot get the permissions of built-in role from the GCP Python API.
 # The lists are not exhaustive, but should cover the permissions listed in
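The refactor separates the base `serviceusage` permissions from the compute- and storage-specific lists, so `gcp_utils` can assemble a minimal permission set per capability. The helper bodies in sky/clouds/utils/gcp_utils.py are not shown in this diff; a plausible sketch of the composition:

# Sketch only: how the split constants could compose per capability. The
# permission lists here are truncated placeholders; see the full lists in
# sky/provision/gcp/constants.py above.
GCP_MINIMAL_PERMISSIONS = [
    'serviceusage.services.enable',
    'serviceusage.services.list',
]
VM_MINIMAL_PERMISSIONS = ['compute.instances.create']  # truncated
STORAGE_MINIMAL_PERMISSIONS = ['storage.buckets.create']  # truncated

def get_minimal_compute_permissions() -> list:
    return GCP_MINIMAL_PERMISSIONS + VM_MINIMAL_PERMISSIONS

def get_minimal_storage_permissions() -> list:
    return GCP_MINIMAL_PERMISSIONS + STORAGE_MINIMAL_PERMISSIONS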
sky/provision/gcp/instance.py CHANGED
@@ -586,8 +586,11 @@ def open_ports(
     }
     handlers: List[Type[instance_utils.GCPInstance]] = [
         instance_utils.GCPComputeInstance,
-        instance_utils.GCPTPUVMInstance,
     ]
+    use_tpu_vms = provider_config.get('_has_tpus', False)
+    if use_tpu_vms:
+        handlers.append(instance_utils.GCPTPUVMInstance)
+
     handler_to_instances = _filter_instances(handlers, project_id, zone,
                                              label_filters, lambda _: None)
     operations = collections.defaultdict(list)
sky/provision/kubernetes/utils.py CHANGED
@@ -663,18 +663,25 @@ class GKEAutoscaler(Autoscaler):
 
         # Check if any node pool with autoscaling enabled can
         # fit the instance type.
-        for node_pool in cluster['nodePools']:
-            logger.debug(f'checking if node pool {node_pool["name"]} '
+        node_pools = cluster.get('nodePools', [])
+        for node_pool in node_pools:
+            name = node_pool.get('name', '')
+            logger.debug(f'checking if node pool {name} '
                          'has autoscaling enabled.')
-            if (node_pool['autoscaling'] is not None and
-                    'enabled' in node_pool['autoscaling'] and
-                    node_pool['autoscaling']['enabled']):
-                logger.debug(
-                    f'node pool {node_pool["name"]} has autoscaling enabled. '
-                    'Checking if it can create a node '
-                    f'satisfying {instance_type}')
-                if cls._check_instance_fits_gke_autoscaler_node_pool(
-                        instance_type, node_pool):
+            autoscaling_enabled = (node_pool.get('autoscaling',
+                                                 {}).get('enabled', False))
+            if autoscaling_enabled:
+                logger.debug(f'node pool {name} has autoscaling enabled. '
+                             'Checking if it can create a node '
+                             f'satisfying {instance_type}')
+                try:
+                    if cls._check_instance_fits_gke_autoscaler_node_pool(
+                            instance_type, node_pool):
+                        return True
+                except KeyError:
+                    logger.debug('encountered KeyError while checking if '
+                                 f'node pool {name} can create a node '
+                                 f'satisfying {instance_type}.')
                     return True
         return False
 
@@ -776,9 +783,9 @@ class GKEAutoscaler(Autoscaler):
            to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
-            node_accelerator_type = GKELabelFormatter. \
-                get_accelerator_from_label_value(
-                    accelerator['acceleratorType'])
+            node_accelerator_type = (
+                GKELabelFormatter.get_accelerator_from_label_value(
+                    accelerator['acceleratorType']))
             node_accelerator_count = accelerator['acceleratorCount']
             if node_accelerator_type == requested_gpu_type and int(
                     node_accelerator_count) >= requested_gpu_count:
@@ -812,24 +819,22 @@ class GKEAutoscaler(Autoscaler):
     @classmethod
     def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
         """Infer the number of TPU chips from the instance type."""
-        machine_type_parts = machine_type.split('-')
         # according to
         # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
         # GKE TPU machine types have the format of
         # ct<version>-<type>-<node-chip-count>t
         logger.debug(
             f'inferring TPU chip count from machine type: {machine_type}')
-        if (len(machine_type_parts) != 3 or
-                not machine_type_parts[0].startswith('ct') or
-                not machine_type_parts[2].endswith('t') or
-                not machine_type_parts[2].strip('t').isdigit()):
+        pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
+        search = re.search(pattern, machine_type)
+        if search is None:
             logger.debug(f'machine type {machine_type} is not a '
                          'valid TPU machine type format.')
             return 0
-        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        num_tpu_chips = search.group(1)
         logger.debug(
             f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
-        return num_tpu_chips
+        return int(num_tpu_chips)
 
     @classmethod
     def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
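The regex rewrite replaces brittle '-'-splitting with a single pattern match. A quick check of the pattern from the diff against sample machine types (the first two follow the documented ct<version>-<type>-<node-chip-count>t format):

# Verifying the TPU machine-type pattern on sample inputs.
import re

pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'

for machine_type, expected_chips in [
    ('ct4p-hightpu-4t', 4),   # TPU v4 machine type
    ('ct5lp-hightpu-8t', 8),  # TPU v5e machine type
    ('n2-standard-8', 0),     # not a TPU machine type
]:
    search = re.search(pattern, machine_type)
    chips = int(search.group(1)) if search is not None else 0
    assert chips == expected_chips, (machine_type, chips)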
sky/resources.py CHANGED
@@ -10,6 +10,7 @@ from sky import clouds
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
+from sky.clouds import cloud as sky_cloud
 from sky.clouds import service_catalog
 from sky.provision import docker_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -677,6 +678,7 @@ class Resources:
         # cloud corresponds to region/zone, errors out.
         valid_clouds = []
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+            sky_cloud.CloudCapability.COMPUTE,
             raise_if_no_cloud_access=True)
         cloud_to_errors = {}
         for cloud in enabled_clouds:
@@ -796,6 +798,7 @@ class Resources:
             # If cloud not specified
             valid_clouds = []
             enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+                sky_cloud.CloudCapability.COMPUTE,
                 raise_if_no_cloud_access=True)
             for cloud in enabled_clouds:
                 if cloud.instance_type_exists(self._instance_type):
@@ -991,6 +994,7 @@ class Resources:
         else:
             at_least_one_cloud_supports_ports = False
             for cloud in sky_check.get_cached_enabled_clouds_or_refresh(
+                    sky_cloud.CloudCapability.COMPUTE,
                     raise_if_no_cloud_access=True):
                 try:
                     cloud.check_features_are_supported(
@@ -1020,7 +1024,8 @@ class Resources:
         else:
             # If no specific cloud is set, validate label against ALL clouds.
             # The label will be dropped if invalid for any one of the cloud
-            validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
+            validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+                sky_cloud.CloudCapability.COMPUTE)
             invalid_table = log_utils.create_table(['Label', 'Reason'])
             for key, value in self._labels.items():
                 for cloud in validated_clouds:
sky/serve/replica_managers.py CHANGED
@@ -1205,7 +1205,16 @@ class SkyPilotReplicaManager(ReplicaManager):
             for key in ['service']:
                 old_config.pop(key)
             # Bump replica version if all fields except for service are
-            # the same. File mounts should both be empty, as update always
+            # the same.
+            # Here, we manually convert the any_of field to a set to avoid
+            # only the difference in the random order of the any_of fields.
+            old_config_any_of = old_config.get('resources',
+                                               {}).pop('any_of', [])
+            new_config_any_of = new_config.get('resources',
+                                               {}).pop('any_of', [])
+            if set(old_config_any_of) != set(new_config_any_of):
+                continue
+            # File mounts should both be empty, as update always
             # create new buckets if they are not empty.
             if (old_config == new_config and
                     old_config.get('file_mounts', None) == {}):
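The intent of the new block is an order-insensitive comparison of the `any_of` resource alternatives. A sketch of the general technique, canonicalizing each entry to a sorted-key JSON string so that dict entries become hashable set members (an illustration of the idea, not the SkyPilot code):

# Sketch: compare `any_of` lists ignoring order.
import json
from typing import Any, List

def any_of_equal(old: List[Any], new: List[Any]) -> bool:
    # json.dumps with sort_keys gives a canonical, hashable form for dict
    # entries, so the set comparison ignores ordering.
    canonicalize = lambda entries: {
        json.dumps(entry, sort_keys=True) for entry in entries
    }
    return canonicalize(old) == canonicalize(new)

assert any_of_equal([{'cloud': 'aws'}, {'cloud': 'gcp'}],
                    [{'cloud': 'gcp'}, {'cloud': 'aws'}])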
sky/setup_files/dependencies.py CHANGED
@@ -150,7 +150,9 @@ extras_require: Dict[str, List[str]] = {
         # docs instead.
         # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
     ],
-    'nebius': ['nebius>=0.2.0',]
+    'nebius': [
+        'nebius>=0.2.0',
+    ] + aws_dependencies
 }
 
 # Nebius needs python3.10. If python 3.9 [all] will not install nebius
sky/task.py CHANGED
@@ -974,8 +974,8 @@ class Task:
         # assert len(self.resources) == 1, self.resources
         storage_cloud = None
 
-        enabled_storage_clouds = (
-            storage_lib.get_cached_enabled_storage_clouds_or_refresh(
+        enabled_storage_cloud_names = (
+            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh(
                 raise_if_no_cloud_access=True))
 
         if self.best_resources is not None:
@@ -987,13 +987,13 @@ class Task:
                 storage_region = resources.region
 
         if storage_cloud is not None:
-            if str(storage_cloud) not in enabled_storage_clouds:
+            if str(storage_cloud) not in enabled_storage_cloud_names:
                 storage_cloud = None
 
         storage_cloud_str = None
         if storage_cloud is None:
-            storage_cloud_str = enabled_storage_clouds[0]
-            assert storage_cloud_str is not None, enabled_storage_clouds[0]
+            storage_cloud_str = enabled_storage_cloud_names[0]
+            assert storage_cloud_str is not None, enabled_storage_cloud_names[0]
             storage_region = None  # Use default region in the Store class
         else:
             storage_cloud_str = str(storage_cloud)
@@ -1103,6 +1103,17 @@ class Task:
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
+            elif store_type is storage_lib.StoreType.NEBIUS:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('nebius://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'nebius://' + storage.name
+                blob_path = storage.get_bucket_sub_path_prefix(blob_path)
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             elif store_type is storage_lib.StoreType.IBM:
                 if isinstance(storage.source,
                               str) and storage.source.startswith('cos://'):
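The new NEBIUS branch mirrors the other object-store branches: reuse a user-supplied `nebius://` URI when one is given, otherwise derive the URI from the bucket name. A standalone sketch of that derivation, with `source` and `name` standing in for the Storage object's fields:

# Sketch of the nebius:// blob-path derivation (the sub-path prefix
# handling from get_bucket_sub_path_prefix is omitted).
from typing import List, Optional, Union

def nebius_blob_path(source: Optional[Union[str, List[str]]],
                     name: str) -> str:
    if isinstance(source, str) and source.startswith('nebius://'):
        return source
    return 'nebius://' + name

assert nebius_blob_path('nebius://my-bucket/data', 'x') == 'nebius://my-bucket/data'
assert nebius_blob_path(['./local-dir'], 'my-bucket') == 'nebius://my-bucket'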
sky/utils/command_runner.py CHANGED
@@ -29,6 +29,8 @@ RSYNC_DISPLAY_OPTION = '-Pavz'
 # Note that "-" is mandatory for rsync and means all patterns in the ignore
 # files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
 # do_not_exclude" doesn't work, even though git allows it.
+# TODO(cooperc): Avoid using this, and prefer utils in storage_utils instead for
+# consistency between bucket upload and rsync.
 RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
 RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
 # The git exclude file to support.
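For reference, the filter constants above expand to rsync per-directory merge filters. A sketch of how a command line using them could be assembled (host and paths are placeholders; the real command construction in command_runner.py is more involved):

# Sketch: the '-' in 'dir-merge,- FILE' makes every pattern in the merged
# file an exclude pattern, applied at every directory level.
RSYNC_DISPLAY_OPTION = '-Pavz'
RSYNC_FILTER_SKYIGNORE = '--filter=\'dir-merge,- .skyignore\''

cmd = (f'rsync {RSYNC_DISPLAY_OPTION} {RSYNC_FILTER_SKYIGNORE} '
       './my-workdir/ user@remote-host:~/sky_workdir/')
print(cmd)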