skypilot-nightly 1.0.0.dev20250319__py3-none-any.whl → 1.0.0.dev20250321__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/cloudflare.py +19 -3
- sky/adaptors/kubernetes.py +2 -1
- sky/adaptors/nebius.py +128 -6
- sky/backends/cloud_vm_ray_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +3 -2
- sky/check.py +89 -55
- sky/cloud_stores.py +66 -0
- sky/clouds/aws.py +14 -2
- sky/clouds/azure.py +13 -1
- sky/clouds/cloud.py +37 -2
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +55 -34
- sky/clouds/ibm.py +15 -1
- sky/clouds/kubernetes.py +3 -1
- sky/clouds/lambda_cloud.py +3 -1
- sky/clouds/nebius.py +7 -3
- sky/clouds/oci.py +15 -1
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +7 -1
- sky/clouds/scp.py +3 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +3 -1
- sky/clouds/utils/gcp_utils.py +11 -1
- sky/clouds/vast.py +3 -2
- sky/clouds/vsphere.py +3 -2
- sky/core.py +6 -2
- sky/data/data_transfer.py +75 -0
- sky/data/data_utils.py +34 -0
- sky/data/mounting_utils.py +18 -0
- sky/data/storage.py +542 -16
- sky/data/storage_utils.py +102 -84
- sky/exceptions.py +2 -0
- sky/global_user_state.py +15 -6
- sky/jobs/server/core.py +1 -1
- sky/jobs/utils.py +5 -0
- sky/optimizer.py +8 -2
- sky/provision/gcp/config.py +3 -3
- sky/provision/gcp/constants.py +16 -2
- sky/provision/gcp/instance.py +4 -1
- sky/provision/kubernetes/utils.py +26 -21
- sky/resources.py +6 -1
- sky/serve/replica_managers.py +10 -1
- sky/setup_files/dependencies.py +3 -1
- sky/task.py +16 -5
- sky/utils/command_runner.py +2 -0
- sky/utils/controller_utils.py +13 -4
- sky/utils/kubernetes/kubernetes_deploy_utils.py +4 -1
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/METADATA +13 -2
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/RECORD +55 -55
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info/licenses}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250319.dist-info → skypilot_nightly-1.0.0.dev20250321.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py
CHANGED
@@ -24,6 +24,9 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
     'to the cloud storage for {path!r}'
     'due to the following error: {error_msg!r}')
 
+_USE_SKYIGNORE_HINT = (
+    'To avoid using .gitignore, you can create a .skyignore file instead.')
+
 _LAST_USE_TRUNC_LENGTH = 25
 
 
@@ -109,10 +112,9 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
 def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     """ Lists files and patterns ignored by git in the source directory
 
-    Runs `git
+    Runs `git ls-files --ignored ...` which returns a list of excluded files and
     patterns read from .gitignore and .git/info/exclude using git.
-
-    after obtaining excluded list.
+    This will also be run for all submodules under the src_dir_path.
 
     Returns:
         List[str] containing files and patterns to be ignored. Some of the
@@ -120,91 +122,107 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     """
     expand_src_dir_path = os.path.expanduser(src_dir_path)
 
-
-
-
+    # We will use `git ls-files` to list files that we should ignore, but
+    # `ls-files` will not recurse into subdirectories. So, we need to manually
+    # list the submodules and run `ls-files` within the root and each submodule.
+    # Print the submodule paths relative to expand_src_dir_path, separated by
+    # null chars.
+    submodules_cmd = (f'git -C {shlex.quote(expand_src_dir_path)} '
+                      'submodule foreach -q "printf \\$displaypath\\\\\\0"')
+
+    try:
+        submodules_output = subprocess.run(submodules_cmd,
+                                           shell=True,
+                                           stdout=subprocess.PIPE,
+                                           stderr=subprocess.PIPE,
+                                           check=True,
+                                           text=True)
+    except subprocess.CalledProcessError as e:
+        gitignore_path = os.path.join(expand_src_dir_path,
+                                      constants.GIT_IGNORE_FILE)
+
+        if (e.returncode == exceptions.GIT_FATAL_EXIT_CODE and
+                'not a git repository' in e.stderr):
+            # If git failed because we aren't in a git repository, but there is
+            # a .gitignore, warn the user that it will be ignored.
+            if os.path.exists(gitignore_path):
+                logger.warning('Detected a .gitignore file, but '
+                               f'{src_dir_path} is not a git repository. The '
+                               '.gitignore file will be ignored. '
+                               f'{_USE_SKYIGNORE_HINT}')
+            # Otherwise, this is fine and we can exit early.
+            return []
+
+        if e.returncode == exceptions.COMMAND_NOT_FOUND_EXIT_CODE:
+            # Git is not installed. This is fine, skip the check.
+            # If .gitignore is present, warn the user.
+            if os.path.exists(gitignore_path):
+                logger.warning(f'Detected a .gitignore file in {src_dir_path}, '
+                               'but git is not installed. The .gitignore file '
+                               f'will be ignored. {_USE_SKYIGNORE_HINT}')
+            return []
+
+        # Pretty much any other error is unexpected, so re-raise.
+        raise
 
-
-
+    # submodules_output will contain each submodule path (relative to
+    # src_dir_path), each ending with a null character.
+    # .split will have an empty string at the end because of the final null
+    # char, so trim it.
+    submodules = submodules_output.stdout.split('\0')[:-1]
+
+    # The empty string is the relative reference to the src_dir_path.
+    all_git_repos = ['.'] + [
+        # We only care about submodules that are a subdirectory of src_dir_path.
+        submodule for submodule in submodules if not submodule.startswith('../')
+    ]
 
-    # This command outputs a list to be excluded according to .gitignore
-    # and .git/info/exclude
-    filter_cmd = (f'git -C {shlex.quote(expand_src_dir_path)} '
-                  'status --ignored --porcelain=v1')
     excluded_list: List[str] = []
+    for repo in all_git_repos:
+        # repo is the path relative to src_dir_path. Get the full path.
+        repo_path = os.path.join(expand_src_dir_path, repo)
+        # This command outputs a list to be excluded according to .gitignore,
+        # .git/info/exclude, and global exclude config.
+        # -z: filenames terminated by \0 instead of \n
+        # --others: show untracked files
+        # --ignore: out of untracked files, only show ignored files
+        # --exclude-standard: use standard exclude rules (required for --ignore)
+        # --directory: if an entire directory is ignored, collapse to a single
+        #   entry rather than listing every single file
+        # Since we are using --others instead of --cached, this will not show
+        # files that are tracked but also present in .gitignore.
+        filter_cmd = (f'git -C {shlex.quote(repo_path)} ls-files -z '
+                      '--others --ignore --exclude-standard --directory')
+        output = subprocess.run(filter_cmd,
+                                shell=True,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                check=True,
+                                text=True)
+        # Don't catch any errors. We would only expect to see errors during the
+        # first git invocation - so if we see any here, crash.
+
+        output_list = output.stdout.split('\0')
+        # trim the empty string at the end
+        output_list = output_list[:-1]
+
+        for item in output_list:
+
+            if repo == '.' and item == './':
+                logger.warning(f'{src_dir_path} is within a git repo, but the '
+                               'entire directory is ignored by git. We will '
+                               'ignore all git exclusions. '
+                               f'{_USE_SKYIGNORE_HINT}')
+                return []
+
+            to_be_excluded = os.path.join(repo, item)
+            if item.endswith('/'):
+                # aws s3 sync and gsutil rsync require * to exclude
+                # files/dirs under the specified directory.
+                to_be_excluded += '*'
+
+            excluded_list.append(to_be_excluded)
 
-    if git_exclude_exists or gitignore_exists:
-        try:
-            output = subprocess.run(filter_cmd,
-                                    shell=True,
-                                    stdout=subprocess.PIPE,
-                                    stderr=subprocess.PIPE,
-                                    check=True,
-                                    text=True)
-        except subprocess.CalledProcessError as e:
-            # when the SRC_DIR_PATH is not a git repo and .git
-            # does not exist in it
-            if e.returncode == exceptions.GIT_FATAL_EXIT_CODE:
-                if 'not a git repository' in e.stderr:
-                    # Check if the user has 'write' permission to
-                    # SRC_DIR_PATH
-                    if not os.access(expand_src_dir_path, os.W_OK):
-                        error_msg = 'Write permission denial'
-                        logger.warning(
-                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
-                                path=src_dir_path, error_msg=error_msg))
-                        return excluded_list
-                    init_cmd = f'git -C {expand_src_dir_path} init'
-                    try:
-                        subprocess.run(init_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-                        output = subprocess.run(filter_cmd,
-                                                shell=True,
-                                                stdout=subprocess.PIPE,
-                                                stderr=subprocess.PIPE,
-                                                check=True,
-                                                text=True)
-                    except subprocess.CalledProcessError as init_e:
-                        logger.warning(
-                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
-                                path=src_dir_path, error_msg=init_e.stderr))
-                        return excluded_list
-                    if git_exclude_exists:
-                        # removes all the files/dirs created with 'git init'
-                        # under .git/ except .git/info/exclude
-                        remove_files_cmd = (f'find {expand_src_dir_path}' \
-                                            f'/.git -path {git_exclude_path}' \
-                                            ' -prune -o -type f -exec rm -f ' \
-                                            '{} +')
-                        remove_dirs_cmd = (f'find {expand_src_dir_path}' \
-                                           f'/.git -path {git_exclude_path}' \
-                                           ' -o -type d -empty -delete')
-                        subprocess.run(remove_files_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-                        subprocess.run(remove_dirs_cmd,
-                                       shell=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       check=True)
-
-        output_list = output.stdout.split('\n')
-        for line in output_list:
-            # FILTER_CMD outputs items preceded by '!!'
-            # to specify excluded files/dirs
-            # e.g., '!! mydir/' or '!! mydir/myfile.txt'
-            if line.startswith('!!'):
-                to_be_excluded = line[3:]
-                if line.endswith('/'):
-                    # aws s3 sync and gsutil rsync require * to exclude
-                    # files/dirs under the specified directory.
-                    to_be_excluded += '*'
-                excluded_list.append(to_be_excluded)
     return excluded_list
 
 
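The heart of this rewrite is the switch from parsing `git status --ignored --porcelain=v1` output to one `git ls-files` invocation per repository (root plus each submodule), with null-terminated output. A minimal, standalone sketch of the per-repo listing step — the helper name `list_ignored` is hypothetical, not part of SkyPilot's API:

import shlex
import subprocess
from typing import List

def list_ignored(repo_path: str) -> List[str]:
    # -z null-terminates entries so filenames with spaces/newlines survive;
    # --others --ignore --exclude-standard lists untracked files matched by
    # the standard exclude rules; --directory collapses fully-ignored dirs
    # into a single entry.
    cmd = (f'git -C {shlex.quote(repo_path)} ls-files -z '
           '--others --ignore --exclude-standard --directory')
    out = subprocess.run(cmd, shell=True, check=True,
                         capture_output=True, text=True)
    # The output is null-terminated, so the final split element is empty.
    return out.stdout.split('\0')[:-1]

Because `ls-files` does not recurse into submodules, the real change first enumerates submodule paths via `git submodule foreach` (null-separated, as above) and then runs this listing once per repository.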
sky/exceptions.py
CHANGED
@@ -24,6 +24,8 @@ MOUNT_PATH_NON_EMPTY_CODE = 42
 INSUFFICIENT_PRIVILEGES_CODE = 52
 # Return code when git command is ran in a dir that is not git repo
 GIT_FATAL_EXIT_CODE = 128
+# Return code from bash when a command is not found
+COMMAND_NOT_FOUND_EXIT_CODE = 127
 # Architecture, such as arm64, not supported by the dependency
 ARCH_NOT_SUPPORTED_EXIT_CODE = 133
 
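For context, these two constants let callers distinguish "git ran and failed" from "git is not installed at all" when shelling out. A hedged sketch of that dispatch (illustrative only; the real handling lives in sky/data/storage_utils.py above):

import subprocess

GIT_FATAL_EXIT_CODE = 128  # git itself failed, e.g. 'not a git repository'
COMMAND_NOT_FOUND_EXIT_CODE = 127  # bash could not find the binary at all

def describe_git_failure(e: subprocess.CalledProcessError) -> str:
    if e.returncode == COMMAND_NOT_FOUND_EXIT_CODE:
        return 'git is not installed'
    if (e.returncode == GIT_FATAL_EXIT_CODE and
            'not a git repository' in e.stderr):
        return 'not inside a git repository'
    raise e  # anything else is unexpected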
sky/global_user_state.py
CHANGED
@@ -26,11 +26,12 @@ from sky.utils import status_lib
 if typing.TYPE_CHECKING:
     from sky import backends
     from sky import clouds
+    from sky.clouds import cloud
     from sky.data import Storage
 
 logger = sky_logging.init_logger(__name__)
 
-
+_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 
 _DB_PATH = os.path.expanduser('~/.sky/state.db')
 pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
@@ -795,9 +796,11 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
     return [row[0] for row in rows]
 
 
-def get_cached_enabled_clouds(
+def get_cached_enabled_clouds(
+        cloud_capability: 'cloud.CloudCapability') -> List['clouds.Cloud']:
+
     rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
-                              (
+                              (_get_capability_key(cloud_capability),))
     ret = []
     for (value,) in rows:
         ret = json.loads(value)
@@ -817,12 +820,18 @@ def get_cached_enabled_clouds() -> List['clouds.Cloud']:
     return enabled_clouds
 
 
-def set_enabled_clouds(enabled_clouds: List[str]
-
-
+def set_enabled_clouds(enabled_clouds: List[str],
+                       cloud_capability: 'cloud.CloudCapability') -> None:
+    _DB.cursor.execute(
+        'INSERT OR REPLACE INTO config VALUES (?, ?)',
+        (_get_capability_key(cloud_capability), json.dumps(enabled_clouds)))
     _DB.conn.commit()
 
 
+def _get_capability_key(cloud_capability: 'cloud.CloudCapability') -> str:
+    return _ENABLED_CLOUDS_KEY_PREFIX + cloud_capability.value
+
+
 def add_or_update_storage(storage_name: str,
                           storage_handle: 'Storage.StorageMetadata',
                           storage_status: status_lib.StorageStatus):
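The net effect: the single cached enabled-clouds row becomes one row per capability, keyed `enabled_clouds_<capability>`. A self-contained sketch of this storage pattern (illustrative sqlite schema, not SkyPilot's actual module):

import json
import sqlite3
from typing import List

_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE config (key TEXT PRIMARY KEY, value TEXT)')

def set_enabled_clouds(capability: str, clouds: List[str]) -> None:
    # One row per capability; JSON-encode the cloud list as the value.
    conn.execute('INSERT OR REPLACE INTO config VALUES (?, ?)',
                 (_ENABLED_CLOUDS_KEY_PREFIX + capability, json.dumps(clouds)))
    conn.commit()

def get_enabled_clouds(capability: str) -> List[str]:
    row = conn.execute('SELECT value FROM config WHERE key = ?',
                       (_ENABLED_CLOUDS_KEY_PREFIX + capability,)).fetchone()
    return json.loads(row[0]) if row else []

set_enabled_clouds('compute', ['aws', 'gcp'])
set_enabled_clouds('storage', ['aws'])
assert get_enabled_clouds('storage') == ['aws']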
sky/jobs/server/core.py
CHANGED
@@ -105,7 +105,7 @@ def launch(
 
     local_to_controller_file_mounts = {}
 
-    if storage_lib.
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
sky/jobs/utils.py
CHANGED
@@ -125,6 +125,11 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
     FAILED_SETUP or CANCELLED.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        # This can happen if the cluster was preempted and background status
+        # refresh already noticed and cleaned it up.
+        logger.info(f'Cluster {cluster_name} not found.')
+        return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     status = None
     try:
sky/optimizer.py
CHANGED
@@ -16,6 +16,7 @@ from sky import resources as resources_lib
 from sky import sky_logging
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
+from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
@@ -368,7 +369,8 @@ class Optimizer:
             # mention "kubernetes cluster" and/instead of "catalog"
             # in the error message.
             enabled_clouds = (
-                sky_check.get_cached_enabled_clouds_or_refresh(
+                sky_check.get_cached_enabled_clouds_or_refresh(
+                    sky_cloud.CloudCapability.COMPUTE))
             if clouds.cloud_in_iterable(clouds.Kubernetes(),
                                         enabled_clouds):
                 if any(orig_resources.cloud is None
@@ -1206,6 +1208,7 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         dag: The DAG specified by a user.
     """
     enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        capability=sky_cloud.CloudCapability.COMPUTE,
         raise_if_no_cloud_access=True)
 
     global_disabled_clouds: Set[str] = set()
@@ -1225,8 +1228,10 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         # Explicitly check again to update the enabled cloud list.
         sky_check.check(quiet=True,
                         clouds=list(clouds_need_recheck -
-                                    global_disabled_clouds)
+                                    global_disabled_clouds),
+                        capability=sky_cloud.CloudCapability.COMPUTE)
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+            capability=sky_cloud.CloudCapability.COMPUTE,
             raise_if_no_cloud_access=True)
         disabled_clouds = (clouds_need_recheck -
                            {str(c) for c in enabled_clouds})
@@ -1268,6 +1273,7 @@ def _fill_in_launchable_resources(
         a cloud that is not enabled.
     """
     enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        capability=sky_cloud.CloudCapability.COMPUTE,
         raise_if_no_cloud_access=True)
 
     launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
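Every enabled-clouds lookup in the optimizer now pins `CloudCapability.COMPUTE`. Judging from its use as a keyword argument here and from the `.value` string used as a cache-key suffix in sky/global_user_state.py, CloudCapability is presumably a string-valued enum along these lines — an assumption, since sky/clouds/cloud.py (+37 -2) is not shown in full in this diff:

import enum

class CloudCapability(enum.Enum):
    COMPUTE = 'compute'  # can launch VMs / clusters
    STORAGE = 'storage'  # can host object-store buckets

# Call sites then request exactly the capability they need, e.g.:
#   sky_check.get_cached_enabled_clouds_or_refresh(
#       capability=sky_cloud.CloudCapability.COMPUTE,
#       raise_if_no_cloud_access=True)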
sky/provision/gcp/config.py
CHANGED
@@ -297,8 +297,8 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
 def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     """Setup a gcp service account with IAM roles.
 
-    Creates a gcp service
-
+    Creates a gcp service account and binds IAM roles which allow it to control
+    storage/compute services. Specifically, the head node needs to have
     an IAM role that allows it to create further gce instances and store items
     in google cloud storage.
 
@@ -311,7 +311,7 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     )
     service_account = _get_service_account(email, project_id, iam)
 
-    permissions = gcp_utils.
+    permissions = gcp_utils.get_minimal_compute_permissions()
     roles = constants.DEFAULT_SERVICE_ACCOUNT_ROLES
     if config.provider_config.get(constants.HAS_TPU_PROVIDER_FIELD, False):
         roles = (constants.DEFAULT_SERVICE_ACCOUNT_ROLES +
sky/provision/gcp/constants.py
CHANGED
@@ -141,6 +141,11 @@ FIREWALL_RULES_TEMPLATE = [
     },
 ]
 
+GCP_MINIMAL_PERMISSIONS = [
+    'serviceusage.services.enable',
+    'serviceusage.services.list',
+]
+
 # A list of permissions required to run SkyPilot on GCP.
 # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
 VM_MINIMAL_PERMISSIONS = [
@@ -170,13 +175,22 @@ VM_MINIMAL_PERMISSIONS = [
     # Check: sky.provision.gcp.config::_is_permission_satisfied
     # 'iam.serviceAccounts.actAs',
     'iam.serviceAccounts.get',
-    'serviceusage.services.enable',
-    'serviceusage.services.list',
     'serviceusage.services.use',
     'resourcemanager.projects.get',
    'resourcemanager.projects.getIamPolicy',
 ]
 
+STORAGE_MINIMAL_PERMISSIONS = [
+    'storage.buckets.create',
+    'storage.buckets.get',
+    'storage.buckets.delete',
+    'storage.objects.create',
+    'storage.objects.update',
+    'storage.objects.delete',
+    'storage.objects.get',
+    'storage.objects.list',
+]
+
 # Permissions implied by GCP built-in roles. We hardcode these here, as we
 # cannot get the permissions of built-in role from the GCP Python API.
 # The lists are not exhaustive, but should cover the permissions listed in
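A permission list like STORAGE_MINIMAL_PERMISSIONS is typically verified with the Cloud Resource Manager `testIamPermissions` call, which returns the subset of requested permissions the caller actually holds. A hedged sketch of that check (the wiring is illustrative, not SkyPilot's exact code):

from googleapiclient import discovery

def missing_permissions(project_id: str, required: list) -> list:
    crm = discovery.build('cloudresourcemanager', 'v1')
    granted = crm.projects().testIamPermissions(
        resource=project_id,
        body={'permissions': required}).execute().get('permissions', [])
    return sorted(set(required) - set(granted))

# An empty result for STORAGE_MINIMAL_PERMISSIONS would mean the active
# credentials satisfy the storage capability for that project.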
sky/provision/gcp/instance.py
CHANGED
@@ -586,8 +586,11 @@ def open_ports(
     }
     handlers: List[Type[instance_utils.GCPInstance]] = [
         instance_utils.GCPComputeInstance,
-        instance_utils.GCPTPUVMInstance,
     ]
+    use_tpu_vms = provider_config.get('_has_tpus', False)
+    if use_tpu_vms:
+        handlers.append(instance_utils.GCPTPUVMInstance)
+
     handler_to_instances = _filter_instances(handlers, project_id, zone,
                                              label_filters, lambda _: None)
     operations = collections.defaultdict(list)
sky/provision/kubernetes/utils.py
CHANGED
@@ -663,18 +663,25 @@ class GKEAutoscaler(Autoscaler):
 
             # Check if any node pool with autoscaling enabled can
             # fit the instance type.
-
-
+            node_pools = cluster.get('nodePools', [])
+            for node_pool in node_pools:
+                name = node_pool.get('name', '')
+                logger.debug(f'checking if node pool {name} '
                              'has autoscaling enabled.')
-
-
-
-                logger.debug(
-
-
-
-
+                autoscaling_enabled = (node_pool.get('autoscaling',
+                                                     {}).get('enabled', False))
+                if autoscaling_enabled:
+                    logger.debug(f'node pool {name} has autoscaling enabled. '
+                                 'Checking if it can create a node '
+                                 f'satisfying {instance_type}')
+                    try:
+                        if cls._check_instance_fits_gke_autoscaler_node_pool(
+                                instance_type, node_pool):
+                            return True
+                    except KeyError:
+                        logger.debug('encountered KeyError while checking if '
+                                     f'node pool {name} can create a node '
+                                     f'satisfying {instance_type}.')
                     return True
         return False
 
@@ -776,9 +783,9 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
-            node_accelerator_type =
-            get_accelerator_from_label_value(
-                accelerator['acceleratorType'])
+            node_accelerator_type = (
+                GKELabelFormatter.get_accelerator_from_label_value(
+                    accelerator['acceleratorType']))
             node_accelerator_count = accelerator['acceleratorCount']
             if node_accelerator_type == requested_gpu_type and int(
                     node_accelerator_count) >= requested_gpu_count:
@@ -812,24 +819,22 @@ class GKEAutoscaler(Autoscaler):
     @classmethod
     def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
         """Infer the number of TPU chips from the instance type."""
-        machine_type_parts = machine_type.split('-')
         # according to
         # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
         # GKE TPU machine types have the format of
         # ct<version>-<type>-<node-chip-count>t
         logger.debug(
             f'inferring TPU chip count from machine type: {machine_type}')
-
-
-
-                not machine_type_parts[2].strip('t').isdigit()):
+        pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
+        search = re.search(pattern, machine_type)
+        if search is None:
             logger.debug(f'machine type {machine_type} is not a '
                          'valid TPU machine type format.')
             return 0
-        num_tpu_chips =
+        num_tpu_chips = search.group(1)
         logger.debug(
             f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
-        return num_tpu_chips
+        return int(num_tpu_chips)
 
     @classmethod
     def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
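The regex rewrite above is easy to sanity-check in isolation; per the linked GKE docs, TPU machine types follow ct<version>-<type>-<chips>t:

import re

def tpu_chip_count(machine_type: str) -> int:
    # Mirrors the new parsing logic in GKEAutoscaler.
    search = re.search(r'ct[a-z0-9]+-[a-z]+-([0-9]+)t', machine_type)
    return int(search.group(1)) if search is not None else 0

assert tpu_chip_count('ct5lp-hightpu-4t') == 4  # a TPU v5e machine type
assert tpu_chip_count('n2-standard-8') == 0     # not a TPU machine type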
sky/resources.py
CHANGED
@@ -10,6 +10,7 @@ from sky import clouds
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
+from sky.clouds import cloud as sky_cloud
 from sky.clouds import service_catalog
 from sky.provision import docker_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -677,6 +678,7 @@ class Resources:
         # cloud corresponds to region/zone, errors out.
         valid_clouds = []
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+            sky_cloud.CloudCapability.COMPUTE,
             raise_if_no_cloud_access=True)
         cloud_to_errors = {}
         for cloud in enabled_clouds:
@@ -796,6 +798,7 @@ class Resources:
         # If cloud not specified
         valid_clouds = []
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+            sky_cloud.CloudCapability.COMPUTE,
             raise_if_no_cloud_access=True)
         for cloud in enabled_clouds:
             if cloud.instance_type_exists(self._instance_type):
@@ -991,6 +994,7 @@ class Resources:
         else:
             at_least_one_cloud_supports_ports = False
             for cloud in sky_check.get_cached_enabled_clouds_or_refresh(
+                    sky_cloud.CloudCapability.COMPUTE,
                     raise_if_no_cloud_access=True):
                 try:
                     cloud.check_features_are_supported(
@@ -1020,7 +1024,8 @@ class Resources:
         else:
             # If no specific cloud is set, validate label against ALL clouds.
             # The label will be dropped if invalid for any one of the cloud
-            validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+            validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+                sky_cloud.CloudCapability.COMPUTE)
             invalid_table = log_utils.create_table(['Label', 'Reason'])
             for key, value in self._labels.items():
                 for cloud in validated_clouds:
sky/serve/replica_managers.py
CHANGED
@@ -1205,7 +1205,16 @@ class SkyPilotReplicaManager(ReplicaManager):
                 for key in ['service']:
                     old_config.pop(key)
                 # Bump replica version if all fields except for service are
-                # the same.
+                # the same.
+                # Here, we manually convert the any_of field to a set to avoid
+                # only the difference in the random order of the any_of fields.
+                old_config_any_of = old_config.get('resources',
+                                                   {}).pop('any_of', [])
+                new_config_any_of = new_config.get('resources',
+                                                   {}).pop('any_of', [])
+                if set(old_config_any_of) != set(new_config_any_of):
+                    continue
+
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
                 if (old_config == new_config and
                         old_config.get('file_mounts', None) == {}):
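The intent of the new any_of handling is an order-insensitive comparison: two service configs whose resources.any_of lists differ only in ordering should still count as equal. A minimal restatement with hashable entries (an assumption for brevity; dict-valued entries would need canonicalization first, e.g. sorted-key JSON):

old_config = {'resources': {'any_of': ['aws', 'gcp'], 'cpus': 4}}
new_config = {'resources': {'any_of': ['gcp', 'aws'], 'cpus': 4}}

# Pop any_of out of both configs and compare it as a set, as the diff does;
# the remaining dicts are then compared field-by-field as before.
old_any_of = old_config.get('resources', {}).pop('any_of', [])
new_any_of = new_config.get('resources', {}).pop('any_of', [])
assert set(old_any_of) == set(new_any_of)
assert old_config == new_config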
sky/setup_files/dependencies.py
CHANGED
@@ -150,7 +150,9 @@ extras_require: Dict[str, List[str]] = {
     # docs instead.
     # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
     ],
-    'nebius': [
+    'nebius': [
+        'nebius>=0.2.0',
+    ] + aws_dependencies
 }
 
 # Nebius needs python3.10. If python 3.9 [all] will not install nebius
CHANGED
@@ -974,8 +974,8 @@ class Task:
|
|
974
974
|
# assert len(self.resources) == 1, self.resources
|
975
975
|
storage_cloud = None
|
976
976
|
|
977
|
-
|
978
|
-
storage_lib.
|
977
|
+
enabled_storage_cloud_names = (
|
978
|
+
storage_lib.get_cached_enabled_storage_cloud_names_or_refresh(
|
979
979
|
raise_if_no_cloud_access=True))
|
980
980
|
|
981
981
|
if self.best_resources is not None:
|
@@ -987,13 +987,13 @@ class Task:
|
|
987
987
|
storage_region = resources.region
|
988
988
|
|
989
989
|
if storage_cloud is not None:
|
990
|
-
if str(storage_cloud) not in
|
990
|
+
if str(storage_cloud) not in enabled_storage_cloud_names:
|
991
991
|
storage_cloud = None
|
992
992
|
|
993
993
|
storage_cloud_str = None
|
994
994
|
if storage_cloud is None:
|
995
|
-
storage_cloud_str =
|
996
|
-
assert storage_cloud_str is not None,
|
995
|
+
storage_cloud_str = enabled_storage_cloud_names[0]
|
996
|
+
assert storage_cloud_str is not None, enabled_storage_cloud_names[0]
|
997
997
|
storage_region = None # Use default region in the Store class
|
998
998
|
else:
|
999
999
|
storage_cloud_str = str(storage_cloud)
|
@@ -1103,6 +1103,17 @@ class Task:
|
|
1103
1103
|
self.update_file_mounts({
|
1104
1104
|
mnt_path: blob_path,
|
1105
1105
|
})
|
1106
|
+
elif store_type is storage_lib.StoreType.NEBIUS:
|
1107
|
+
if storage.source is not None and not isinstance(
|
1108
|
+
storage.source,
|
1109
|
+
list) and storage.source.startswith('nebius://'):
|
1110
|
+
blob_path = storage.source
|
1111
|
+
else:
|
1112
|
+
blob_path = 'nebius://' + storage.name
|
1113
|
+
blob_path = storage.get_bucket_sub_path_prefix(blob_path)
|
1114
|
+
self.update_file_mounts({
|
1115
|
+
mnt_path: blob_path,
|
1116
|
+
})
|
1106
1117
|
elif store_type is storage_lib.StoreType.IBM:
|
1107
1118
|
if isinstance(storage.source,
|
1108
1119
|
str) and storage.source.startswith('cos://'):
|
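The new NEBIUS branch mirrors the existing S3/GCS branches: reuse the source if it is already a nebius:// URI, otherwise derive one from the bucket name. Restated as a standalone helper (omitting the get_bucket_sub_path_prefix step; the function name is illustrative only):

def nebius_blob_path(source, bucket_name: str) -> str:
    if (source is not None and not isinstance(source, list) and
            source.startswith('nebius://')):
        return source  # already a bucket URI; mount it as-is
    return 'nebius://' + bucket_name  # locally-sourced storage: use its bucket

assert nebius_blob_path('nebius://bkt/data', 'x') == 'nebius://bkt/data'
assert nebius_blob_path('~/local/dir', 'bkt') == 'nebius://bkt'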
sky/utils/command_runner.py
CHANGED
@@ -29,6 +29,8 @@ RSYNC_DISPLAY_OPTION = '-Pavz'
 # Note that "-" is mandatory for rsync and means all patterns in the ignore
 # files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
 # do_not_exclude" doesn't work, even though git allows it.
+# TODO(cooperc): Avoid using this, and prefer utils in storage_utils instead for
+# consistency between bucket upload and rsync.
 RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
 RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
 # The git exclude file to support.
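For reference, `--filter='dir-merge,- .skyignore'` makes rsync read a per-directory .skyignore file and treat every pattern in it as an exclude, which is why only exclude patterns work here. A small sketch of how such a flag composes into an upload command (illustrative; the real CommandRunner adds ssh options and more):

import shlex
import subprocess

SKY_IGNORE_FILE = '.skyignore'
RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {SKY_IGNORE_FILE}\''

def rsync_up(src: str, dst: str) -> None:
    # -Pavz: progress/partial, archive, verbose, compress -- as in the diff.
    cmd = (f'rsync -Pavz {RSYNC_FILTER_SKYIGNORE} '
           f'{shlex.quote(src)} {shlex.quote(dst)}')
    subprocess.run(cmd, shell=True, check=True)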
|