skypilot-nightly 1.0.0.dev20250407__py3-none-any.whl → 1.0.0.dev20250410__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +1 -1
- sky/adaptors/nebius.py +5 -27
- sky/backends/backend.py +9 -7
- sky/backends/cloud_vm_ray_backend.py +8 -11
- sky/backends/local_docker_backend.py +3 -3
- sky/cloud_stores.py +0 -4
- sky/clouds/do.py +4 -5
- sky/clouds/gcp.py +5 -3
- sky/clouds/nebius.py +22 -12
- sky/clouds/service_catalog/data_fetchers/fetch_ibm.py +1 -2
- sky/clouds/service_catalog/gcp_catalog.py +37 -10
- sky/core.py +6 -6
- sky/data/data_utils.py +5 -9
- sky/data/mounting_utils.py +1 -1
- sky/data/storage.py +25 -31
- sky/data/storage_utils.py +36 -20
- sky/execution.py +11 -4
- sky/jobs/server/server.py +5 -1
- sky/provision/do/utils.py +19 -16
- sky/provision/gcp/config.py +30 -20
- sky/server/requests/executor.py +204 -126
- sky/server/requests/process.py +212 -0
- sky/server/requests/queues/local_queue.py +16 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/log_lib.py +4 -0
- sky/task.py +27 -7
- sky/utils/atomic.py +52 -0
- sky/utils/common_utils.py +2 -2
- sky/utils/schemas.py +25 -7
- sky/utils/validator.py +1 -8
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/RECORD +37 -34
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '022a5c3ffe258f365764b03cb20fac70934f5a60'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250407'
|
38
|
+
__version__ = '1.0.0.dev20250410'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/azure.py
CHANGED
@@ -232,7 +232,7 @@ def get_client(name: str,
|
|
232
232
|
'Must provide resource_group_name keyword '
|
233
233
|
'arguments for container client.')
|
234
234
|
sky_logger.info(
|
235
|
-
'Failed to check the
|
235
|
+
'Failed to check the existence of the '
|
236
236
|
f'container {container_url!r} due to '
|
237
237
|
'insufficient IAM role for storage '
|
238
238
|
f'account {storage_account_name!r}.')
|
sky/adaptors/nebius.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
"""Nebius cloud adaptor."""
|
2
2
|
import os
|
3
3
|
import threading
|
4
|
-
from typing import Optional
|
5
4
|
|
6
5
|
from sky.adaptors import common
|
7
6
|
from sky.utils import annotations
|
@@ -168,7 +167,7 @@ def session():
|
|
168
167
|
|
169
168
|
|
170
169
|
@annotations.lru_cache(scope='global')
|
171
|
-
def resource(resource_name: str, region: str = DEFAULT_REGION, **kwargs):
|
170
|
+
def resource(resource_name: str, **kwargs):
|
172
171
|
"""Create a Nebius resource.
|
173
172
|
|
174
173
|
Args:
|
@@ -181,21 +180,13 @@ def resource(resource_name: str, region: str = DEFAULT_REGION, **kwargs):
|
|
181
180
|
# Reference: https://stackoverflow.com/a/59635814
|
182
181
|
|
183
182
|
session_ = session()
|
184
|
-
nebius_credentials = get_nebius_credentials(session_)
|
185
|
-
endpoint = create_endpoint(region)
|
186
183
|
|
187
|
-
return session_.resource(
|
188
|
-
resource_name,
|
189
|
-
endpoint_url=endpoint,
|
190
|
-
aws_access_key_id=nebius_credentials.access_key,
|
191
|
-
aws_secret_access_key=nebius_credentials.secret_key,
|
192
|
-
region_name=region,
|
193
|
-
**kwargs)
|
184
|
+
return session_.resource(resource_name, **kwargs)
|
194
185
|
|
195
186
|
|
196
187
|
@annotations.lru_cache(scope='global')
|
197
|
-
def client(service_name: str, region):
|
198
|
-
"""Create
|
188
|
+
def client(service_name: str):
|
189
|
+
"""Create Nebius client of a certain service.
|
199
190
|
|
200
191
|
Args:
|
201
192
|
service_name: Nebius service name (e.g., 's3').
|
@@ -207,14 +198,8 @@ def client(service_name: str, region):
|
|
207
198
|
# Reference: https://stackoverflow.com/a/59635814
|
208
199
|
|
209
200
|
session_ = session()
|
210
|
-
nebius_credentials = get_nebius_credentials(session_)
|
211
|
-
endpoint = create_endpoint(region)
|
212
201
|
|
213
|
-
return session_.client(service_name,
|
214
|
-
endpoint_url=endpoint,
|
215
|
-
aws_access_key_id=nebius_credentials.access_key,
|
216
|
-
aws_secret_access_key=nebius_credentials.secret_key,
|
217
|
-
region_name=region)
|
202
|
+
return session_.client(service_name)
|
218
203
|
|
219
204
|
|
220
205
|
@common.load_lazy_modules(_LAZY_MODULES)
|
@@ -223,10 +208,3 @@ def botocore_exceptions():
|
|
223
208
|
# pylint: disable=import-outside-toplevel
|
224
209
|
from botocore import exceptions
|
225
210
|
return exceptions
|
226
|
-
|
227
|
-
|
228
|
-
def create_endpoint(region: Optional[str] = DEFAULT_REGION) -> str:
|
229
|
-
"""Reads accountid necessary to interact with Nebius Object Storage"""
|
230
|
-
if region is None:
|
231
|
-
region = DEFAULT_REGION
|
232
|
-
return f'https://storage.{region}.nebius.cloud:443'
|
sky/backends/backend.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Sky backend interface."""
|
2
2
|
import typing
|
3
|
-
from typing import Dict, Generic, Optional
|
3
|
+
from typing import Dict, Generic, Optional, Tuple
|
4
4
|
|
5
5
|
from sky.usage import usage_lib
|
6
6
|
from sky.utils import cluster_utils
|
@@ -53,7 +53,7 @@ class Backend(Generic[_ResourceHandleType]):
|
|
53
53
|
cluster_name: Optional[str] = None,
|
54
54
|
retry_until_up: bool = False,
|
55
55
|
skip_unnecessary_provisioning: bool = False,
|
56
|
-
) -> Optional[_ResourceHandleType]:
|
56
|
+
) -> Tuple[Optional[_ResourceHandleType], bool]:
|
57
57
|
"""Provisions resources for the given task.
|
58
58
|
|
59
59
|
Args:
|
@@ -68,13 +68,15 @@ class Backend(Generic[_ResourceHandleType]):
|
|
68
68
|
the existing cluster will be reused and re-provisioned.
|
69
69
|
retry_until_up: If True, retry provisioning until resources are
|
70
70
|
successfully launched.
|
71
|
-
|
72
|
-
the existing cluster_name's config. Skip provisioning if no
|
71
|
+
skip_unnecessary_provisioning: If True, compare the cluster config
|
72
|
+
to the existing cluster_name's config. Skip provisioning if no
|
73
73
|
updates are needed for the existing cluster.
|
74
74
|
|
75
75
|
Returns:
|
76
|
-
A ResourceHandle object for the provisioned resources, or None if
|
77
|
-
|
76
|
+
- A ResourceHandle object for the provisioned resources, or None if
|
77
|
+
dryrun is True.
|
78
|
+
- A boolean that is True if the provisioning was skipped, and False
|
79
|
+
if provisioning actually happened. Dryrun always gives False.
|
78
80
|
"""
|
79
81
|
if cluster_name is None:
|
80
82
|
cluster_name = cluster_utils.generate_cluster_name()
|
@@ -159,7 +161,7 @@ class Backend(Generic[_ResourceHandleType]):
|
|
159
161
|
cluster_name: str,
|
160
162
|
retry_until_up: bool = False,
|
161
163
|
skip_unnecessary_provisioning: bool = False,
|
162
|
-
) -> Optional[_ResourceHandleType]:
|
164
|
+
) -> Tuple[Optional[_ResourceHandleType], bool]:
|
163
165
|
raise NotImplementedError
|
164
166
|
|
165
167
|
def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
|
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -655,12 +655,9 @@ class RayCodeGen:
|
|
655
655
|
rclone_flush_script = {rclone_flush_script!r}
|
656
656
|
if run_fn is not None:
|
657
657
|
script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
|
658
|
-
if script is not None:
|
659
|
-
script += rclone_flush_script
|
660
|
-
else:
|
661
|
-
script = rclone_flush_script
|
662
658
|
|
663
659
|
if script is not None:
|
660
|
+
script += rclone_flush_script
|
664
661
|
sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
|
665
662
|
# Backward compatibility: Environment starting with `SKY_` is
|
666
663
|
# deprecated. Remove it in v0.9.0.
|
@@ -2832,7 +2829,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2832
2829
|
cluster_name: str,
|
2833
2830
|
retry_until_up: bool = False,
|
2834
2831
|
skip_unnecessary_provisioning: bool = False,
|
2835
|
-
) -> Optional[CloudVmRayResourceHandle]:
|
2832
|
+
) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
|
2836
2833
|
"""Provisions the cluster, or re-provisions an existing cluster.
|
2837
2834
|
|
2838
2835
|
Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
|
@@ -2972,7 +2969,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2972
2969
|
failover_history=e.failover_history) from None
|
2973
2970
|
if dryrun:
|
2974
2971
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
2975
|
-
return record['handle'] if record is not None else None
|
2972
|
+
return record['handle'] if record is not None else None, False
|
2976
2973
|
|
2977
2974
|
if config_dict['provisioning_skipped']:
|
2978
2975
|
# Skip further provisioning.
|
@@ -2983,7 +2980,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2983
2980
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
2984
2981
|
assert record is not None and record['handle'] is not None, (
|
2985
2982
|
cluster_name, record)
|
2986
|
-
return record['handle']
|
2983
|
+
return record['handle'], True
|
2987
2984
|
|
2988
2985
|
if 'provision_record' in config_dict:
|
2989
2986
|
# New provisioner is used here.
|
@@ -3025,7 +3022,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3025
3022
|
self._update_after_cluster_provisioned(
|
3026
3023
|
handle, to_provision_config.prev_handle, task,
|
3027
3024
|
prev_cluster_status, lock_path, config_hash)
|
3028
|
-
return handle
|
3025
|
+
return handle, False
|
3029
3026
|
|
3030
3027
|
cluster_config_file = config_dict['ray']
|
3031
3028
|
handle = config_dict['handle']
|
@@ -3097,7 +3094,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3097
3094
|
self._update_after_cluster_provisioned(
|
3098
3095
|
handle, to_provision_config.prev_handle, task,
|
3099
3096
|
prev_cluster_status, lock_path, config_hash)
|
3100
|
-
return handle
|
3097
|
+
return handle, False
|
3101
3098
|
|
3102
3099
|
def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
|
3103
3100
|
cloud = handle.launched_resources.cloud
|
@@ -3438,7 +3435,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3438
3435
|
mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
|
3439
3436
|
f'touch {remote_log_path}')
|
3440
3437
|
encoded_script = shlex.quote(codegen)
|
3441
|
-
create_script_code =
|
3438
|
+
create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
|
3442
3439
|
job_submit_cmd = (
|
3443
3440
|
# JOB_CMD_IDENTIFIER is used for identifying the process retrieved
|
3444
3441
|
# with pid is the same driver process.
|
@@ -4334,7 +4331,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4334
4331
|
cluster_name_on_cloud = handle.cluster_name_on_cloud
|
4335
4332
|
cloud = handle.launched_resources.cloud
|
4336
4333
|
|
4337
|
-
if
|
4334
|
+
if terminate and handle.launched_resources.is_image_managed is True:
|
4338
4335
|
# Delete the image when terminating a "cloned" cluster, i.e.,
|
4339
4336
|
# whose image is created by SkyPilot (--clone-disk-from)
|
4340
4337
|
logger.debug(f'Deleting image {handle.launched_resources.image_id}')
|
sky/backends/local_docker_backend.py
CHANGED
@@ -139,7 +139,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
139
139
|
cluster_name: str,
|
140
140
|
retry_until_up: bool = False,
|
141
141
|
skip_unnecessary_provisioning: bool = False,
|
142
|
-
) -> Optional[LocalDockerResourceHandle]:
|
142
|
+
) -> Tuple[Optional[LocalDockerResourceHandle], bool]:
|
143
143
|
"""Builds docker image for the task and returns cluster name as handle.
|
144
144
|
|
145
145
|
Since resource demands are ignored, There's no provisioning in local
|
@@ -149,7 +149,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
149
149
|
assert task.name is not None, ('Task name cannot be None - have you '
|
150
150
|
'specified a task name?')
|
151
151
|
if dryrun:
|
152
|
-
return None
|
152
|
+
return None, False
|
153
153
|
if retry_until_up:
|
154
154
|
logger.warning(
|
155
155
|
f'Retrying until up is not supported in backend: {self.NAME}. '
|
@@ -175,7 +175,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
175
175
|
requested_resources=set(
|
176
176
|
task.resources),
|
177
177
|
ready=False)
|
178
|
-
return handle
|
178
|
+
return handle, False
|
179
179
|
|
180
180
|
def _sync_workdir(self, handle: LocalDockerResourceHandle,
|
181
181
|
workdir: Path) -> None:
|
sky/cloud_stores.py
CHANGED
@@ -578,13 +578,11 @@ class NebiusCloudStorage(CloudStorage):
|
|
578
578
|
# AWS Sync by default uses 10 threads to upload files to the bucket.
|
579
579
|
# To increase parallelism, modify max_concurrent_requests in your
|
580
580
|
# aws config file (Default path: ~/.aws/config).
|
581
|
-
endpoint_url = nebius.create_endpoint()
|
582
581
|
assert 'nebius://' in source, 'nebius:// is not in source'
|
583
582
|
source = source.replace('nebius://', 's3://')
|
584
583
|
download_via_awscli = (f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
|
585
584
|
'sync --no-follow-symlinks '
|
586
585
|
f'{source} {destination} '
|
587
|
-
f'--endpoint {endpoint_url} '
|
588
586
|
f'--profile={nebius.NEBIUS_PROFILE_NAME}')
|
589
587
|
|
590
588
|
all_commands = list(self._GET_AWSCLI)
|
@@ -593,12 +591,10 @@ class NebiusCloudStorage(CloudStorage):
|
|
593
591
|
|
594
592
|
def make_sync_file_command(self, source: str, destination: str) -> str:
|
595
593
|
"""Downloads a file using AWS CLI."""
|
596
|
-
endpoint_url = nebius.create_endpoint()
|
597
594
|
assert 'nebius://' in source, 'nebius:// is not in source'
|
598
595
|
source = source.replace('nebius://', 's3://')
|
599
596
|
download_via_awscli = (f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
|
600
597
|
f'cp {source} {destination} '
|
601
|
-
f'--endpoint {endpoint_url} '
|
602
598
|
f'--profile={nebius.NEBIUS_PROFILE_NAME}')
|
603
599
|
|
604
600
|
all_commands = list(self._GET_AWSCLI)
|
sky/clouds/do.py
CHANGED
@@ -280,13 +280,12 @@ class DO(clouds.Cloud):
|
|
280
280
|
return True, None
|
281
281
|
|
282
282
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
283
|
-
|
283
|
+
credential_path = do_utils.get_credentials_path()
|
284
|
+
if credential_path is None:
|
284
285
|
return {}
|
285
|
-
if not os.path.exists(os.path.expanduser(
|
286
|
+
if not os.path.exists(os.path.expanduser(credential_path)):
|
286
287
|
return {}
|
287
|
-
return {
|
288
|
-
f'~/.config/doctl/{_CREDENTIAL_FILE}': do_utils.CREDENTIALS_PATH
|
289
|
-
}
|
288
|
+
return {f'~/.config/doctl/{_CREDENTIAL_FILE}': credential_path}
|
290
289
|
|
291
290
|
@classmethod
|
292
291
|
def get_current_user_identity(cls) -> Optional[List[str]]:
|
sky/clouds/gcp.py
CHANGED
@@ -688,9 +688,11 @@ class GCP(clouds.Cloud):
|
|
688
688
|
cls,
|
689
689
|
instance_type: str,
|
690
690
|
) -> Optional[Dict[str, Union[int, float]]]:
|
691
|
-
# GCP handles accelerators separately from regular instance types
|
692
|
-
#
|
693
|
-
|
691
|
+
# GCP handles accelerators separately from regular instance types.
|
692
|
+
# This method supports automatically inferring the GPU type for
|
693
|
+
# the instance type that come with GPUs pre-attached.
|
694
|
+
return service_catalog.get_accelerators_from_instance_type(
|
695
|
+
instance_type, clouds='gcp')
|
694
696
|
|
695
697
|
@classmethod
|
696
698
|
def get_vcpus_mem_from_instance_type(
|
sky/clouds/nebius.py
CHANGED
@@ -24,18 +24,28 @@ _CREDENTIAL_FILES = [
|
|
24
24
|
_INDENT_PREFIX = ' '
|
25
25
|
|
26
26
|
|
27
|
-
def
|
28
|
-
"""Checks if Nebius Object Storage profile is set in aws credentials
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
def nebius_profile_in_aws_cred_and_config() -> bool:
|
28
|
+
"""Checks if Nebius Object Storage profile is set in aws credentials
|
29
|
+
and profile."""
|
30
|
+
|
31
|
+
credentials_path = os.path.expanduser('~/.aws/credentials')
|
32
|
+
nebius_profile_exists_in_credentials = False
|
33
|
+
if os.path.isfile(credentials_path):
|
34
|
+
with open(credentials_path, 'r', encoding='utf-8') as file:
|
34
35
|
for line in file:
|
35
36
|
if f'[{nebius.NEBIUS_PROFILE_NAME}]' in line:
|
36
|
-
|
37
|
+
nebius_profile_exists_in_credentials = True
|
38
|
+
|
39
|
+
config_path = os.path.expanduser('~/.aws/config')
|
40
|
+
nebius_profile_exists_in_config = False
|
41
|
+
if os.path.isfile(config_path):
|
42
|
+
with open(config_path, 'r', encoding='utf-8') as file:
|
43
|
+
for line in file:
|
44
|
+
if f'[profile {nebius.NEBIUS_PROFILE_NAME}]' in line:
|
45
|
+
nebius_profile_exists_in_config = True
|
37
46
|
|
38
|
-
return
|
47
|
+
return (nebius_profile_exists_in_credentials and
|
48
|
+
nebius_profile_exists_in_config)
|
39
49
|
|
40
50
|
|
41
51
|
@registry.CLOUD_REGISTRY.register
|
@@ -308,12 +318,12 @@ class Nebius(clouds.Cloud):
|
|
308
318
|
with a string on unset credential.
|
309
319
|
"""
|
310
320
|
hints = None
|
311
|
-
if not
|
321
|
+
if not nebius_profile_in_aws_cred_and_config():
|
312
322
|
hints = (f'[{nebius.NEBIUS_PROFILE_NAME}] profile '
|
313
323
|
'is not set in ~/.aws/credentials.')
|
314
324
|
if hints:
|
315
325
|
hints += ' Run the following commands:'
|
316
|
-
if not
|
326
|
+
if not nebius_profile_in_aws_cred_and_config():
|
317
327
|
hints += (
|
318
328
|
f'\n{_INDENT_PREFIX} $ pip install boto3'
|
319
329
|
f'\n{_INDENT_PREFIX} $ aws configure --profile nebius')
|
@@ -329,7 +339,7 @@ class Nebius(clouds.Cloud):
|
|
329
339
|
for filename in _CREDENTIAL_FILES
|
330
340
|
}
|
331
341
|
credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
|
332
|
-
|
342
|
+
credential_file_mounts['~/.aws/config'] = '~/.aws/config'
|
333
343
|
return credential_file_mounts
|
334
344
|
|
335
345
|
@classmethod
|
sky/clouds/service_catalog/data_fetchers/fetch_ibm.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
"""A script that generates the
|
1
|
+
"""A script that generates the IBM Cloud catalog.
|
2
2
|
|
3
3
|
Usage:
|
4
4
|
python fetch_ibm.py [-h] [--api-key API_KEY]
|
@@ -19,7 +19,6 @@ import yaml
|
|
19
19
|
|
20
20
|
TOKEN_ENDPOINT = 'https://iam.cloud.ibm.com/identity/token'
|
21
21
|
REGIONS_ENDPOINT = f'https://us-south.iaas.cloud.ibm.com/v1/regions?version={datetime.today().strftime("%Y-%m-%d")}&generation=2' # pylint: disable=line-too-long
|
22
|
-
ENDPOINT = 'https://cloud.lambdalabs.com/api/v1/instance-types'
|
23
22
|
DEFAULT_IBM_CREDENTIALS_PATH = os.path.expanduser('~/.ibm/credentials.yaml')
|
24
23
|
|
25
24
|
|
sky/clouds/service_catalog/gcp_catalog.py
CHANGED
@@ -106,6 +106,16 @@ _ACC_INSTANCE_TYPE_DICTS = {
|
|
106
106
|
8: ['a3-megagpu-8g'],
|
107
107
|
}
|
108
108
|
}
|
109
|
+
# Enable GPU type inference from instance types
|
110
|
+
_INSTANCE_TYPE_TO_ACC = {
|
111
|
+
instance_type: {
|
112
|
+
acc_name: acc_count
|
113
|
+
} for acc_name, acc_count_to_instance_type in
|
114
|
+
_ACC_INSTANCE_TYPE_DICTS.items()
|
115
|
+
for acc_count, instance_types in acc_count_to_instance_type.items()
|
116
|
+
for instance_type in instance_types
|
117
|
+
}
|
118
|
+
GCP_ACC_INSTANCE_TYPES = list(_INSTANCE_TYPE_TO_ACC.keys())
|
109
119
|
|
110
120
|
# Number of CPU cores per GPU based on the AWS setting.
|
111
121
|
# GCP A100 has its own instance type mapping.
|
@@ -270,6 +280,26 @@ def get_default_instance_type(
|
|
270
280
|
memory_gb_or_ratio)
|
271
281
|
|
272
282
|
|
283
|
+
def get_accelerators_from_instance_type(
|
284
|
+
instance_type: str) -> Optional[Dict[str, int]]:
|
285
|
+
"""Infer the GPU type from the instance type.
|
286
|
+
|
287
|
+
This inference logic is GCP-specific. Unlike other clouds, we don't call
|
288
|
+
the internal implementation defined in common.py.
|
289
|
+
|
290
|
+
Args:
|
291
|
+
instance_type: the instance type to use.
|
292
|
+
|
293
|
+
Returns:
|
294
|
+
A dictionary mapping from the accelerator name to the accelerator count.
|
295
|
+
"""
|
296
|
+
if instance_type in GCP_ACC_INSTANCE_TYPES:
|
297
|
+
return _INSTANCE_TYPE_TO_ACC[instance_type]
|
298
|
+
else:
|
299
|
+
# General CPU instance types don't come with pre-attached accelerators.
|
300
|
+
return None
|
301
|
+
|
302
|
+
|
273
303
|
def get_instance_type_for_accelerator(
|
274
304
|
acc_name: str,
|
275
305
|
acc_count: int,
|
@@ -528,16 +558,13 @@ def check_accelerator_attachable_to_host(instance_type: str,
|
|
528
558
|
attached to the host.
|
529
559
|
"""
|
530
560
|
if accelerators is None:
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
f'{acc_name} GPUs. Either use other instance types or '
|
539
|
-
f'specify the accelerators as {acc_name}.')
|
540
|
-
return
|
561
|
+
if instance_type in GCP_ACC_INSTANCE_TYPES:
|
562
|
+
# Infer the GPU type from the instance type
|
563
|
+
accelerators = _INSTANCE_TYPE_TO_ACC[instance_type]
|
564
|
+
else:
|
565
|
+
# Skip the following checks if instance_type is a general CPU
|
566
|
+
# instance without accelerators
|
567
|
+
return
|
541
568
|
|
542
569
|
acc = list(accelerators.items())
|
543
570
|
assert len(acc) == 1, acc
|
sky/core.py
CHANGED
@@ -372,12 +372,12 @@ def _start(
|
|
372
372
|
with dag_lib.Dag():
|
373
373
|
dummy_task = task_lib.Task().set_resources(handle.launched_resources)
|
374
374
|
dummy_task.num_nodes = handle.launched_nodes
|
375
|
-
handle = backend.provision(dummy_task,
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
375
|
+
(handle, _) = backend.provision(dummy_task,
|
376
|
+
to_provision=handle.launched_resources,
|
377
|
+
dryrun=False,
|
378
|
+
stream_logs=True,
|
379
|
+
cluster_name=cluster_name,
|
380
|
+
retry_until_up=retry_until_up)
|
381
381
|
storage_mounts = backend.get_storage_mounts_metadata(handle.cluster_name)
|
382
382
|
# Passing all_file_mounts as None ensures the local source set in Storage
|
383
383
|
# to not redundantly sync source to the bucket.
|
sky/data/data_utils.py
CHANGED
@@ -322,14 +322,9 @@ def create_r2_client(region: str = 'auto') -> Client:
|
|
322
322
|
return cloudflare.client('s3', region)
|
323
323
|
|
324
324
|
|
325
|
-
def create_nebius_client(
|
326
|
-
"""Helper method that connects to Boto3 client for Nebius Object Storage
|
327
|
-
|
328
|
-
Args:
|
329
|
-
region: str; Region for Nebius Object Storage
|
330
|
-
"""
|
331
|
-
region = region if region is not None else nebius.DEFAULT_REGION
|
332
|
-
return nebius.client('s3', region)
|
325
|
+
def create_nebius_client() -> Client:
|
326
|
+
"""Helper method that connects to Boto3 client for Nebius Object Storage"""
|
327
|
+
return nebius.client('s3')
|
333
328
|
|
334
329
|
|
335
330
|
def verify_r2_bucket(name: str) -> bool:
|
@@ -566,7 +561,8 @@ def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
|
|
566
561
|
require_outputs=True,
|
567
562
|
# We need to use bash as some of the cloud commands uses bash syntax,
|
568
563
|
# such as [[ ... ]]
|
569
|
-
executable='/bin/bash'
|
564
|
+
executable='/bin/bash',
|
565
|
+
log_cmd=True)
|
570
566
|
if access_denied_message in stderr:
|
571
567
|
with ux_utils.print_exception_no_traceback():
|
572
568
|
raise PermissionError('Failed to upload files to '
|
sky/data/mounting_utils.py
CHANGED
@@ -64,8 +64,8 @@ def get_s3_mount_cmd(bucket_name: str,
|
|
64
64
|
|
65
65
|
|
66
66
|
def get_nebius_mount_cmd(nebius_profile_name: str,
|
67
|
-
endpoint_url: str,
|
68
67
|
bucket_name: str,
|
68
|
+
endpoint_url: str,
|
69
69
|
mount_path: str,
|
70
70
|
_bucket_sub_path: Optional[str] = None) -> str:
|
71
71
|
"""Returns a command to install Nebius mount utility goofys."""
|