skypilot-nightly 1.0.0.dev20250408__py3-none-any.whl → 1.0.0.dev20250411__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +1 -1
- sky/adaptors/nebius.py +5 -27
- sky/backends/backend.py +9 -7
- sky/backends/cloud_vm_ray_backend.py +7 -7
- sky/backends/local_docker_backend.py +3 -3
- sky/client/common.py +4 -2
- sky/client/sdk.py +58 -26
- sky/cloud_stores.py +0 -4
- sky/clouds/do.py +4 -5
- sky/clouds/gcp.py +5 -3
- sky/clouds/nebius.py +22 -12
- sky/clouds/service_catalog/data_fetchers/fetch_ibm.py +1 -2
- sky/clouds/service_catalog/gcp_catalog.py +37 -10
- sky/core.py +6 -6
- sky/data/data_utils.py +5 -9
- sky/data/mounting_utils.py +1 -1
- sky/data/storage.py +25 -31
- sky/data/storage_utils.py +27 -18
- sky/execution.py +11 -4
- sky/jobs/client/sdk.py +5 -0
- sky/jobs/server/server.py +5 -1
- sky/optimizer.py +1 -2
- sky/provision/do/utils.py +19 -16
- sky/provision/gcp/config.py +30 -20
- sky/serve/client/sdk.py +6 -0
- sky/server/common.py +16 -1
- sky/server/constants.py +5 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/log_lib.py +4 -0
- sky/skypilot_config.py +19 -30
- sky/task.py +27 -7
- sky/utils/schemas.py +25 -7
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/RECORD +39 -39
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/top_level.txt +0 -0
sky/clouds/nebius.py
CHANGED
@@ -24,18 +24,28 @@ _CREDENTIAL_FILES = [
 _INDENT_PREFIX = '    '
 
 
-def …
-    """Checks if Nebius Object Storage profile is set in aws credentials
-    …
+def nebius_profile_in_aws_cred_and_config() -> bool:
+    """Checks if Nebius Object Storage profile is set in aws credentials
+    and profile."""
+    credentials_path = os.path.expanduser('~/.aws/credentials')
+    nebius_profile_exists_in_credentials = False
+    if os.path.isfile(credentials_path):
+        with open(credentials_path, 'r', encoding='utf-8') as file:
             for line in file:
                 if f'[{nebius.NEBIUS_PROFILE_NAME}]' in line:
-                    …
+                    nebius_profile_exists_in_credentials = True
+
+    config_path = os.path.expanduser('~/.aws/config')
+    nebius_profile_exists_in_config = False
+    if os.path.isfile(config_path):
+        with open(config_path, 'r', encoding='utf-8') as file:
+            for line in file:
+                if f'[profile {nebius.NEBIUS_PROFILE_NAME}]' in line:
+                    nebius_profile_exists_in_config = True
 
-    return …
+    return (nebius_profile_exists_in_credentials and
+            nebius_profile_exists_in_config)
 
 
 @registry.CLOUD_REGISTRY.register
@@ -308,12 +318,12 @@ class Nebius(clouds.Cloud):
             with a string on unset credential.
         """
         hints = None
-        if not …
+        if not nebius_profile_in_aws_cred_and_config():
             hints = (f'[{nebius.NEBIUS_PROFILE_NAME}] profile '
                      'is not set in ~/.aws/credentials.')
         if hints:
             hints += ' Run the following commands:'
-            if not …
+            if not nebius_profile_in_aws_cred_and_config():
                 hints += (
                     f'\n{_INDENT_PREFIX} $ pip install boto3'
                     f'\n{_INDENT_PREFIX} $ aws configure --profile nebius')
@@ -329,7 +339,7 @@ class Nebius(clouds.Cloud):
             for filename in _CREDENTIAL_FILES
         }
         credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
-        …
+        credential_file_mounts['~/.aws/config'] = '~/.aws/config'
         return credential_file_mounts
 
     @classmethod
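The new `nebius_profile_in_aws_cred_and_config()` helper returns True only when a `[nebius]` section (the default `NEBIUS_PROFILE_NAME`) exists in `~/.aws/credentials` and a matching `[profile nebius]` section exists in `~/.aws/config`, which is also why `~/.aws/config` is now added to the credential file mounts. A small sketch of an equivalent local check, using `configparser` rather than the diff's substring scan:

```python
# Illustrative only: NOT the implementation in the diff (which scans the raw
# files for the section headers), just an equivalent way to verify the same
# condition locally.
import configparser
import os


def has_nebius_profile(profile: str = 'nebius') -> bool:
    creds = configparser.ConfigParser()
    creds.read(os.path.expanduser('~/.aws/credentials'))
    config = configparser.ConfigParser()
    config.read(os.path.expanduser('~/.aws/config'))
    # credentials uses a bare [nebius] section; config uses [profile nebius].
    return profile in creds and f'profile {profile}' in config


if __name__ == '__main__':
    # True only after e.g. `aws configure --profile nebius` has written both files.
    print(has_nebius_profile())
```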
sky/clouds/service_catalog/data_fetchers/fetch_ibm.py
CHANGED
@@ -1,4 +1,4 @@
-"""A script that generates the …
+"""A script that generates the IBM Cloud catalog.
 
 Usage:
 python fetch_ibm.py [-h] [--api-key API_KEY]
@@ -19,7 +19,6 @@ import yaml
 
 TOKEN_ENDPOINT = 'https://iam.cloud.ibm.com/identity/token'
 REGIONS_ENDPOINT = f'https://us-south.iaas.cloud.ibm.com/v1/regions?version={datetime.today().strftime("%Y-%m-%d")}&generation=2'  # pylint: disable=line-too-long
-ENDPOINT = 'https://cloud.lambdalabs.com/api/v1/instance-types'
 DEFAULT_IBM_CREDENTIALS_PATH = os.path.expanduser('~/.ibm/credentials.yaml')
 
 
sky/clouds/service_catalog/gcp_catalog.py
CHANGED
@@ -106,6 +106,16 @@ _ACC_INSTANCE_TYPE_DICTS = {
         8: ['a3-megagpu-8g'],
     }
 }
+# Enable GPU type inference from instance types
+_INSTANCE_TYPE_TO_ACC = {
+    instance_type: {
+        acc_name: acc_count
+    } for acc_name, acc_count_to_instance_type in
+    _ACC_INSTANCE_TYPE_DICTS.items()
+    for acc_count, instance_types in acc_count_to_instance_type.items()
+    for instance_type in instance_types
+}
+GCP_ACC_INSTANCE_TYPES = list(_INSTANCE_TYPE_TO_ACC.keys())
 
 # Number of CPU cores per GPU based on the AWS setting.
 # GCP A100 has its own instance type mapping.
@@ -270,6 +280,26 @@ def get_default_instance_type(
                              memory_gb_or_ratio)
 
 
+def get_accelerators_from_instance_type(
+        instance_type: str) -> Optional[Dict[str, int]]:
+    """Infer the GPU type from the instance type.
+
+    This inference logic is GCP-specific. Unlike other clouds, we don't call
+    the internal implementation defined in common.py.
+
+    Args:
+        instance_type: the instance type to use.
+
+    Returns:
+        A dictionary mapping from the accelerator name to the accelerator count.
+    """
+    if instance_type in GCP_ACC_INSTANCE_TYPES:
+        return _INSTANCE_TYPE_TO_ACC[instance_type]
+    else:
+        # General CPU instance types don't come with pre-attached accelerators.
+        return None
+
+
 def get_instance_type_for_accelerator(
         acc_name: str,
         acc_count: int,
@@ -528,16 +558,13 @@ def check_accelerator_attachable_to_host(instance_type: str,
        attached to the host.
     """
     if accelerators is None:
-        …
-            f'{acc_name} GPUs. Either use other instance types or '
-            f'specify the accelerators as {acc_name}.')
-        return
+        if instance_type in GCP_ACC_INSTANCE_TYPES:
+            # Infer the GPU type from the instance type
+            accelerators = _INSTANCE_TYPE_TO_ACC[instance_type]
+        else:
+            # Skip the following checks if instance_type is a general CPU
+            # instance without accelerators
+            return
 
     acc = list(accelerators.items())
     assert len(acc) == 1, acc
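The comprehension added at the top flattens the nested accelerator → count → instance-type catalog into a per-instance-type lookup, which both `get_accelerators_from_instance_type` and the relaxed `check_accelerator_attachable_to_host` rely on. A self-contained sketch with a made-up two-entry catalog (the real `_ACC_INSTANCE_TYPE_DICTS` is much larger) shows the shape of the result:

```python
from typing import Dict, List, Optional

# Made-up subset for illustration; the real _ACC_INSTANCE_TYPE_DICTS in
# gcp_catalog.py covers many more accelerators and counts.
_ACC_INSTANCE_TYPE_DICTS: Dict[str, Dict[int, List[str]]] = {
    'A100-80GB': {
        1: ['a2-ultragpu-1g'],
        8: ['a2-ultragpu-8g'],
    },
}

# Same comprehension as the diff: one entry per instance type.
_INSTANCE_TYPE_TO_ACC = {
    instance_type: {
        acc_name: acc_count
    } for acc_name, acc_count_to_instance_type in
    _ACC_INSTANCE_TYPE_DICTS.items()
    for acc_count, instance_types in acc_count_to_instance_type.items()
    for instance_type in instance_types
}
GCP_ACC_INSTANCE_TYPES = list(_INSTANCE_TYPE_TO_ACC.keys())


def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, int]]:
    # Mirrors the new helper: CPU-only instance types map to None.
    return _INSTANCE_TYPE_TO_ACC.get(instance_type)


assert get_accelerators_from_instance_type('a2-ultragpu-8g') == {'A100-80GB': 8}
assert get_accelerators_from_instance_type('n2-standard-8') is None
```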
sky/core.py
CHANGED
@@ -372,12 +372,12 @@ def _start(
     with dag_lib.Dag():
         dummy_task = task_lib.Task().set_resources(handle.launched_resources)
         dummy_task.num_nodes = handle.launched_nodes
-    handle = backend.provision(dummy_task,
-        …
+    (handle, _) = backend.provision(dummy_task,
+                                    to_provision=handle.launched_resources,
+                                    dryrun=False,
+                                    stream_logs=True,
+                                    cluster_name=cluster_name,
+                                    retry_until_up=retry_until_up)
     storage_mounts = backend.get_storage_mounts_metadata(handle.cluster_name)
     # Passing all_file_mounts as None ensures the local source set in Storage
     # to not redundantly sync source to the bucket.
sky/data/data_utils.py
CHANGED
@@ -322,14 +322,9 @@ def create_r2_client(region: str = 'auto') -> Client:
     return cloudflare.client('s3', region)
 
 
-def create_nebius_client(…
-    """Helper method that connects to Boto3 client for Nebius Object Storage
-
-    Args:
-        region: str; Region for Nebius Object Storage
-    """
-    region = region if region is not None else nebius.DEFAULT_REGION
-    return nebius.client('s3', region)
+def create_nebius_client() -> Client:
+    """Helper method that connects to Boto3 client for Nebius Object Storage"""
+    return nebius.client('s3')
 
 
 def verify_r2_bucket(name: str) -> bool:
@@ -566,7 +561,8 @@ def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
         require_outputs=True,
         # We need to use bash as some of the cloud commands uses bash syntax,
         # such as [[ ... ]]
-        executable='/bin/bash')
+        executable='/bin/bash',
+        log_cmd=True)
     if access_denied_message in stderr:
         with ux_utils.print_exception_no_traceback():
             raise PermissionError('Failed to upload files to '
sky/data/mounting_utils.py
CHANGED
@@ -64,8 +64,8 @@ def get_s3_mount_cmd(bucket_name: str,
 
 
 def get_nebius_mount_cmd(nebius_profile_name: str,
-                         endpoint_url: str,
                          bucket_name: str,
+                         endpoint_url: str,
                          mount_path: str,
                          _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to install Nebius mount utility goofys."""
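The only change here is the parameter order: `bucket_name` now precedes `endpoint_url`, matching the updated call site in `NebiusStore.mount_command()`. A hedged sketch of a call in the new order, with placeholder values:

```python
from sky.data import mounting_utils

# Placeholder values for illustration only; the real call site passes the
# profile name, bucket, endpoint, and mount path from NebiusStore state.
mount_cmd = mounting_utils.get_nebius_mount_cmd(
    'nebius',                           # nebius_profile_name
    'my-bucket',                        # bucket_name (now the 2nd argument)
    'https://storage.example.invalid',  # endpoint_url (now the 3rd argument)
    '/mnt/data',                        # mount_path
)
```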
sky/data/storage.py
CHANGED
@@ -1616,9 +1616,25 @@ class S3Store(AbstractStore):
         # we exclude .git directory from the sync
         excluded_list = storage_utils.get_excluded_files(src_dir_path)
         excluded_list.append('.git/*')
+
+        # Process exclusion patterns to make them work correctly with aws
+        # s3 sync
+        processed_excludes = []
+        for excluded_path in excluded_list:
+            # Check if the path is a directory exclusion pattern
+            # For AWS S3 sync, directory patterns need to end with "/**" to
+            # exclude all contents
+            if (excluded_path.endswith('/') or os.path.isdir(
+                    os.path.join(src_dir_path, excluded_path.rstrip('/')))):
+                # Remove any trailing slash and add '/*' to exclude all
+                # contents
+                processed_excludes.append(f'{excluded_path.rstrip("/")}/*')
+            else:
+                processed_excludes.append(excluded_path)
+
         excludes = ' '.join([
             f'--exclude {shlex.quote(file_name)}'
-            for file_name in …
+            for file_name in processed_excludes
         ])
         src_dir_path = shlex.quote(src_dir_path)
         sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} '
@@ -4676,7 +4692,6 @@ class NebiusStore(AbstractStore):
                  _bucket_sub_path: Optional[str] = None):
         self.client: 'boto3.client.Client'
         self.bucket: 'StorageHandle'
-        self.region = region if region is not None else nebius.DEFAULT_REGION
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
 
@@ -4749,7 +4764,7 @@ class NebiusStore(AbstractStore):
             StorageBucketGetError: If fetching existing bucket fails
             StorageInitError: If general initialization fails.
         """
-        self.client = data_utils.create_nebius_client(…
+        self.client = data_utils.create_nebius_client()
         self.bucket, is_new_bucket = self._get_bucket()
         if self.is_sky_managed is None:
             # If is_sky_managed is not specified, then this is a new storage
@@ -4846,12 +4861,10 @@ class NebiusStore(AbstractStore):
             f'--include {shlex.quote(file_name)}'
             for file_name in file_names
         ])
-        endpoint_url = nebius.create_endpoint(self.region)
         base_dir_path = shlex.quote(base_dir_path)
         sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" '
                         f'{includes} {base_dir_path} '
                         f's3://{self.name}{sub_path} '
-                        f'--endpoint={endpoint_url} '
                         f'--profile={nebius.NEBIUS_PROFILE_NAME}')
         return sync_command
 
@@ -4863,12 +4876,10 @@ class NebiusStore(AbstractStore):
             f'--exclude {shlex.quote(file_name)}'
             for file_name in excluded_list
         ])
-        endpoint_url = nebius.create_endpoint(self.region)
         src_dir_path = shlex.quote(src_dir_path)
         sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} '
                         f'{src_dir_path} '
                         f's3://{self.name}{sub_path}/{dest_dir_name} '
-                        f'--endpoint={endpoint_url} '
                         f'--profile={nebius.NEBIUS_PROFILE_NAME}')
         return sync_command
 
@@ -4927,7 +4938,6 @@ class NebiusStore(AbstractStore):
         """
         nebius_s = nebius.resource('s3')
         bucket = nebius_s.Bucket(self.name)
-        endpoint_url = nebius.create_endpoint(self.region)
         try:
             # Try Public bucket case.
             # This line does not error out if the bucket is an external public
@@ -4942,7 +4952,6 @@ class NebiusStore(AbstractStore):
             # user.
             if error_code == '403':
                 command = (f'aws s3 ls s3://{self.name} '
-                           f'--endpoint={endpoint_url} '
                            f'--profile={nebius.NEBIUS_PROFILE_NAME}')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.StorageBucketGetError(
@@ -4954,7 +4963,7 @@ class NebiusStore(AbstractStore):
                 raise exceptions.StorageBucketGetError(
                     'Attempted to use a non-existent bucket as a source: '
                     f'{self.source}. Consider using `aws s3 ls '
-                    f's3://{self.name}…
+                    f's3://{self.name} '
                     f'--profile={nebius.NEBIUS_PROFILE_NAME}` to debug.')
 
         # If bucket cannot be found in both private and public settings,
@@ -4962,7 +4971,7 @@ class NebiusStore(AbstractStore):
         # Store object is being reconstructed for deletion or re-mount with
         # sky start, and error is raised instead.
         if self.sync_on_reconstruction:
-            bucket = self._create_nebius_bucket(self.name…
+            bucket = self._create_nebius_bucket(self.name)
             return bucket, True
         else:
             # Raised when Storage object is reconstructed for sky storage
@@ -4991,38 +5000,27 @@ class NebiusStore(AbstractStore):
             mount_path: str; Path to mount the bucket to.
         """
         install_cmd = mounting_utils.get_s3_mount_install_cmd()
-        endpoint_url = nebius.create_endpoint(self.region)
         nebius_profile_name = nebius.NEBIUS_PROFILE_NAME
+        endpoint_url = self.client.meta.endpoint_url
         mount_cmd = mounting_utils.get_nebius_mount_cmd(nebius_profile_name,
-                                                        endpoint_url,
                                                         self.bucket.name,
+                                                        endpoint_url,
                                                         mount_path,
                                                         self._bucket_sub_path)
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd)
 
-    def _create_nebius_bucket(self,
-                              …
-                              region='auto') -> StorageHandle:
-        """Creates S3 bucket with specific name in specific region
+    def _create_nebius_bucket(self, bucket_name: str) -> StorageHandle:
+        """Creates S3 bucket with specific name
 
         Args:
             bucket_name: str; Name of bucket
-            region: str; Region name, e.g. us-west-1, us-east-2
         Raises:
             StorageBucketCreateError: If bucket creation fails.
         """
         nebius_client = self.client
         try:
-            …
-                nebius_client.create_bucket(Bucket=bucket_name)
-            else:
-                location = {'LocationConstraint': region}
-                nebius_client.create_bucket(Bucket=bucket_name,
-                                            CreateBucketConfiguration=location)
-            logger.info(f' {colorama.Style.DIM}Created Nebius bucket '
-                        f'{bucket_name!r} in {region}'
-                        f'{colorama.Style.RESET_ALL}')
+            nebius_client.create_bucket(Bucket=bucket_name)
         except aws.botocore_exceptions().ClientError as e:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageBucketCreateError(
@@ -5070,9 +5068,7 @@ class NebiusStore(AbstractStore):
         # https://stackoverflow.com/questions/49239351/why-is-it-so-much-slower-to-delete-objects-in-aws-s3-than-it-is-to-create-them
         # The fastest way to delete is to run `aws s3 rb --force`,
        # which removes the bucket by force.
-        endpoint_url = nebius.create_endpoint(self.region)
         remove_command = (f'aws s3 rb s3://{bucket_name} --force '
-                          f'--endpoint {endpoint_url} '
                           f'--profile={nebius.NEBIUS_PROFILE_NAME}')
 
         success = self._execute_nebius_remove_command(
@@ -5094,10 +5090,8 @@ class NebiusStore(AbstractStore):
     def _delete_nebius_bucket_sub_path(self, bucket_name: str,
                                        sub_path: str) -> bool:
         """Deletes the sub path from the bucket."""
-        endpoint_url = nebius.create_endpoint(self.region)
         remove_command = (
             f'aws s3 rm s3://{bucket_name}/{sub_path}/ --recursive '
-            f'--endpoint {endpoint_url} '
             f'--profile={nebius.NEBIUS_PROFILE_NAME}')
         return self._execute_nebius_remove_command(
             remove_command, bucket_name, f'Removing objects from '
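The `S3Store` change pre-processes the exclusion list so that directory entries also exclude their contents under `aws s3 sync` (which matches patterns, not prefixes). A standalone sketch of that transformation applied to a hypothetical exclusion list:

```python
import os
from typing import List


def process_excludes(excluded_list: List[str], src_dir_path: str) -> List[str]:
    """Same rule as the diff: directory patterns get a trailing '/*'."""
    processed = []
    for excluded_path in excluded_list:
        if (excluded_path.endswith('/') or os.path.isdir(
                os.path.join(src_dir_path, excluded_path.rstrip('/')))):
            processed.append(f'{excluded_path.rstrip("/")}/*')
        else:
            processed.append(excluded_path)
    return processed


# Hypothetical input: 'logs/' is a directory entry, the rest are files/patterns.
print(process_excludes(['logs/', 'secret.env', '.git/*'], '/tmp/project'))
# -> ['logs/*', 'secret.env', '.git/*'] (unless those names happen to exist as
#    directories under /tmp/project, in which case they also get '/*')
```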
sky/data/storage_utils.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import pathlib
 import shlex
 import subprocess
-from typing import Any, Dict, List, Optional, TextIO, Union
+from typing import Any, Dict, List, Optional, Set, TextIO, Union
 import warnings
 import zipfile
 
@@ -71,7 +71,7 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
     """List files and patterns ignored by the .skyignore file
     in the given source directory.
     """
-    excluded_list: …
+    excluded_list: Set[str] = set()
     expand_src_dir_path = os.path.expanduser(src_dir_path)
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
@@ -95,12 +95,12 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
                     for i in range(len(matching_files)):
                         matching_files[i] = os.path.relpath(
                             matching_files[i], expand_src_dir_path)
-                    excluded_list.…
+                    excluded_list.update(matching_files)
     except IOError as e:
         logger.warning(f'Error reading {skyignore_path}: '
                        f'{common_utils.format_exception(e, use_bracket=True)}')
 
-    return excluded_list
+    return list(excluded_list)
 
 
 def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
@@ -111,8 +111,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     This will also be run for all submodules under the src_dir_path.
 
     Returns:
-        List[str] containing files and …
-        patterns…
+        List[str] containing files and folders to be ignored. There won't be any
+        patterns.
     """
     expand_src_dir_path = os.path.expanduser(src_dir_path)
 
@@ -210,10 +210,6 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
                 return []
 
             to_be_excluded = os.path.join(repo, item)
-            if item.endswith('/'):
-                # aws s3 sync and gsutil rsync require * to exclude
-                # files/dirs under the specified directory.
-                to_be_excluded += '*'
 
             excluded_list.append(to_be_excluded)
 
@@ -223,11 +219,21 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
 def get_excluded_files(src_dir_path: str) -> List[str]:
     # TODO: this could return a huge list of files,
     # should think of ways to optimize.
-    """List files and directories to be excluded.…
+    """List files and directories to be excluded.
+
+    Args:
+        src_dir_path (str): The path to the source directory.
+
+    Returns:
+        A list of relative paths to files and directories to be excluded from
+        the source directory.
+    """
     expand_src_dir_path = os.path.expanduser(src_dir_path)
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
     # Fail fast if the source is a file.
+    if not os.path.exists(expand_src_dir_path):
+        raise ValueError(f'{src_dir_path} does not exist.')
     if os.path.isfile(expand_src_dir_path):
         raise ValueError(f'{src_dir_path} is a file, not a directory.')
     if os.path.exists(skyignore_path):
@@ -235,12 +241,14 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
                      f'Excluded files to sync to cluster based on '
                      f'{constants.SKY_IGNORE_FILE}.'
                      f'{colorama.Style.RESET_ALL}')
-        …
+        excluded_paths = get_excluded_files_from_skyignore(src_dir_path)
+    else:
+        logger.debug(f' {colorama.Style.DIM}'
+                     f'Excluded files to sync to cluster based on '
+                     f'{constants.GIT_IGNORE_FILE}.'
+                     f'{colorama.Style.RESET_ALL}')
+        excluded_paths = get_excluded_files_from_gitignore(src_dir_path)
+    return excluded_paths
 
 
 def zip_files_and_folders(items: List[str],
@@ -277,7 +285,8 @@ def zip_files_and_folders(items: List[str],
                 zipf.write(item)
             elif os.path.isdir(item):
                 excluded_files = set([
-                    os.path.join(item, f…
+                    os.path.join(item, f.rstrip('/'))
+                    for f in get_excluded_files(item)
                 ])
                 for root, dirs, files in os.walk(item, followlinks=False):
                     # Modify dirs in-place to control os.walk()'s traversal
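With the gitignore helper no longer appending `*`, the entries handed back by `get_excluded_files()` can be plain files, directories, or directories with a trailing slash, and each consumer now normalizes them itself. The `zip_files_and_folders` change does it by stripping the trailing slash before joining, so the paths compare equal to what `os.walk()` yields for directories; a small sketch of that normalization with hypothetical entries:

```python
import os

# Hypothetical exclusion entries: directory entries may keep a trailing slash
# now that the '*'-appending was removed from the gitignore helper.
excluded = ['logs/', 'build', 'notes.txt']
item = '/tmp/project'

# Same normalization as the diff: strip the trailing slash before joining.
excluded_files = {os.path.join(item, f.rstrip('/')) for f in excluded}
print(sorted(excluded_files))
# -> ['/tmp/project/build', '/tmp/project/logs', '/tmp/project/notes.txt']
```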
sky/execution.py
CHANGED
@@ -159,9 +159,9 @@ def _execute(
       no_setup: bool; whether to skip setup commands or not when (re-)launching.
       clone_disk_from: Optional[str]; if set, clone the disk from the specified
         cluster.
-      …
+      skip_unnecessary_provisioning: bool; if True, compare the calculated
         cluster config to the current cluster's config. If they match, shortcut
-        provisioning even if we have Stage.PROVISION.
+        provisioning and setup, even if we have Stage.PROVISION and Stage.SETUP.
 
     Returns:
       job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -303,12 +303,13 @@ def _execute(
     task.sync_storage_mounts()
 
     try:
+        provisioning_skipped = False
         if Stage.PROVISION in stages:
             assert handle is None or skip_unnecessary_provisioning, (
                 'Provisioning requested, but handle is already set. PROVISION '
                 'should be excluded from stages or '
                 'skip_unecessary_provisioning should be set. ')
-            handle = backend.provision(
+            (handle, provisioning_skipped) = backend.provision(
                 task,
                 task.best_resources,
                 dryrun=dryrun,
@@ -341,7 +342,11 @@ def _execute(
         if no_setup:
             logger.info('Setup commands skipped.')
         elif Stage.SETUP in stages and not dryrun:
-            …
+            if skip_unnecessary_provisioning and provisioning_skipped:
+                logger.debug('Unnecessary provisioning was skipped, so '
+                             'skipping setup as well.')
+            else:
+                backend.setup(handle, task, detach_setup=detach_setup)
 
         if Stage.PRE_EXEC in stages and not dryrun:
             if idle_minutes_to_autostop is not None:
@@ -523,6 +528,8 @@ def launch(
             Stage.PROVISION,
             Stage.SYNC_WORKDIR,
             Stage.SYNC_FILE_MOUNTS,
+            # Setup will be skipped if provisioning was skipped.
+            Stage.SETUP,
             Stage.PRE_EXEC,
             Stage.EXEC,
             Stage.DOWN,
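Taken together, these hunks change `backend.provision` to also report whether provisioning was actually skipped, and reuse that flag to skip `Stage.SETUP` in the same pass. A toy, self-contained illustration of the new contract (the backend below is a stand-in, not SkyPilot's `CloudVmRayBackend`):

```python
# Toy illustration only: provision() returns a tuple whose second element can
# short-circuit setup when the existing cluster already matches the request.
class ToyBackend:

    def provision(self, task, cluster_already_matches: bool):
        provisioning_skipped = cluster_already_matches
        handle = object()  # stand-in for a resource handle
        return handle, provisioning_skipped

    def setup(self, handle, task):
        print('running setup commands')


backend = ToyBackend()
skip_unnecessary_provisioning = True
handle, provisioning_skipped = backend.provision(task=None,
                                                 cluster_already_matches=True)
if skip_unnecessary_provisioning and provisioning_skipped:
    print('provisioning skipped -> skipping setup as well')
else:
    backend.setup(handle, task=None)
```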
sky/jobs/client/sdk.py
CHANGED
@@ -82,6 +82,7 @@ def launch(
         f'{server_common.get_server_url()}/jobs/launch',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
     )
     return server_common.get_request_id(response)
 
@@ -138,6 +139,7 @@ def queue(refresh: bool,
         f'{server_common.get_server_url()}/jobs/queue',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
     )
     return server_common.get_request_id(response=response)
 
@@ -177,6 +179,7 @@ def cancel(
         f'{server_common.get_server_url()}/jobs/cancel',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
     )
     return server_common.get_request_id(response=response)
 
@@ -224,6 +227,7 @@ def tail_logs(name: Optional[str] = None,
         json=json.loads(body.model_dump_json()),
         stream=True,
         timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
     return sdk.stream_response(request_id, response, output_stream)
@@ -267,6 +271,7 @@ def download_logs(
         f'{server_common.get_server_url()}/jobs/download_logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
     )
     job_id_remote_path_dict = sdk.stream_and_get(
         server_common.get_request_id(response))
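Every jobs client call now forwards the cookies stored for the API server, so authenticated deployments keep working across the jobs endpoints. A minimal sketch of the pattern with `requests`, using placeholder stand-ins for `server_common.get_server_url()` and `get_api_cookie_jar()`:

```python
import requests

# Placeholder stand-ins; the real helpers live in sky/server/common.py and
# load cookies persisted for the API server.
server_url = 'http://127.0.0.1:46580'
cookie_jar = requests.cookies.RequestsCookieJar()

response = requests.post(
    f'{server_url}/jobs/queue',
    json={'refresh': False},  # placeholder payload
    timeout=(5, None),
    cookies=cookie_jar,  # the new argument threaded through every call
)
print(response.status_code)
```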
sky/jobs/server/server.py
CHANGED
@@ -161,7 +161,11 @@ async def dashboard(request: fastapi.Request,
                 response = await client.request('GET',
                                                 dashboard_url,
                                                 timeout=5)
-                …
+                if response.is_success:
+                    break  # Connection successful, proceed with the request
+                # Raise an HTTPException here which will be caught by the
+                # following except block to retry with new connection
+                response.raise_for_status()
             except Exception as e:  # pylint: disable=broad-except
                 # We catch all exceptions to gracefully handle unknown
                 # errors and retry or raise an HTTPException to the client.
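The dashboard proxy previously treated any response as a success; it now breaks out of the retry loop only on a 2xx status and otherwise raises so the surrounding `except` re-establishes the connection. A hedged sketch of that shape with `httpx` (the retry budget and URL handling are simplified stand-ins, not the server's actual loop):

```python
import httpx


async def fetch_dashboard(dashboard_url: str) -> httpx.Response:
    # Simplified stand-in for the proxy's retry loop.
    for _ in range(3):  # illustrative retry budget
        try:
            async with httpx.AsyncClient() as client:
                response = await client.request('GET', dashboard_url, timeout=5)
                if response.is_success:
                    return response  # connection successful, proceed
                # Non-2xx: raise so the except below retries with a fresh
                # connection, mirroring the diff.
                response.raise_for_status()
        except Exception:  # pylint: disable=broad-except
            continue
    raise RuntimeError('dashboard unreachable after retries')
```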
sky/optimizer.py
CHANGED
@@ -6,6 +6,7 @@ import typing
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
 
 import colorama
+import numpy as np
 import prettytable
 
 from sky import check as sky_check
@@ -28,12 +29,10 @@ from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import networkx as nx
-    import numpy as np
 
     from sky import dag as dag_lib
 else:
     nx = adaptors_common.LazyImport('networkx')
-    np = adaptors_common.LazyImport('numpy')
 
 logger = sky_logging.init_logger(__name__)
 