skypilot-nightly 1.0.0.dev20250408__py3-none-any.whl → 1.0.0.dev20250411__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +1 -1
  3. sky/adaptors/nebius.py +5 -27
  4. sky/backends/backend.py +9 -7
  5. sky/backends/cloud_vm_ray_backend.py +7 -7
  6. sky/backends/local_docker_backend.py +3 -3
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +58 -26
  9. sky/cloud_stores.py +0 -4
  10. sky/clouds/do.py +4 -5
  11. sky/clouds/gcp.py +5 -3
  12. sky/clouds/nebius.py +22 -12
  13. sky/clouds/service_catalog/data_fetchers/fetch_ibm.py +1 -2
  14. sky/clouds/service_catalog/gcp_catalog.py +37 -10
  15. sky/core.py +6 -6
  16. sky/data/data_utils.py +5 -9
  17. sky/data/mounting_utils.py +1 -1
  18. sky/data/storage.py +25 -31
  19. sky/data/storage_utils.py +27 -18
  20. sky/execution.py +11 -4
  21. sky/jobs/client/sdk.py +5 -0
  22. sky/jobs/server/server.py +5 -1
  23. sky/optimizer.py +1 -2
  24. sky/provision/do/utils.py +19 -16
  25. sky/provision/gcp/config.py +30 -20
  26. sky/serve/client/sdk.py +6 -0
  27. sky/server/common.py +16 -1
  28. sky/server/constants.py +5 -0
  29. sky/setup_files/dependencies.py +1 -1
  30. sky/skylet/log_lib.py +4 -0
  31. sky/skypilot_config.py +19 -30
  32. sky/task.py +27 -7
  33. sky/utils/schemas.py +25 -7
  34. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/METADATA +2 -2
  35. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/RECORD +39 -39
  36. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/WHEEL +0 -0
  37. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/entry_points.txt +0 -0
  38. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/licenses/LICENSE +0 -0
  39. {skypilot_nightly-1.0.0.dev20250408.dist-info → skypilot_nightly-1.0.0.dev20250411.dist-info}/top_level.txt +0 -0
sky/clouds/nebius.py CHANGED
@@ -24,18 +24,28 @@ _CREDENTIAL_FILES = [
 _INDENT_PREFIX = ' '
 
 
-def nebius_profile_in_aws_cred() -> bool:
-    """Checks if Nebius Object Storage profile is set in aws credentials."""
-
-    profile_path = os.path.expanduser('~/.aws/credentials')
-    nebius_profile_exists = False
-    if os.path.isfile(profile_path):
-        with open(profile_path, 'r', encoding='utf-8') as file:
+def nebius_profile_in_aws_cred_and_config() -> bool:
+    """Checks if Nebius Object Storage profile is set in aws credentials
+    and profile."""
+
+    credentials_path = os.path.expanduser('~/.aws/credentials')
+    nebius_profile_exists_in_credentials = False
+    if os.path.isfile(credentials_path):
+        with open(credentials_path, 'r', encoding='utf-8') as file:
             for line in file:
                 if f'[{nebius.NEBIUS_PROFILE_NAME}]' in line:
-                    nebius_profile_exists = True
+                    nebius_profile_exists_in_credentials = True
+
+    config_path = os.path.expanduser('~/.aws/config')
+    nebius_profile_exists_in_config = False
+    if os.path.isfile(config_path):
+        with open(config_path, 'r', encoding='utf-8') as file:
+            for line in file:
+                if f'[profile {nebius.NEBIUS_PROFILE_NAME}]' in line:
+                    nebius_profile_exists_in_config = True
 
-    return nebius_profile_exists
+    return (nebius_profile_exists_in_credentials and
+            nebius_profile_exists_in_config)
 
 
 @registry.CLOUD_REGISTRY.register
@@ -308,12 +318,12 @@ class Nebius(clouds.Cloud):
         with a string on unset credential.
         """
         hints = None
-        if not nebius_profile_in_aws_cred():
+        if not nebius_profile_in_aws_cred_and_config():
            hints = (f'[{nebius.NEBIUS_PROFILE_NAME}] profile '
                     'is not set in ~/.aws/credentials.')
        if hints:
            hints += ' Run the following commands:'
-            if not nebius_profile_in_aws_cred():
+            if not nebius_profile_in_aws_cred_and_config():
                hints += (
                    f'\n{_INDENT_PREFIX} $ pip install boto3'
                    f'\n{_INDENT_PREFIX} $ aws configure --profile nebius')
@@ -329,7 +339,7 @@ class Nebius(clouds.Cloud):
            for filename in _CREDENTIAL_FILES
        }
        credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
-
+        credential_file_mounts['~/.aws/config'] = '~/.aws/config'
        return credential_file_mounts
 
    @classmethod
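The stricter check above requires the Nebius profile in both AWS files, which is what `aws configure --profile nebius` produces: a `[nebius]` section in ~/.aws/credentials and a `[profile nebius]` section in ~/.aws/config. Below is a rough, illustrative equivalent of the check using configparser instead of line scanning; it is not the shipped implementation, just a sketch of the same idea.

import configparser
import os

# Illustrative only: verify both AWS files carry the Nebius profile.
PROFILE = 'nebius'

credentials = configparser.ConfigParser()
credentials.read(os.path.expanduser('~/.aws/credentials'))  # ignored if missing

config = configparser.ConfigParser()
config.read(os.path.expanduser('~/.aws/config'))  # ignored if missing

profile_ok = (credentials.has_section(PROFILE) and
              config.has_section(f'profile {PROFILE}'))
print('Nebius profile configured in both files:', profile_ok)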
sky/clouds/service_catalog/data_fetchers/fetch_ibm.py CHANGED
@@ -1,4 +1,4 @@
-"""A script that generates the Lambda Cloud catalog.
+"""A script that generates the IBM Cloud catalog.
 
 Usage:
     python fetch_ibm.py [-h] [--api-key API_KEY]
@@ -19,7 +19,6 @@ import yaml
 
 TOKEN_ENDPOINT = 'https://iam.cloud.ibm.com/identity/token'
 REGIONS_ENDPOINT = f'https://us-south.iaas.cloud.ibm.com/v1/regions?version={datetime.today().strftime("%Y-%m-%d")}&generation=2'  # pylint: disable=line-too-long
-ENDPOINT = 'https://cloud.lambdalabs.com/api/v1/instance-types'
 DEFAULT_IBM_CREDENTIALS_PATH = os.path.expanduser('~/.ibm/credentials.yaml')
 
 
sky/clouds/service_catalog/gcp_catalog.py CHANGED
@@ -106,6 +106,16 @@ _ACC_INSTANCE_TYPE_DICTS = {
        8: ['a3-megagpu-8g'],
    }
 }
+# Enable GPU type inference from instance types
+_INSTANCE_TYPE_TO_ACC = {
+    instance_type: {
+        acc_name: acc_count
+    } for acc_name, acc_count_to_instance_type in
+    _ACC_INSTANCE_TYPE_DICTS.items()
+    for acc_count, instance_types in acc_count_to_instance_type.items()
+    for instance_type in instance_types
+}
+GCP_ACC_INSTANCE_TYPES = list(_INSTANCE_TYPE_TO_ACC.keys())
 
 # Number of CPU cores per GPU based on the AWS setting.
 # GCP A100 has its own instance type mapping.
@@ -270,6 +280,26 @@ def get_default_instance_type(
                                         memory_gb_or_ratio)
 
 
+def get_accelerators_from_instance_type(
+        instance_type: str) -> Optional[Dict[str, int]]:
+    """Infer the GPU type from the instance type.
+
+    This inference logic is GCP-specific. Unlike other clouds, we don't call
+    the internal implementation defined in common.py.
+
+    Args:
+        instance_type: the instance type to use.
+
+    Returns:
+        A dictionary mapping from the accelerator name to the accelerator count.
+    """
+    if instance_type in GCP_ACC_INSTANCE_TYPES:
+        return _INSTANCE_TYPE_TO_ACC[instance_type]
+    else:
+        # General CPU instance types don't come with pre-attached accelerators.
+        return None
+
+
 def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
@@ -528,16 +558,13 @@ def check_accelerator_attachable_to_host(instance_type: str,
        attached to the host.
    """
    if accelerators is None:
-        for acc_name, val in _ACC_INSTANCE_TYPE_DICTS.items():
-            if instance_type in sum(val.values(), []):
-                # NOTE: While it is allowed to use A2/G2 VMs as CPU-only nodes,
-                # we exclude this case as it is uncommon and undesirable.
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesMismatchError(
-                        f'{instance_type} instance types should be used with '
-                        f'{acc_name} GPUs. Either use other instance types or '
-                        f'specify the accelerators as {acc_name}.')
-        return
+        if instance_type in GCP_ACC_INSTANCE_TYPES:
+            # Infer the GPU type from the instance type
+            accelerators = _INSTANCE_TYPE_TO_ACC[instance_type]
+        else:
+            # Skip the following checks if instance_type is a general CPU
+            # instance without accelerators
+            return
 
    acc = list(accelerators.items())
    assert len(acc) == 1, acc
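To see what the new `_INSTANCE_TYPE_TO_ACC` comprehension produces, here is a minimal, self-contained sketch using a toy subset of the accelerator table (the real `_ACC_INSTANCE_TYPE_DICTS` in gcp_catalog.py is much larger):

# Toy subset: accelerator name -> count -> instance types that ship with it.
acc_instance_type_dicts = {
    'A100': {
        1: ['a2-highgpu-1g'],
        8: ['a2-highgpu-8g'],
    },
    'L4': {
        1: ['g2-standard-4'],
    },
}

# Same inversion as _INSTANCE_TYPE_TO_ACC: map each instance type to
# {accelerator name: accelerator count}.
instance_type_to_acc = {
    instance_type: {acc_name: acc_count}
    for acc_name, count_to_types in acc_instance_type_dicts.items()
    for acc_count, instance_types in count_to_types.items()
    for instance_type in instance_types
}

print(instance_type_to_acc['a2-highgpu-1g'])  # {'A100': 1}
print(instance_type_to_acc['g2-standard-4'])  # {'L4': 1}

With this mapping in place, `get_accelerators_from_instance_type` and `check_accelerator_attachable_to_host` can both infer the GPU type directly from a GCP instance type instead of raising a mismatch error.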
sky/core.py CHANGED
@@ -372,12 +372,12 @@ def _start(
    with dag_lib.Dag():
        dummy_task = task_lib.Task().set_resources(handle.launched_resources)
        dummy_task.num_nodes = handle.launched_nodes
-    handle = backend.provision(dummy_task,
-                               to_provision=handle.launched_resources,
-                               dryrun=False,
-                               stream_logs=True,
-                               cluster_name=cluster_name,
-                               retry_until_up=retry_until_up)
+    (handle, _) = backend.provision(dummy_task,
+                                    to_provision=handle.launched_resources,
+                                    dryrun=False,
+                                    stream_logs=True,
+                                    cluster_name=cluster_name,
+                                    retry_until_up=retry_until_up)
    storage_mounts = backend.get_storage_mounts_metadata(handle.cluster_name)
    # Passing all_file_mounts as None ensures the local source set in Storage
    # to not redundantly sync source to the bucket.
sky/data/data_utils.py CHANGED
@@ -322,14 +322,9 @@ def create_r2_client(region: str = 'auto') -> Client:
    return cloudflare.client('s3', region)
 
 
-def create_nebius_client(region: Optional[str]) -> Client:
-    """Helper method that connects to Boto3 client for Nebius Object Storage
-
-    Args:
-        region: str; Region for Nebius Object Storage
-    """
-    region = region if region is not None else nebius.DEFAULT_REGION
-    return nebius.client('s3', region)
+def create_nebius_client() -> Client:
+    """Helper method that connects to Boto3 client for Nebius Object Storage"""
+    return nebius.client('s3')
 
 
 def verify_r2_bucket(name: str) -> bool:
@@ -566,7 +561,8 @@ def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
        require_outputs=True,
        # We need to use bash as some of the cloud commands uses bash syntax,
        # such as [[ ... ]]
-        executable='/bin/bash')
+        executable='/bin/bash',
+        log_cmd=True)
    if access_denied_message in stderr:
        with ux_utils.print_exception_no_traceback():
            raise PermissionError('Failed to upload files to '
sky/data/mounting_utils.py CHANGED
@@ -64,8 +64,8 @@ def get_s3_mount_cmd(bucket_name: str,
 
 
 def get_nebius_mount_cmd(nebius_profile_name: str,
-                         endpoint_url: str,
                         bucket_name: str,
+                         endpoint_url: str,
                         mount_path: str,
                         _bucket_sub_path: Optional[str] = None) -> str:
    """Returns a command to install Nebius mount utility goofys."""
sky/data/storage.py CHANGED
@@ -1616,9 +1616,25 @@ class S3Store(AbstractStore):
        # we exclude .git directory from the sync
        excluded_list = storage_utils.get_excluded_files(src_dir_path)
        excluded_list.append('.git/*')
+
+        # Process exclusion patterns to make them work correctly with aws
+        # s3 sync
+        processed_excludes = []
+        for excluded_path in excluded_list:
+            # Check if the path is a directory exclusion pattern
+            # For AWS S3 sync, directory patterns need to end with "/**" to
+            # exclude all contents
+            if (excluded_path.endswith('/') or os.path.isdir(
+                    os.path.join(src_dir_path, excluded_path.rstrip('/')))):
+                # Remove any trailing slash and add '/*' to exclude all
+                # contents
+                processed_excludes.append(f'{excluded_path.rstrip("/")}/*')
+            else:
+                processed_excludes.append(excluded_path)
+
        excludes = ' '.join([
            f'--exclude {shlex.quote(file_name)}'
-            for file_name in excluded_list
+            for file_name in processed_excludes
        ])
        src_dir_path = shlex.quote(src_dir_path)
        sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} '
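A standalone sketch of the exclusion rewriting above, with made-up paths: directory entries gain a trailing '/*' so `aws s3 sync --exclude` skips their contents, while plain file entries pass through unchanged. (The shipped code additionally treats entries that are existing directories, even without a trailing slash, the same way.)

import shlex

# Hypothetical output of get_excluded_files(): a directory, a file, and .git.
excluded_list = ['logs/', 'notes.txt', '.git/*']

processed = []
for path in excluded_list:
    if path.endswith('/'):
        # Directory pattern: exclude everything under it.
        processed.append(f'{path.rstrip("/")}/*')
    else:
        processed.append(path)

excludes = ' '.join(f'--exclude {shlex.quote(p)}' for p in processed)
print(excludes)
# --exclude 'logs/*' --exclude notes.txt --exclude '.git/*'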
@@ -4676,7 +4692,6 @@ class NebiusStore(AbstractStore):
                 _bucket_sub_path: Optional[str] = None):
        self.client: 'boto3.client.Client'
        self.bucket: 'StorageHandle'
-        self.region = region if region is not None else nebius.DEFAULT_REGION
        super().__init__(name, source, region, is_sky_managed,
                         sync_on_reconstruction, _bucket_sub_path)
 
@@ -4749,7 +4764,7 @@
            StorageBucketGetError: If fetching existing bucket fails
            StorageInitError: If general initialization fails.
        """
-        self.client = data_utils.create_nebius_client(self.region)
+        self.client = data_utils.create_nebius_client()
        self.bucket, is_new_bucket = self._get_bucket()
        if self.is_sky_managed is None:
            # If is_sky_managed is not specified, then this is a new storage
@@ -4846,12 +4861,10 @@
            f'--include {shlex.quote(file_name)}'
            for file_name in file_names
        ])
-        endpoint_url = nebius.create_endpoint(self.region)
        base_dir_path = shlex.quote(base_dir_path)
        sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" '
                        f'{includes} {base_dir_path} '
                        f's3://{self.name}{sub_path} '
-                        f'--endpoint={endpoint_url} '
                        f'--profile={nebius.NEBIUS_PROFILE_NAME}')
        return sync_command
 
@@ -4863,12 +4876,10 @@
            f'--exclude {shlex.quote(file_name)}'
            for file_name in excluded_list
        ])
-        endpoint_url = nebius.create_endpoint(self.region)
        src_dir_path = shlex.quote(src_dir_path)
        sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} '
                        f'{src_dir_path} '
                        f's3://{self.name}{sub_path}/{dest_dir_name} '
-                        f'--endpoint={endpoint_url} '
                        f'--profile={nebius.NEBIUS_PROFILE_NAME}')
        return sync_command
 
@@ -4927,7 +4938,6 @@
        """
        nebius_s = nebius.resource('s3')
        bucket = nebius_s.Bucket(self.name)
-        endpoint_url = nebius.create_endpoint(self.region)
        try:
            # Try Public bucket case.
            # This line does not error out if the bucket is an external public
@@ -4942,7 +4952,6 @@
            # user.
            if error_code == '403':
                command = (f'aws s3 ls s3://{self.name} '
-                           f'--endpoint={endpoint_url} '
                           f'--profile={nebius.NEBIUS_PROFILE_NAME}')
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageBucketGetError(
@@ -4954,7 +4963,7 @@
                raise exceptions.StorageBucketGetError(
                    'Attempted to use a non-existent bucket as a source: '
                    f'{self.source}. Consider using `aws s3 ls '
-                    f's3://{self.name} --endpoint={endpoint_url}'
+                    f's3://{self.name} '
                    f'--profile={nebius.NEBIUS_PROFILE_NAME}` to debug.')
 
        # If bucket cannot be found in both private and public settings,
@@ -4962,7 +4971,7 @@
        # Store object is being reconstructed for deletion or re-mount with
        # sky start, and error is raised instead.
        if self.sync_on_reconstruction:
-            bucket = self._create_nebius_bucket(self.name, self.region)
+            bucket = self._create_nebius_bucket(self.name)
            return bucket, True
        else:
            # Raised when Storage object is reconstructed for sky storage
@@ -4991,38 +5000,27 @@
            mount_path: str; Path to mount the bucket to.
        """
        install_cmd = mounting_utils.get_s3_mount_install_cmd()
-        endpoint_url = nebius.create_endpoint(self.region)
        nebius_profile_name = nebius.NEBIUS_PROFILE_NAME
+        endpoint_url = self.client.meta.endpoint_url
        mount_cmd = mounting_utils.get_nebius_mount_cmd(nebius_profile_name,
-                                                        endpoint_url,
                                                        self.bucket.name,
+                                                        endpoint_url,
                                                        mount_path,
                                                        self._bucket_sub_path)
        return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                   mount_cmd)
 
-    def _create_nebius_bucket(self,
-                              bucket_name: str,
-                              region='auto') -> StorageHandle:
-        """Creates S3 bucket with specific name in specific region
+    def _create_nebius_bucket(self, bucket_name: str) -> StorageHandle:
+        """Creates S3 bucket with specific name
 
        Args:
          bucket_name: str; Name of bucket
-          region: str; Region name, e.g. us-west-1, us-east-2
        Raises:
          StorageBucketCreateError: If bucket creation fails.
        """
        nebius_client = self.client
        try:
-            if region is None:
-                nebius_client.create_bucket(Bucket=bucket_name)
-            else:
-                location = {'LocationConstraint': region}
-                nebius_client.create_bucket(Bucket=bucket_name,
-                                            CreateBucketConfiguration=location)
-                logger.info(f' {colorama.Style.DIM}Created Nebius bucket '
-                            f'{bucket_name!r} in {region}'
-                            f'{colorama.Style.RESET_ALL}')
+            nebius_client.create_bucket(Bucket=bucket_name)
        except aws.botocore_exceptions().ClientError as e:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.StorageBucketCreateError(
@@ -5070,9 +5068,7 @@
        # https://stackoverflow.com/questions/49239351/why-is-it-so-much-slower-to-delete-objects-in-aws-s3-than-it-is-to-create-them
        # The fastest way to delete is to run `aws s3 rb --force`,
        # which removes the bucket by force.
-        endpoint_url = nebius.create_endpoint(self.region)
        remove_command = (f'aws s3 rb s3://{bucket_name} --force '
-                          f'--endpoint {endpoint_url} '
                          f'--profile={nebius.NEBIUS_PROFILE_NAME}')
 
        success = self._execute_nebius_remove_command(
@@ -5094,10 +5090,8 @@
    def _delete_nebius_bucket_sub_path(self, bucket_name: str,
                                       sub_path: str) -> bool:
        """Deletes the sub path from the bucket."""
-        endpoint_url = nebius.create_endpoint(self.region)
        remove_command = (
            f'aws s3 rm s3://{bucket_name}/{sub_path}/ --recursive '
-            f'--endpoint {endpoint_url} '
            f'--profile={nebius.NEBIUS_PROFILE_NAME}')
        return self._execute_nebius_remove_command(
            remove_command, bucket_name, f'Removing objects from '
sky/data/storage_utils.py CHANGED
@@ -4,7 +4,7 @@ import os
 import pathlib
 import shlex
 import subprocess
-from typing import Any, Dict, List, Optional, TextIO, Union
+from typing import Any, Dict, List, Optional, Set, TextIO, Union
 import warnings
 import zipfile
 
@@ -71,7 +71,7 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
    """List files and patterns ignored by the .skyignore file
    in the given source directory.
    """
-    excluded_list: List[str] = []
+    excluded_list: Set[str] = set()
    expand_src_dir_path = os.path.expanduser(src_dir_path)
    skyignore_path = os.path.join(expand_src_dir_path,
                                  constants.SKY_IGNORE_FILE)
@@ -95,12 +95,12 @@ def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
                for i in range(len(matching_files)):
                    matching_files[i] = os.path.relpath(
                        matching_files[i], expand_src_dir_path)
-                excluded_list.extend(matching_files)
+                excluded_list.update(matching_files)
    except IOError as e:
        logger.warning(f'Error reading {skyignore_path}: '
                       f'{common_utils.format_exception(e, use_bracket=True)}')
 
-    return excluded_list
+    return list(excluded_list)
 
 
 def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
@@ -111,8 +111,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
    This will also be run for all submodules under the src_dir_path.
 
    Returns:
-        List[str] containing files and patterns to be ignored. Some of the
-        patterns include, **/mydir/*.txt, !myfile.log, or file-*/.
+        List[str] containing files and folders to be ignored. There won't be any
+        patterns.
    """
    expand_src_dir_path = os.path.expanduser(src_dir_path)
 
@@ -210,10 +210,6 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
            return []
 
        to_be_excluded = os.path.join(repo, item)
-        if item.endswith('/'):
-            # aws s3 sync and gsutil rsync require * to exclude
-            # files/dirs under the specified directory.
-            to_be_excluded += '*'
 
        excluded_list.append(to_be_excluded)
 
@@ -223,11 +219,21 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
 def get_excluded_files(src_dir_path: str) -> List[str]:
    # TODO: this could return a huge list of files,
    # should think of ways to optimize.
-    """List files and directories to be excluded."""
+    """List files and directories to be excluded.
+
+    Args:
+        src_dir_path (str): The path to the source directory.
+
+    Returns:
+        A list of relative paths to files and directories to be excluded from
+        the source directory.
+    """
    expand_src_dir_path = os.path.expanduser(src_dir_path)
    skyignore_path = os.path.join(expand_src_dir_path,
                                  constants.SKY_IGNORE_FILE)
    # Fail fast if the source is a file.
+    if not os.path.exists(expand_src_dir_path):
+        raise ValueError(f'{src_dir_path} does not exist.')
    if os.path.isfile(expand_src_dir_path):
        raise ValueError(f'{src_dir_path} is a file, not a directory.')
    if os.path.exists(skyignore_path):
@@ -235,12 +241,14 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
                     f'Excluded files to sync to cluster based on '
                     f'{constants.SKY_IGNORE_FILE}.'
                     f'{colorama.Style.RESET_ALL}')
-        return get_excluded_files_from_skyignore(src_dir_path)
-    logger.debug(f' {colorama.Style.DIM}'
-                 f'Excluded files to sync to cluster based on '
-                 f'{constants.GIT_IGNORE_FILE}.'
-                 f'{colorama.Style.RESET_ALL}')
-    return get_excluded_files_from_gitignore(src_dir_path)
+        excluded_paths = get_excluded_files_from_skyignore(src_dir_path)
+    else:
+        logger.debug(f' {colorama.Style.DIM}'
+                     f'Excluded files to sync to cluster based on '
+                     f'{constants.GIT_IGNORE_FILE}.'
+                     f'{colorama.Style.RESET_ALL}')
+        excluded_paths = get_excluded_files_from_gitignore(src_dir_path)
+    return excluded_paths
 
 
 def zip_files_and_folders(items: List[str],
@@ -277,7 +285,8 @@
                    zipf.write(item)
                elif os.path.isdir(item):
                    excluded_files = set([
-                        os.path.join(item, f) for f in get_excluded_files(item)
+                        os.path.join(item, f.rstrip('/'))
+                        for f in get_excluded_files(item)
                    ])
                    for root, dirs, files in os.walk(item, followlinks=False):
                        # Modify dirs in-place to control os.walk()'s traversal
sky/execution.py CHANGED
@@ -159,9 +159,9 @@ def _execute(
    no_setup: bool; whether to skip setup commands or not when (re-)launching.
    clone_disk_from: Optional[str]; if set, clone the disk from the specified
      cluster.
-    skip_unecessary_provisioning: bool; if True, compare the calculated
+    skip_unnecessary_provisioning: bool; if True, compare the calculated
      cluster config to the current cluster's config. If they match, shortcut
-      provisioning even if we have Stage.PROVISION.
+      provisioning and setup, even if we have Stage.PROVISION and Stage.SETUP.
 
  Returns:
    job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -303,12 +303,13 @@
            task.sync_storage_mounts()
 
    try:
+        provisioning_skipped = False
        if Stage.PROVISION in stages:
            assert handle is None or skip_unnecessary_provisioning, (
                'Provisioning requested, but handle is already set. PROVISION '
                'should be excluded from stages or '
                'skip_unecessary_provisioning should be set. ')
-            handle = backend.provision(
+            (handle, provisioning_skipped) = backend.provision(
                task,
                task.best_resources,
                dryrun=dryrun,
@@ -341,7 +342,11 @@
        if no_setup:
            logger.info('Setup commands skipped.')
        elif Stage.SETUP in stages and not dryrun:
-            backend.setup(handle, task, detach_setup=detach_setup)
+            if skip_unnecessary_provisioning and provisioning_skipped:
+                logger.debug('Unnecessary provisioning was skipped, so '
+                             'skipping setup as well.')
+            else:
+                backend.setup(handle, task, detach_setup=detach_setup)
 
        if Stage.PRE_EXEC in stages and not dryrun:
            if idle_minutes_to_autostop is not None:
@@ -523,6 +528,8 @@ def launch(
            Stage.PROVISION,
            Stage.SYNC_WORKDIR,
            Stage.SYNC_FILE_MOUNTS,
+            # Setup will be skipped if provisioning was skipped.
+            Stage.SETUP,
            Stage.PRE_EXEC,
            Stage.EXEC,
            Stage.DOWN,
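The change threads a new flag through the launch path: `backend.provision` now returns a `(handle, provisioning_skipped)` tuple, and setup is skipped when provisioning was skipped. A toy, self-contained sketch of that control flow (the FakeBackend stand-in is illustrative only; the real objects come from sky.backends and sky.task):

# Illustrative only: how launch can short-circuit setup when the cluster
# config was unchanged and provisioning was therefore skipped.
class FakeBackend:
    def provision(self, task):
        # Pretend the calculated config matched the running cluster.
        return 'handle', True  # (handle, provisioning_skipped)

    def setup(self, handle, task):
        print('running setup on', handle)


backend, task = FakeBackend(), object()
skip_unnecessary_provisioning = True

handle, provisioning_skipped = backend.provision(task)
if skip_unnecessary_provisioning and provisioning_skipped:
    print('Provisioning skipped; skipping setup as well.')
else:
    backend.setup(handle, task)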
sky/jobs/client/sdk.py CHANGED
@@ -82,6 +82,7 @@ def launch(
        f'{server_common.get_server_url()}/jobs/launch',
        json=json.loads(body.model_dump_json()),
        timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
    )
    return server_common.get_request_id(response)
 
@@ -138,6 +139,7 @@ def queue(refresh: bool,
        f'{server_common.get_server_url()}/jobs/queue',
        json=json.loads(body.model_dump_json()),
        timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
    )
    return server_common.get_request_id(response=response)
 
@@ -177,6 +179,7 @@ def cancel(
        f'{server_common.get_server_url()}/jobs/cancel',
        json=json.loads(body.model_dump_json()),
        timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
    )
    return server_common.get_request_id(response=response)
 
@@ -224,6 +227,7 @@ def tail_logs(name: Optional[str] = None,
        json=json.loads(body.model_dump_json()),
        stream=True,
        timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
    )
    request_id = server_common.get_request_id(response)
    return sdk.stream_response(request_id, response, output_stream)
@@ -267,6 +271,7 @@ def download_logs(
        f'{server_common.get_server_url()}/jobs/download_logs',
        json=json.loads(body.model_dump_json()),
        timeout=(5, None),
+        cookies=server_common.get_api_cookie_jar(),
    )
    job_id_remote_path_dict = sdk.stream_and_get(
        server_common.get_request_id(response))
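The repeated change above attaches a cookie jar to every jobs SDK request so that an authenticated API-server session carries over. A minimal sketch of the underlying requests pattern; the cookie file path, payload, and server URL are made up for illustration, and the real jar comes from `server_common.get_api_cookie_jar()`:

from http.cookiejar import MozillaCookieJar

import requests

# Hypothetical cookie file persisted by an earlier login to the API server.
cookie_jar = MozillaCookieJar('cookies.txt')  # path is illustrative
# cookie_jar.load()  # uncomment once the file exists

# Requires a running SkyPilot API server; the port is an assumption.
response = requests.post(
    'http://localhost:46580/jobs/queue',
    json={'refresh': False},
    timeout=(5, None),
    cookies=cookie_jar,  # same idea as cookies=server_common.get_api_cookie_jar()
)
print(response.status_code)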
sky/jobs/server/server.py CHANGED
@@ -161,7 +161,11 @@ async def dashboard(request: fastapi.Request,
            response = await client.request('GET',
                                            dashboard_url,
                                            timeout=5)
-            break  # Connection successful, proceed with the request
+            if response.is_success:
+                break  # Connection successful, proceed with the request
+            # Raise an HTTPException here which will be caught by the
+            # following except block to retry with new connection
+            response.raise_for_status()
        except Exception as e:  # pylint: disable=broad-except
            # We catch all exceptions to gracefully handle unknown
            # errors and retry or raise an HTTPException to the client.
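A self-contained sketch of the retry pattern introduced above, using httpx directly (the URL and attempt count are placeholders; the real handler proxies the jobs dashboard and converts persistent failures into an HTTPException):

import httpx


async def fetch_with_retries(url: str, attempts: int = 3) -> httpx.Response:
    last_exc: Exception = RuntimeError('no attempts made')
    for _ in range(attempts):
        try:
            async with httpx.AsyncClient() as client:
                response = await client.request('GET', url, timeout=5)
            if response.is_success:
                return response  # Connection successful.
            # Non-2xx status: raise so the except block retries with a
            # fresh connection, mirroring the server.py change.
            response.raise_for_status()
        except Exception as exc:  # pylint: disable=broad-except
            last_exc = exc
    raise last_exc

# Run with: asyncio.run(fetch_with_retries('http://localhost:8000/'))  # URL assumed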
sky/optimizer.py CHANGED
@@ -6,6 +6,7 @@ import typing
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
 
 import colorama
+import numpy as np
 import prettytable
 
 from sky import check as sky_check
@@ -28,12 +29,10 @@ from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
    import networkx as nx
-    import numpy as np
 
    from sky import dag as dag_lib
 else:
    nx = adaptors_common.LazyImport('networkx')
-    np = adaptors_common.LazyImport('numpy')
 
 logger = sky_logging.init_logger(__name__)