skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/clouds/utils/oci_utils.py CHANGED
@@ -6,6 +6,12 @@ History:
    configuration.
  - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
    SERVICE_PORT_RULE_TAG
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Set the default image
+   from ubuntu 20.04 to ubuntu 22.04, including:
+   - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204
+   - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing
+   VCN for SkyServe.
 """
 import os

@@ -105,8 +111,15 @@ class OCIConfig:
             ('oci', region, 'compartment_ocid'), default_compartment_ocid)
         return compartment

+    @classmethod
+    def get_vcn_ocid(cls, region):
+        # Will reuse the regional VCN if specified.
+        vcn = skypilot_config.get_nested(('oci', region, 'vcn_ocid'), None)
+        return vcn
+
     @classmethod
     def get_vcn_subnet(cls, region):
+        # Will reuse the subnet if specified.
         vcn = skypilot_config.get_nested(('oci', region, 'vcn_subnet'), None)
         return vcn

@@ -117,7 +130,7 @@ class OCIConfig:
         # the sky's user-config file (if not specified, use the hardcode one at
         # last)
         return skypilot_config.get_nested(('oci', 'default', 'image_tag_gpu'),
-                                          'skypilot:gpu-ubuntu-2004')
+                                          'skypilot:gpu-ubuntu-2204')

     @classmethod
     def get_default_image_tag(cls) -> str:
@@ -125,7 +138,7 @@ class OCIConfig:
         # set the default image tag in the sky's user-config file. (if not
         # specified, use the hardcode one at last)
         return skypilot_config.get_nested(
-            ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2004')
+            ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2204')

     @classmethod
     def get_sky_user_config_file(cls) -> str:
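For illustration, a minimal sketch (not part of the diff) of how the new OCI settings are read, assuming SkyPilot is installed; the region value is hypothetical, and `vcn_ocid` is expected under `oci.<region>.vcn_ocid` in the user's SkyPilot config:

from sky.clouds.utils import oci_utils

region = 'us-sanjose-1'  # hypothetical region
# Returns the configured VCN OCID to reuse, or None to provision as before.
print(oci_utils.OCIConfig.get_vcn_ocid(region))
# The default CPU image tag now resolves to the Ubuntu 22.04 catalog entry
# unless overridden via `oci.default.image_tag_general`.
print(oci_utils.OCIConfig.get_default_image_tag())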
sky/core.py CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
-    else:
-        storage_object = data.Storage(name=handle.storage_name,
-                                      source=handle.source,
-                                      sync_on_reconstruction=False)
-        storage_object.delete()
+
+    assert handle.storage_name == name, (
+        f'In global_user_state, storage name {name!r} does not match '
+        f'handle.storage_name {handle.storage_name!r}')
+    storage_object = data.Storage(name=handle.storage_name,
+                                  source=handle.source,
+                                  sync_on_reconstruction=False)
+    storage_object.delete()
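For illustration, a minimal usage sketch (not part of the diff); the entry point is unchanged by this refactor, and the bucket name below is hypothetical:

from sky import core

# Deletes the managed storage object; raises ValueError if the name is unknown.
core.storage_delete('my-skypilot-bucket')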
sky/data/data_transfer.py CHANGED
@@ -200,3 +200,40 @@ def _add_bucket_iam_member(bucket_name: str, role: str, member: str) -> None:
     bucket.set_iam_policy(policy)

     logger.debug(f'Added {member} with role {role} to {bucket_name}.')
+
+
+def s3_to_oci(s3_bucket_name: str, oci_bucket_name: str) -> None:
+    """Creates a one-time transfer from Amazon S3 to OCI Object Storage.
+    Args:
+      s3_bucket_name: str; Name of the Amazon S3 Bucket
+      oci_bucket_name: str; Name of the OCI Bucket
+    """
+    # TODO(HysunHe): Implement sync with other clouds (s3, gs)
+    raise NotImplementedError('Moving data directly from S3 to OCI bucket '
+                              'is currently not supported. Please specify '
+                              'a local source for the storage object.')
+
+
+def gcs_to_oci(gs_bucket_name: str, oci_bucket_name: str) -> None:
+    """Creates a one-time transfer from Google Cloud Storage to
+    OCI Object Storage.
+    Args:
+      gs_bucket_name: str; Name of the Google Cloud Storage Bucket
+      oci_bucket_name: str; Name of the OCI Bucket
+    """
+    # TODO(HysunHe): Implement sync with other clouds (s3, gs)
+    raise NotImplementedError('Moving data directly from GCS to OCI bucket '
+                              'is currently not supported. Please specify '
+                              'a local source for the storage object.')
+
+
+def r2_to_oci(r2_bucket_name: str, oci_bucket_name: str) -> None:
+    """Creates a one-time transfer from Cloudflare R2 to OCI Bucket.
+    Args:
+      r2_bucket_name: str; Name of the Cloudflare R2 Bucket
+      oci_bucket_name: str; Name of the OCI Bucket
+    """
+    raise NotImplementedError(
+        'Moving data directly from Cloudflare R2 to OCI '
+        'bucket is currently not supported. Please specify '
+        'a local source for the storage object.')
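For context, a small sketch (not part of the diff) of the current behavior of these placeholder helpers, with hypothetical bucket names:

from sky.data import data_transfer

try:
    data_transfer.s3_to_oci('my-s3-bucket', 'my-oci-bucket')
except NotImplementedError as e:
    # Cloud-to-OCI sync is not implemented yet; a local source must be used.
    print(e)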
sky/data/data_utils.py CHANGED
@@ -523,10 +523,14 @@ def get_gsutil_command() -> Tuple[str, str]:

 def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
                    log_path: str):
-    returncode, stdout, stderr = log_lib.run_with_log(command,
-                                                      log_path,
-                                                      shell=True,
-                                                      require_outputs=True)
+    returncode, stdout, stderr = log_lib.run_with_log(
+        command,
+        log_path,
+        shell=True,
+        require_outputs=True,
+        # We need to use bash as some of the cloud commands uses bash syntax,
+        # such as [[ ... ]]
+        executable='/bin/bash')
     if access_denied_message in stderr:
         with ux_utils.print_exception_no_traceback():
             raise PermissionError('Failed to upload files to '
@@ -730,3 +734,14 @@ class Rclone():
                 lines_to_keep.append(line)

         return lines_to_keep
+
+
+def split_oci_path(oci_path: str) -> Tuple[str, str]:
+    """Splits OCI Path into Bucket name and Relative Path to Bucket
+    Args:
+      oci_path: str; OCI Path, e.g. oci://imagenet/train/
+    """
+    path_parts = oci_path.replace('oci://', '').split('/')
+    bucket = path_parts.pop(0)
+    key = '/'.join(path_parts)
+    return bucket, key
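For illustration, a quick check (not part of the diff) of the new helper's behavior on the docstring's example path:

from sky.data import data_utils

bucket, key = data_utils.split_oci_path('oci://imagenet/train/')
print(bucket)  # imagenet
print(key)     # train/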
sky/data/mounting_utils.py CHANGED
@@ -19,6 +19,7 @@ BLOBFUSE2_VERSION = '2.2.0'
 _BLOBFUSE_CACHE_ROOT_DIR = '~/.sky/blobfuse2_cache'
 _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
                        '{storage_account_name}_{container_name}')
+RCLONE_VERSION = 'v1.68.2'


 def get_s3_mount_install_cmd() -> str:
@@ -30,12 +31,19 @@ def get_s3_mount_install_cmd() -> str:
     return install_cmd


-def get_s3_mount_cmd(bucket_name: str, mount_path: str) -> str:
+# pylint: disable=invalid-name
+def get_s3_mount_cmd(bucket_name: str,
+                     mount_path: str,
+                     _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to mount an S3 bucket using goofys."""
+    if _bucket_sub_path is None:
+        _bucket_sub_path = ''
+    else:
+        _bucket_sub_path = f':{_bucket_sub_path}'
     mount_cmd = ('goofys -o allow_other '
                  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
                  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
-                 f'{bucket_name} {mount_path}')
+                 f'{bucket_name}{_bucket_sub_path} {mount_path}')
     return mount_cmd


@@ -49,15 +57,20 @@ def get_gcs_mount_install_cmd() -> str:
     return install_cmd


-def get_gcs_mount_cmd(bucket_name: str, mount_path: str) -> str:
+# pylint: disable=invalid-name
+def get_gcs_mount_cmd(bucket_name: str,
+                      mount_path: str,
+                      _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to mount a GCS bucket using gcsfuse."""
-
+    bucket_sub_path_arg = f'--only-dir {_bucket_sub_path} '\
+        if _bucket_sub_path else ''
     mount_cmd = ('gcsfuse -o allow_other '
                  '--implicit-dirs '
                  f'--stat-cache-capacity {_STAT_CACHE_CAPACITY} '
                  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
                  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
                  f'--rename-dir-limit {_RENAME_DIR_LIMIT} '
+                 f'{bucket_sub_path_arg}'
                  f'{bucket_name} {mount_path}')
     return mount_cmd

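For illustration, a minimal sketch (not part of the diff) of how the new optional `_bucket_sub_path` argument changes the generated goofys command; the bucket name, mount path, and sub path below are hypothetical:

from sky.data import mounting_utils

print(mounting_utils.get_s3_mount_cmd('my-bucket', '/data'))
# Mounts only the given prefix, since goofys accepts a <bucket>:<prefix> argument.
print(mounting_utils.get_s3_mount_cmd('my-bucket', '/data',
                                      _bucket_sub_path='datasets/v1'))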
@@ -78,10 +91,12 @@ def get_az_mount_install_cmd() -> str:
     return install_cmd


+# pylint: disable=invalid-name
 def get_az_mount_cmd(container_name: str,
                      storage_account_name: str,
                      mount_path: str,
-                     storage_account_key: Optional[str] = None) -> str:
+                     storage_account_key: Optional[str] = None,
+                     _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to mount an AZ Container using blobfuse2.

     Args:
@@ -90,6 +105,7 @@ def get_az_mount_cmd(container_name: str,
            belongs to.
         mount_path: Path where the container will be mounting.
         storage_account_key: Access key for the given storage account.
+        _bucket_sub_path: Sub path of the mounting container.

     Returns:
         str: Command used to mount AZ container with blobfuse2.
@@ -106,25 +122,44 @@ def get_az_mount_cmd(container_name: str,
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,
         container_name=container_name)
+    # The line below ensures the cache directory is new before mounting to
+    # avoid "config error in file_cache [temp directory not empty]" error, which
+    # can occur after stopping and starting the same cluster on Azure.
+    # This helps ensure a clean state for blobfuse2 operations.
+    remote_boot_time_cmd = 'date +%s -d "$(uptime -s)"'
+    if _bucket_sub_path is None:
+        bucket_sub_path_arg = ''
+    else:
+        bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
+    # TODO(zpoint): clear old cache that has been created in the previous boot.
     mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
                  f'{key_env_var} '
                  f'blobfuse2 {mount_path} --allow-other --no-symlinks '
                  '-o umask=022 -o default_permissions '
-                 f'--tmp-path {cache_path} '
+                 f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
+                 f'{bucket_sub_path_arg}'
                  f'--container-name {container_name}')
     return mount_cmd


-def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str,
-                     endpoint_url: str, bucket_name: str,
-                     mount_path: str) -> str:
+# pylint: disable=invalid-name
+def get_r2_mount_cmd(r2_credentials_path: str,
+                     r2_profile_name: str,
+                     endpoint_url: str,
+                     bucket_name: str,
+                     mount_path: str,
+                     _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to install R2 mount utility goofys."""
+    if _bucket_sub_path is None:
+        _bucket_sub_path = ''
+    else:
+        _bucket_sub_path = f':{_bucket_sub_path}'
     mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
                  f'AWS_PROFILE={r2_profile_name} goofys -o allow_other '
                  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
                  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
                  f'--endpoint {endpoint_url} '
-                 f'{bucket_name} {mount_path}')
+                 f'{bucket_name}{_bucket_sub_path} {mount_path}')
     return mount_cmd


@@ -136,9 +171,12 @@ def get_cos_mount_install_cmd() -> str:
     return install_cmd


-def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str,
-                      bucket_rclone_profile: str, bucket_name: str,
-                      mount_path: str) -> str:
+def get_cos_mount_cmd(rclone_config_data: str,
+                      rclone_config_path: str,
+                      bucket_rclone_profile: str,
+                      bucket_name: str,
+                      mount_path: str,
+                      _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to mount an IBM COS bucket using rclone."""
     # creates a fusermount soft link on older (<22) Ubuntu systems for
     # rclone's mount utility.
@@ -150,14 +188,60 @@ def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str,
                                 'mkdir -p ~/.config/rclone/ && '
                                 f'echo "{rclone_config_data}" >> '
                                 f'{rclone_config_path}')
+    if _bucket_sub_path is None:
+        sub_path_arg = f'{bucket_name}/{_bucket_sub_path}'
+    else:
+        sub_path_arg = f'/{bucket_name}'
     # --daemon will keep the mounting process running in the background.
     mount_cmd = (f'{configure_rclone_profile} && '
                  'rclone mount '
-                 f'{bucket_rclone_profile}:{bucket_name} {mount_path} '
+                 f'{bucket_rclone_profile}:{sub_path_arg} {mount_path} '
                  '--daemon')
     return mount_cmd


+def get_rclone_install_cmd() -> str:
+    """ RClone installation for both apt-get and rpm.
+    This would be common command.
+    """
+    # pylint: disable=line-too-long
+    install_cmd = (
+        f'(which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || (cd ~ > /dev/null'
+        f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-amd64.deb'
+        f' && sudo dpkg -i rclone-{RCLONE_VERSION}-linux-amd64.deb'
+        f' && rm -f rclone-{RCLONE_VERSION}-linux-amd64.deb)))'
+        f' || (which rclone > /dev/null || (cd ~ > /dev/null'
+        f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-amd64.rpm'
+        f' && sudo yum --nogpgcheck install rclone-{RCLONE_VERSION}-linux-amd64.rpm -y'
+        f' && rm -f rclone-{RCLONE_VERSION}-linux-amd64.rpm))')
+    return install_cmd
+
+
+def get_oci_mount_cmd(mount_path: str, store_name: str, region: str,
+                      namespace: str, compartment: str, config_file: str,
+                      config_profile: str) -> str:
+    """ OCI specific RClone mount command for oci object storage. """
+    # pylint: disable=line-too-long
+    mount_cmd = (
+        f'sudo chown -R `whoami` {mount_path}'
+        f' && rclone config create oos_{store_name} oracleobjectstorage'
+        f' provider user_principal_auth namespace {namespace}'
+        f' compartment {compartment} region {region}'
+        f' oci-config-file {config_file}'
+        f' oci-config-profile {config_profile}'
+        f' && sed -i "s/oci-config-file/config_file/g;'
+        f' s/oci-config-profile/config_profile/g" ~/.config/rclone/rclone.conf'
+        f' && ([ ! -f /bin/fusermount3 ] && sudo ln -s /bin/fusermount /bin/fusermount3 || true)'
+        f' && (grep -q {mount_path} /proc/mounts || rclone mount oos_{store_name}:{store_name} {mount_path} --daemon --allow-non-empty)'
+    )
+    return mount_cmd
+
+
+def get_rclone_version_check_cmd() -> str:
+    """ RClone version check. This would be common command. """
+    return f'rclone --version | grep -q {RCLONE_VERSION}'
+
+
 def _get_mount_binary(mount_cmd: str) -> str:
     """Returns mounting binary in string given as the mount command.

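For illustration, a sketch (not part of the diff) of composing the new rclone-based OCI mount helpers; every value below (bucket, region, namespace, compartment, config paths) is hypothetical:

from sky.data import mounting_utils

install = mounting_utils.get_rclone_install_cmd()
check = mounting_utils.get_rclone_version_check_cmd()
mount = mounting_utils.get_oci_mount_cmd(mount_path='/mnt/my-bucket',
                                         store_name='my-bucket',
                                         region='us-sanjose-1',
                                         namespace='mytenancy',
                                         compartment='ocid1.compartment.oc1..aaaa',
                                         config_file='~/.oci/config',
                                         config_profile='DEFAULT')
# Install rclone only if the pinned version is missing, then mount.
print(f'({check}) || ({install})')
print(mount)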
@@ -209,7 +293,7 @@ def get_mounting_script(
     script = textwrap.dedent(f"""
         #!/usr/bin/env bash
         set -e
-
+
        {command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}

        MOUNT_PATH={mount_path}