skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/provision/oci/query_utils.py CHANGED
@@ -7,6 +7,8 @@ History:
7
7
  find_compartment: allow search subtree when find a compartment.
8
8
  - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
9
9
  Add/remove security rules: create_nsg_rules & remove_nsg
10
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing
11
+ VCN for SkyServe.
10
12
  """
11
13
  from datetime import datetime
12
14
  import functools
@@ -17,7 +19,6 @@ import traceback
17
19
  import typing
18
20
  from typing import List, Optional, Tuple
19
21
 
20
- from sky import exceptions
21
22
  from sky import sky_logging
22
23
  from sky.adaptors import common as adaptors_common
23
24
  from sky.adaptors import oci as oci_adaptor
@@ -496,23 +497,25 @@ class QueryHelper:
496
497
 
497
498
  compartment = cls.find_compartment(region)
498
499
 
499
- list_vcns_resp = net_client.list_vcns(
500
- compartment_id=compartment,
501
- display_name=oci_utils.oci_config.VCN_NAME,
502
- lifecycle_state='AVAILABLE',
503
- )
500
+ vcn_id = oci_utils.oci_config.get_vcn_ocid(region)
501
+ if vcn_id is None:
502
+ list_vcns_resp = net_client.list_vcns(
503
+ compartment_id=compartment,
504
+ display_name=oci_utils.oci_config.VCN_NAME,
505
+ lifecycle_state='AVAILABLE',
506
+ )
504
507
 
505
- if not list_vcns_resp:
506
- raise exceptions.ResourcesUnavailableError(
507
- 'The VCN is not available')
508
+ # Get the primary vnic. The vnic might be an empty list for the
509
+ # corner case when the cluster was exited during provision.
510
+ if not list_vcns_resp.data:
511
+ return None
508
512
 
509
- # Get the primary vnic.
510
- assert len(list_vcns_resp.data) > 0
511
- vcn = list_vcns_resp.data[0]
513
+ vcn = list_vcns_resp.data[0]
514
+ vcn_id = vcn.id
512
515
 
513
516
  list_nsg_resp = net_client.list_network_security_groups(
514
517
  compartment_id=compartment,
515
- vcn_id=vcn.id,
518
+ vcn_id=vcn_id,
516
519
  limit=1,
517
520
  display_name=nsg_name,
518
521
  )
@@ -529,7 +532,7 @@ class QueryHelper:
529
532
  create_network_security_group_details=oci_adaptor.oci.core.models.
530
533
  CreateNetworkSecurityGroupDetails(
531
534
  compartment_id=compartment,
532
- vcn_id=vcn.id,
535
+ vcn_id=vcn_id,
533
536
  display_name=nsg_name,
534
537
  ))
535
538
  get_nsg_resp = net_client.get_network_security_group(
sky/provision/provisioner.py CHANGED
@@ -415,7 +415,6 @@ def _post_provision_setup(
415
415
  f'{json.dumps(dataclasses.asdict(provision_record), indent=2)}\n'
416
416
  'Cluster info:\n'
417
417
  f'{json.dumps(dataclasses.asdict(cluster_info), indent=2)}')
418
-
419
418
  head_instance = cluster_info.get_head_instance()
420
419
  if head_instance is None:
421
420
  e = RuntimeError(f'Provision failed for cluster {cluster_name!r}. '
sky/provision/runpod/instance.py CHANGED
@@ -83,7 +83,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
83
83
  node_type = 'head' if head_instance_id is None else 'worker'
84
84
  try:
85
85
  instance_id = utils.launch(
86
- name=f'{cluster_name_on_cloud}-{node_type}',
86
+ cluster_name=cluster_name_on_cloud,
87
+ node_type=node_type,
87
88
  instance_type=config.node_config['InstanceType'],
88
89
  region=region,
89
90
  disk_size=config.node_config['DiskSize'],
@@ -92,6 +93,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
92
93
  public_key=config.node_config['PublicKey'],
93
94
  preemptible=config.node_config['Preemptible'],
94
95
  bid_per_gpu=config.node_config['BidPerGPU'],
96
+ docker_login_config=config.provider_config.get(
97
+ 'docker_login_config'),
95
98
  )
96
99
  except Exception as e: # pylint: disable=broad-except
97
100
  logger.warning(f'run_instances error: {e}')
@@ -145,6 +148,8 @@ def terminate_instances(
145
148
  """See sky/provision/__init__.py"""
146
149
  del provider_config # unused
147
150
  instances = _filter_instances(cluster_name_on_cloud, None)
151
+ template_name, registry_auth_id = utils.get_registry_auth_resources(
152
+ cluster_name_on_cloud)
148
153
  for inst_id, inst in instances.items():
149
154
  logger.debug(f'Terminating instance {inst_id}: {inst}')
150
155
  if worker_only and inst['name'].endswith('-head'):
@@ -157,6 +162,10 @@ def terminate_instances(
157
162
  f'Failed to terminate instance {inst_id}: '
158
163
  f'{common_utils.format_exception(e, use_bracket=False)}'
159
164
  ) from e
165
+ if template_name is not None:
166
+ utils.delete_pod_template(template_name)
167
+ if registry_auth_id is not None:
168
+ utils.delete_register_auth(registry_auth_id)
160
169
 
161
170
 
162
171
  def get_cluster_info(
sky/provision/runpod/utils.py CHANGED
@@ -2,10 +2,11 @@
2
2
 
3
3
  import base64
4
4
  import time
5
- from typing import Any, Dict, List, Optional
5
+ from typing import Any, Dict, List, Optional, Tuple
6
6
 
7
7
  from sky import sky_logging
8
8
  from sky.adaptors import runpod
9
+ from sky.provision import docker_utils
9
10
  import sky.provision.runpod.api.commands as runpod_commands
10
11
  from sky.skylet import constants
11
12
  from sky.utils import common_utils
@@ -47,6 +48,11 @@ GPU_NAME_MAP = {
47
48
  }
48
49
 
49
50
 
51
+ def _construct_docker_login_template_name(cluster_name: str) -> str:
52
+ """Constructs the registry auth template name."""
53
+ return f'{cluster_name}-docker-login-template'
54
+
55
+
50
56
  def retry(func):
51
57
  """Decorator to retry a function."""
52
58
 
@@ -66,9 +72,83 @@ def retry(func):
66
72
  return wrapper
67
73
 
68
74
 
75
+ # Adapted from runpod.api.queries.pods.py::QUERY_POD.
76
+ # Adding containerRegistryAuthId to the query.
77
+ _QUERY_POD = """
78
+ query myPods {
79
+ myself {
80
+ pods {
81
+ id
82
+ containerDiskInGb
83
+ containerRegistryAuthId
84
+ costPerHr
85
+ desiredStatus
86
+ dockerArgs
87
+ dockerId
88
+ env
89
+ gpuCount
90
+ imageName
91
+ lastStatusChange
92
+ machineId
93
+ memoryInGb
94
+ name
95
+ podType
96
+ port
97
+ ports
98
+ uptimeSeconds
99
+ vcpuCount
100
+ volumeInGb
101
+ volumeMountPath
102
+ runtime {
103
+ ports{
104
+ ip
105
+ isIpPublic
106
+ privatePort
107
+ publicPort
108
+ type
109
+ }
110
+ }
111
+ machine {
112
+ gpuDisplayName
113
+ }
114
+ }
115
+ }
116
+ }
117
+ """
118
+
119
+
120
+ def _sky_get_pods() -> dict:
121
+ """List all pods with extra registry auth information.
122
+
123
+ Adapted from runpod.get_pods() to include containerRegistryAuthId.
124
+ """
125
+ raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD)
126
+ cleaned_return = raw_return['data']['myself']['pods']
127
+ return cleaned_return
128
+
129
+
130
+ _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """
131
+ query myself {
132
+ myself {
133
+ podTemplates {
134
+ name
135
+ containerRegistryAuthId
136
+ }
137
+ }
138
+ }
139
+ """
140
+
141
+
142
+ def _list_pod_templates_with_container_registry() -> dict:
143
+ """List all pod templates."""
144
+ raw_return = runpod.runpod.api.graphql.run_graphql_query(
145
+ _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH)
146
+ return raw_return['data']['myself']['podTemplates']
147
+
148
+
69
149
  def list_instances() -> Dict[str, Dict[str, Any]]:
70
150
  """Lists instances associated with API key."""
71
- instances = runpod.runpod.get_pods()
151
+ instances = _sky_get_pods()
72
152
 
73
153
  instance_dict: Dict[str, Dict[str, Any]] = {}
74
154
  for instance in instances:
@@ -100,14 +180,75 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
100
180
  return instance_dict
101
181
 
102
182
 
103
- def launch(name: str, instance_type: str, region: str, disk_size: int,
104
- image_name: str, ports: Optional[List[int]], public_key: str,
105
- preemptible: Optional[bool], bid_per_gpu: float) -> str:
183
+ def delete_pod_template(template_name: str) -> None:
184
+ """Deletes a pod template."""
185
+ try:
186
+ runpod.runpod.api.graphql.run_graphql_query(
187
+ f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
188
+ except runpod.runpod.error.QueryError as e:
189
+ logger.warning(f'Failed to delete template {template_name}: {e}'
190
+ 'Please delete it manually.')
191
+
192
+
193
+ def delete_register_auth(registry_auth_id: str) -> None:
194
+ """Deletes a registry auth."""
195
+ try:
196
+ runpod.runpod.delete_container_registry_auth(registry_auth_id)
197
+ except runpod.runpod.error.QueryError as e:
198
+ logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}'
199
+ 'Please delete it manually.')
200
+
201
+
202
+ def _create_template_for_docker_login(
203
+ cluster_name: str,
204
+ image_name: str,
205
+ docker_login_config: Optional[Dict[str, str]],
206
+ ) -> Tuple[str, Optional[str]]:
207
+ """Creates a template for the given image with the docker login config.
208
+
209
+ Returns:
210
+ formatted_image_name: The formatted image name.
211
+ template_id: The template ID. None for no docker login config.
212
+ """
213
+ if docker_login_config is None:
214
+ return image_name, None
215
+ login_config = docker_utils.DockerLoginConfig(**docker_login_config)
216
+ container_registry_auth_name = f'{cluster_name}-registry-auth'
217
+ container_template_name = _construct_docker_login_template_name(
218
+ cluster_name)
219
+ # The `name` argument is only for display purpose and the registry server
220
+ # will be splitted from the docker image name (Tested with AWS ECR).
221
+ # Here we only need the username and password to create the registry auth.
222
+ # TODO(tian): Now we create a template and a registry auth for each cluster.
223
+ # Consider create one for each server and reuse them. Challenges including
224
+ # calculate the reference count and delete them when no longer needed.
225
+ create_auth_resp = runpod.runpod.create_container_registry_auth(
226
+ name=container_registry_auth_name,
227
+ username=login_config.username,
228
+ password=login_config.password,
229
+ )
230
+ registry_auth_id = create_auth_resp['id']
231
+ create_template_resp = runpod.runpod.create_template(
232
+ name=container_template_name,
233
+ image_name=None,
234
+ registry_auth_id=registry_auth_id,
235
+ )
236
+ return login_config.format_image(image_name), create_template_resp['id']
237
+
238
+
239
+ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
240
+ disk_size: int, image_name: str, ports: Optional[List[int]],
241
+ public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
242
+ docker_login_config: Optional[Dict[str, str]]) -> str:
106
243
  """Launches an instance with the given parameters.
107
244
 
108
245
  Converts the instance_type to the RunPod GPU name, finds the specs for the
109
246
  GPU, and launches the instance.
247
+
248
+ Returns:
249
+ instance_id: The instance ID.
110
250
  """
251
+ name = f'{cluster_name}-{node_type}'
111
252
  gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
112
253
  gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
113
254
  cloud_type = instance_type.split('_')[2]
@@ -139,21 +280,24 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
139
280
  # Use base64 to deal with the tricky quoting issues caused by runpod API.
140
281
  encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')
141
282
 
283
+ docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
284
+ f'bash init.sh\'')
285
+
142
286
  # Port 8081 is occupied for nginx in the base image.
143
287
  custom_ports_str = ''
144
288
  if ports is not None:
145
289
  custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])
290
+ ports_str = (f'22/tcp,'
291
+ f'{custom_ports_str}'
292
+ f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
293
+ f'{constants.SKY_REMOTE_RAY_PORT}/http')
146
294
 
147
- docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
148
- f'bash init.sh\'')
149
- ports = (f'22/tcp,'
150
- f'{custom_ports_str}'
151
- f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
152
- f'{constants.SKY_REMOTE_RAY_PORT}/http')
295
+ image_name_formatted, template_id = _create_template_for_docker_login(
296
+ cluster_name, image_name, docker_login_config)
153
297
 
154
298
  params = {
155
299
  'name': name,
156
- 'image_name': image_name,
300
+ 'image_name': image_name_formatted,
157
301
  'gpu_type_id': gpu_type,
158
302
  'cloud_type': cloud_type,
159
303
  'container_disk_in_gb': disk_size,
@@ -161,9 +305,10 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
161
305
  'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
162
306
  'gpu_count': gpu_quantity,
163
307
  'country_code': region,
164
- 'ports': ports,
308
+ 'ports': ports_str,
165
309
  'support_public_ip': True,
166
310
  'docker_args': docker_args,
311
+ 'template_id': template_id,
167
312
  }
168
313
 
169
314
  if preemptible is None or not preemptible:
@@ -177,6 +322,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
177
322
  return new_instance['id']
178
323
 
179
324
 
325
+ def get_registry_auth_resources(
326
+ cluster_name: str) -> Tuple[Optional[str], Optional[str]]:
327
+ """Gets the registry auth resources."""
328
+ container_registry_auth_name = _construct_docker_login_template_name(
329
+ cluster_name)
330
+ for template in _list_pod_templates_with_container_registry():
331
+ if template['name'] == container_registry_auth_name:
332
+ return container_registry_auth_name, template[
333
+ 'containerRegistryAuthId']
334
+ return None, None
335
+
336
+
180
337
  def remove(instance_id: str) -> None:
181
338
  """Terminates the given instance."""
182
339
  runpod.runpod.terminate_pod(instance_id)
sky/resources.py CHANGED
@@ -540,7 +540,7 @@ class Resources:
540
540
  if memory_gb <= 0:
541
541
  with ux_utils.print_exception_no_traceback():
542
542
  raise ValueError(
543
- f'The "cpus" field should be positive. Found: {memory!r}')
543
+ f'The "memory" field should be positive. Found: {memory!r}')
544
544
 
545
545
  def _set_accelerators(
546
546
  self,