skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/provision/oci/query_utils.py
CHANGED
@@ -7,6 +7,8 @@ History:
|
|
7
7
|
find_compartment: allow searching the subtree when finding a compartment.
|
8
8
|
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
|
9
9
|
Add/remove security rules: create_nsg_rules & remove_nsg
|
10
|
+
- Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reusing an existing
|
11
|
+
VCN for SkyServe.
|
10
12
|
"""
|
11
13
|
from datetime import datetime
|
12
14
|
import functools
|
@@ -17,7 +19,6 @@ import traceback
|
|
17
19
|
import typing
|
18
20
|
from typing import List, Optional, Tuple
|
19
21
|
|
20
|
-
from sky import exceptions
|
21
22
|
from sky import sky_logging
|
22
23
|
from sky.adaptors import common as adaptors_common
|
23
24
|
from sky.adaptors import oci as oci_adaptor
|
@@ -496,23 +497,25 @@ class QueryHelper:
|
|
496
497
|
|
497
498
|
compartment = cls.find_compartment(region)
|
498
499
|
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
500
|
+
vcn_id = oci_utils.oci_config.get_vcn_ocid(region)
|
501
|
+
if vcn_id is None:
|
502
|
+
list_vcns_resp = net_client.list_vcns(
|
503
|
+
compartment_id=compartment,
|
504
|
+
display_name=oci_utils.oci_config.VCN_NAME,
|
505
|
+
lifecycle_state='AVAILABLE',
|
506
|
+
)
|
504
507
|
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
+
# Get the VCN. The VCN list might be empty for the
|
509
|
+
# corner case when the cluster was exited during provision.
|
510
|
+
if not list_vcns_resp.data:
|
511
|
+
return None
|
508
512
|
|
509
|
-
|
510
|
-
|
511
|
-
vcn = list_vcns_resp.data[0]
|
513
|
+
vcn = list_vcns_resp.data[0]
|
514
|
+
vcn_id = vcn.id
|
512
515
|
|
513
516
|
list_nsg_resp = net_client.list_network_security_groups(
|
514
517
|
compartment_id=compartment,
|
515
|
-
vcn_id=
|
518
|
+
vcn_id=vcn_id,
|
516
519
|
limit=1,
|
517
520
|
display_name=nsg_name,
|
518
521
|
)
|
@@ -529,7 +532,7 @@ class QueryHelper:
|
|
529
532
|
create_network_security_group_details=oci_adaptor.oci.core.models.
|
530
533
|
CreateNetworkSecurityGroupDetails(
|
531
534
|
compartment_id=compartment,
|
532
|
-
vcn_id=
|
535
|
+
vcn_id=vcn_id,
|
533
536
|
display_name=nsg_name,
|
534
537
|
))
|
535
538
|
get_nsg_resp = net_client.get_network_security_group(
|
sky/provision/provisioner.py
CHANGED
@@ -415,7 +415,6 @@ def _post_provision_setup(
|
|
415
415
|
f'{json.dumps(dataclasses.asdict(provision_record), indent=2)}\n'
|
416
416
|
'Cluster info:\n'
|
417
417
|
f'{json.dumps(dataclasses.asdict(cluster_info), indent=2)}')
|
418
|
-
|
419
418
|
head_instance = cluster_info.get_head_instance()
|
420
419
|
if head_instance is None:
|
421
420
|
e = RuntimeError(f'Provision failed for cluster {cluster_name!r}. '
|
sky/provision/runpod/instance.py
CHANGED
@@ -83,7 +83,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
83
83
|
node_type = 'head' if head_instance_id is None else 'worker'
|
84
84
|
try:
|
85
85
|
instance_id = utils.launch(
|
86
|
-
|
86
|
+
cluster_name=cluster_name_on_cloud,
|
87
|
+
node_type=node_type,
|
87
88
|
instance_type=config.node_config['InstanceType'],
|
88
89
|
region=region,
|
89
90
|
disk_size=config.node_config['DiskSize'],
|
@@ -92,6 +93,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
92
93
|
public_key=config.node_config['PublicKey'],
|
93
94
|
preemptible=config.node_config['Preemptible'],
|
94
95
|
bid_per_gpu=config.node_config['BidPerGPU'],
|
96
|
+
docker_login_config=config.provider_config.get(
|
97
|
+
'docker_login_config'),
|
95
98
|
)
|
96
99
|
except Exception as e: # pylint: disable=broad-except
|
97
100
|
logger.warning(f'run_instances error: {e}')
|
@@ -145,6 +148,8 @@ def terminate_instances(
|
|
145
148
|
"""See sky/provision/__init__.py"""
|
146
149
|
del provider_config # unused
|
147
150
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
151
|
+
template_name, registry_auth_id = utils.get_registry_auth_resources(
|
152
|
+
cluster_name_on_cloud)
|
148
153
|
for inst_id, inst in instances.items():
|
149
154
|
logger.debug(f'Terminating instance {inst_id}: {inst}')
|
150
155
|
if worker_only and inst['name'].endswith('-head'):
|
@@ -157,6 +162,10 @@ def terminate_instances(
|
|
157
162
|
f'Failed to terminate instance {inst_id}: '
|
158
163
|
f'{common_utils.format_exception(e, use_bracket=False)}'
|
159
164
|
) from e
|
165
|
+
if template_name is not None:
|
166
|
+
utils.delete_pod_template(template_name)
|
167
|
+
if registry_auth_id is not None:
|
168
|
+
utils.delete_register_auth(registry_auth_id)
|
160
169
|
|
161
170
|
|
162
171
|
def get_cluster_info(
|
sky/provision/runpod/utils.py
CHANGED
@@ -2,10 +2,11 @@
|
|
2
2
|
|
3
3
|
import base64
|
4
4
|
import time
|
5
|
-
from typing import Any, Dict, List, Optional
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
6
6
|
|
7
7
|
from sky import sky_logging
|
8
8
|
from sky.adaptors import runpod
|
9
|
+
from sky.provision import docker_utils
|
9
10
|
import sky.provision.runpod.api.commands as runpod_commands
|
10
11
|
from sky.skylet import constants
|
11
12
|
from sky.utils import common_utils
|
@@ -47,6 +48,11 @@ GPU_NAME_MAP = {
|
|
47
48
|
}
|
48
49
|
|
49
50
|
|
51
|
+
def _construct_docker_login_template_name(cluster_name: str) -> str:
|
52
|
+
"""Constructs the registry auth template name."""
|
53
|
+
return f'{cluster_name}-docker-login-template'
|
54
|
+
|
55
|
+
|
50
56
|
def retry(func):
|
51
57
|
"""Decorator to retry a function."""
|
52
58
|
|
@@ -66,9 +72,83 @@ def retry(func):
|
|
66
72
|
return wrapper
|
67
73
|
|
68
74
|
|
75
|
+
# Adapted from runpod.api.queries.pods.py::QUERY_POD.
|
76
|
+
# Adding containerRegistryAuthId to the query.
|
77
|
+
_QUERY_POD = """
|
78
|
+
query myPods {
|
79
|
+
myself {
|
80
|
+
pods {
|
81
|
+
id
|
82
|
+
containerDiskInGb
|
83
|
+
containerRegistryAuthId
|
84
|
+
costPerHr
|
85
|
+
desiredStatus
|
86
|
+
dockerArgs
|
87
|
+
dockerId
|
88
|
+
env
|
89
|
+
gpuCount
|
90
|
+
imageName
|
91
|
+
lastStatusChange
|
92
|
+
machineId
|
93
|
+
memoryInGb
|
94
|
+
name
|
95
|
+
podType
|
96
|
+
port
|
97
|
+
ports
|
98
|
+
uptimeSeconds
|
99
|
+
vcpuCount
|
100
|
+
volumeInGb
|
101
|
+
volumeMountPath
|
102
|
+
runtime {
|
103
|
+
ports{
|
104
|
+
ip
|
105
|
+
isIpPublic
|
106
|
+
privatePort
|
107
|
+
publicPort
|
108
|
+
type
|
109
|
+
}
|
110
|
+
}
|
111
|
+
machine {
|
112
|
+
gpuDisplayName
|
113
|
+
}
|
114
|
+
}
|
115
|
+
}
|
116
|
+
}
|
117
|
+
"""
|
118
|
+
|
119
|
+
|
120
|
+
def _sky_get_pods() -> dict:
|
121
|
+
"""List all pods with extra registry auth information.
|
122
|
+
|
123
|
+
Adapted from runpod.get_pods() to include containerRegistryAuthId.
|
124
|
+
"""
|
125
|
+
raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD)
|
126
|
+
cleaned_return = raw_return['data']['myself']['pods']
|
127
|
+
return cleaned_return
|
128
|
+
|
129
|
+
|
130
|
+
_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """
|
131
|
+
query myself {
|
132
|
+
myself {
|
133
|
+
podTemplates {
|
134
|
+
name
|
135
|
+
containerRegistryAuthId
|
136
|
+
}
|
137
|
+
}
|
138
|
+
}
|
139
|
+
"""
|
140
|
+
|
141
|
+
|
142
|
+
def _list_pod_templates_with_container_registry() -> dict:
|
143
|
+
"""List all pod templates."""
|
144
|
+
raw_return = runpod.runpod.api.graphql.run_graphql_query(
|
145
|
+
_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH)
|
146
|
+
return raw_return['data']['myself']['podTemplates']
|
147
|
+
|
148
|
+
|
69
149
|
def list_instances() -> Dict[str, Dict[str, Any]]:
|
70
150
|
"""Lists instances associated with API key."""
|
71
|
-
instances =
|
151
|
+
instances = _sky_get_pods()
|
72
152
|
|
73
153
|
instance_dict: Dict[str, Dict[str, Any]] = {}
|
74
154
|
for instance in instances:
|
@@ -100,14 +180,75 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
|
|
100
180
|
return instance_dict
|
101
181
|
|
102
182
|
|
103
|
-
def
|
104
|
-
|
105
|
-
|
183
|
+
def delete_pod_template(template_name: str) -> None:
|
184
|
+
"""Deletes a pod template."""
|
185
|
+
try:
|
186
|
+
runpod.runpod.api.graphql.run_graphql_query(
|
187
|
+
f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
|
188
|
+
except runpod.runpod.error.QueryError as e:
|
189
|
+
logger.warning(f'Failed to delete template {template_name}: {e}'
|
190
|
+
'Please delete it manually.')
|
191
|
+
|
192
|
+
|
193
|
+
def delete_register_auth(registry_auth_id: str) -> None:
|
194
|
+
"""Deletes a registry auth."""
|
195
|
+
try:
|
196
|
+
runpod.runpod.delete_container_registry_auth(registry_auth_id)
|
197
|
+
except runpod.runpod.error.QueryError as e:
|
198
|
+
logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}'
|
199
|
+
'Please delete it manually.')
|
200
|
+
|
201
|
+
|
202
|
+
def _create_template_for_docker_login(
|
203
|
+
cluster_name: str,
|
204
|
+
image_name: str,
|
205
|
+
docker_login_config: Optional[Dict[str, str]],
|
206
|
+
) -> Tuple[str, Optional[str]]:
|
207
|
+
"""Creates a template for the given image with the docker login config.
|
208
|
+
|
209
|
+
Returns:
|
210
|
+
formatted_image_name: The formatted image name.
|
211
|
+
template_id: The template ID. None for no docker login config.
|
212
|
+
"""
|
213
|
+
if docker_login_config is None:
|
214
|
+
return image_name, None
|
215
|
+
login_config = docker_utils.DockerLoginConfig(**docker_login_config)
|
216
|
+
container_registry_auth_name = f'{cluster_name}-registry-auth'
|
217
|
+
container_template_name = _construct_docker_login_template_name(
|
218
|
+
cluster_name)
|
219
|
+
# The `name` argument is only for display purpose and the registry server
|
220
|
+
# will be split from the docker image name (Tested with AWS ECR).
|
221
|
+
# Here we only need the username and password to create the registry auth.
|
222
|
+
# TODO(tian): Now we create a template and a registry auth for each cluster.
|
223
|
+
# Consider creating one for each server and reusing them. Challenges include
|
224
|
+
# calculating the reference count and deleting them when no longer needed.
|
225
|
+
create_auth_resp = runpod.runpod.create_container_registry_auth(
|
226
|
+
name=container_registry_auth_name,
|
227
|
+
username=login_config.username,
|
228
|
+
password=login_config.password,
|
229
|
+
)
|
230
|
+
registry_auth_id = create_auth_resp['id']
|
231
|
+
create_template_resp = runpod.runpod.create_template(
|
232
|
+
name=container_template_name,
|
233
|
+
image_name=None,
|
234
|
+
registry_auth_id=registry_auth_id,
|
235
|
+
)
|
236
|
+
return login_config.format_image(image_name), create_template_resp['id']
|
237
|
+
|
238
|
+
|
239
|
+
def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
240
|
+
disk_size: int, image_name: str, ports: Optional[List[int]],
|
241
|
+
public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
|
242
|
+
docker_login_config: Optional[Dict[str, str]]) -> str:
|
106
243
|
"""Launches an instance with the given parameters.
|
107
244
|
|
108
245
|
Converts the instance_type to the RunPod GPU name, finds the specs for the
|
109
246
|
GPU, and launches the instance.
|
247
|
+
|
248
|
+
Returns:
|
249
|
+
instance_id: The instance ID.
|
110
250
|
"""
|
251
|
+
name = f'{cluster_name}-{node_type}'
|
111
252
|
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
112
253
|
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
113
254
|
cloud_type = instance_type.split('_')[2]
|
@@ -139,21 +280,24 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
139
280
|
# Use base64 to deal with the tricky quoting issues caused by runpod API.
|
140
281
|
encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')
|
141
282
|
|
283
|
+
docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
|
284
|
+
f'bash init.sh\'')
|
285
|
+
|
142
286
|
# Port 8081 is occupied for nginx in the base image.
|
143
287
|
custom_ports_str = ''
|
144
288
|
if ports is not None:
|
145
289
|
custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])
|
290
|
+
ports_str = (f'22/tcp,'
|
291
|
+
f'{custom_ports_str}'
|
292
|
+
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
|
293
|
+
f'{constants.SKY_REMOTE_RAY_PORT}/http')
|
146
294
|
|
147
|
-
|
148
|
-
|
149
|
-
ports = (f'22/tcp,'
|
150
|
-
f'{custom_ports_str}'
|
151
|
-
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
|
152
|
-
f'{constants.SKY_REMOTE_RAY_PORT}/http')
|
295
|
+
image_name_formatted, template_id = _create_template_for_docker_login(
|
296
|
+
cluster_name, image_name, docker_login_config)
|
153
297
|
|
154
298
|
params = {
|
155
299
|
'name': name,
|
156
|
-
'image_name':
|
300
|
+
'image_name': image_name_formatted,
|
157
301
|
'gpu_type_id': gpu_type,
|
158
302
|
'cloud_type': cloud_type,
|
159
303
|
'container_disk_in_gb': disk_size,
|
@@ -161,9 +305,10 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
161
305
|
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
162
306
|
'gpu_count': gpu_quantity,
|
163
307
|
'country_code': region,
|
164
|
-
'ports':
|
308
|
+
'ports': ports_str,
|
165
309
|
'support_public_ip': True,
|
166
310
|
'docker_args': docker_args,
|
311
|
+
'template_id': template_id,
|
167
312
|
}
|
168
313
|
|
169
314
|
if preemptible is None or not preemptible:
|
@@ -177,6 +322,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
177
322
|
return new_instance['id']
|
178
323
|
|
179
324
|
|
325
|
+
def get_registry_auth_resources(
|
326
|
+
cluster_name: str) -> Tuple[Optional[str], Optional[str]]:
|
327
|
+
"""Gets the registry auth resources."""
|
328
|
+
container_registry_auth_name = _construct_docker_login_template_name(
|
329
|
+
cluster_name)
|
330
|
+
for template in _list_pod_templates_with_container_registry():
|
331
|
+
if template['name'] == container_registry_auth_name:
|
332
|
+
return container_registry_auth_name, template[
|
333
|
+
'containerRegistryAuthId']
|
334
|
+
return None, None
|
335
|
+
|
336
|
+
|
180
337
|
def remove(instance_id: str) -> None:
|
181
338
|
"""Terminates the given instance."""
|
182
339
|
runpod.runpod.terminate_pod(instance_id)
|
sky/resources.py
CHANGED
@@ -540,7 +540,7 @@ class Resources:
|
|
540
540
|
if memory_gb <= 0:
|
541
541
|
with ux_utils.print_exception_no_traceback():
|
542
542
|
raise ValueError(
|
543
|
-
f'The "
|
543
|
+
f'The "memory" field should be positive. Found: {memory!r}')
|
544
544
|
|
545
545
|
def _set_accelerators(
|
546
546
|
self,
|