skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
2
2
|
import dataclasses
|
3
|
-
import functools
|
4
3
|
import textwrap
|
5
4
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
6
5
|
|
@@ -9,15 +8,17 @@ import colorama
|
|
9
8
|
from sky import check as sky_check
|
10
9
|
from sky import clouds
|
11
10
|
from sky import exceptions
|
12
|
-
from sky import jobs as managed_jobs
|
13
11
|
from sky import sky_logging
|
14
12
|
from sky import skypilot_config
|
15
13
|
from sky.clouds import service_catalog
|
16
14
|
from sky.provision import docker_utils
|
15
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
17
16
|
from sky.skylet import constants
|
18
17
|
from sky.utils import accelerator_registry
|
18
|
+
from sky.utils import annotations
|
19
19
|
from sky.utils import common_utils
|
20
20
|
from sky.utils import log_utils
|
21
|
+
from sky.utils import registry
|
21
22
|
from sky.utils import resources_utils
|
22
23
|
from sky.utils import schemas
|
23
24
|
from sky.utils import ux_utils
|
@@ -32,7 +33,7 @@ class Resources:
|
|
32
33
|
|
33
34
|
This class is immutable once created (to ensure some validations are done
|
34
35
|
whenever properties change). To update the property of an instance of
|
35
|
-
Resources, use
|
36
|
+
Resources, use ``resources.copy(**new_properties)``.
|
36
37
|
|
37
38
|
Used:
|
38
39
|
|
@@ -44,7 +45,7 @@ class Resources:
|
|
44
45
|
"""
|
45
46
|
# If any fields changed, increment the version. For backward compatibility,
|
46
47
|
# modify the __setstate__ method to handle the old version.
|
47
|
-
_VERSION =
|
48
|
+
_VERSION = 22
|
48
49
|
|
49
50
|
def __init__(
|
50
51
|
self,
|
@@ -55,7 +56,7 @@ class Resources:
|
|
55
56
|
accelerators: Union[None, str, Dict[str, int]] = None,
|
56
57
|
accelerator_args: Optional[Dict[str, str]] = None,
|
57
58
|
use_spot: Optional[bool] = None,
|
58
|
-
job_recovery: Optional[str] = None,
|
59
|
+
job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
|
59
60
|
region: Optional[str] = None,
|
60
61
|
zone: Optional[str] = None,
|
61
62
|
image_id: Union[Dict[str, str], str, None] = None,
|
@@ -66,8 +67,10 @@ class Resources:
|
|
66
67
|
# Internal use only.
|
67
68
|
# pylint: disable=invalid-name
|
68
69
|
_docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
|
70
|
+
_docker_username_for_runpod: Optional[str] = None,
|
69
71
|
_is_image_managed: Optional[bool] = None,
|
70
72
|
_requires_fuse: Optional[bool] = None,
|
73
|
+
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
71
74
|
):
|
72
75
|
"""Initialize a Resources object.
|
73
76
|
|
@@ -110,6 +113,12 @@ class Resources:
|
|
110
113
|
job to recover the cluster from preemption. Refer to
|
111
114
|
`recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
|
112
115
|
for more details.
|
116
|
+
When a dict is provided, it can have the following fields:
|
117
|
+
|
118
|
+
- strategy: the recovery strategy to use.
|
119
|
+
- max_restarts_on_errors: the max number of restarts on user code
|
120
|
+
errors.
|
121
|
+
|
113
122
|
region: the region to use.
|
114
123
|
zone: the zone to use.
|
115
124
|
image_id: the image ID to use. If a str, must be a string
|
@@ -140,6 +149,9 @@ class Resources:
|
|
140
149
|
_docker_login_config: the docker configuration to use. This includes
|
141
150
|
the docker username, password, and registry server. If None, skip
|
142
151
|
docker login.
|
152
|
+
_docker_username_for_runpod: the login username for the docker
|
153
|
+
containers. This is used by RunPod to set the ssh user for the
|
154
|
+
docker containers.
|
143
155
|
_requires_fuse: whether the task requires FUSE mounting support. This
|
144
156
|
is used internally by certain cloud implementations to do additional
|
145
157
|
setup for FUSE mounting. This flag also safeguards against using
|
@@ -152,18 +164,27 @@ class Resources:
|
|
152
164
|
"""
|
153
165
|
self._version = self._VERSION
|
154
166
|
self._cloud = cloud
|
155
|
-
self._region: Optional[str] =
|
156
|
-
self._zone: Optional[str] =
|
157
|
-
self._validate_and_set_region_zone(region, zone)
|
167
|
+
self._region: Optional[str] = region
|
168
|
+
self._zone: Optional[str] = zone
|
158
169
|
|
159
170
|
self._instance_type = instance_type
|
160
171
|
|
161
172
|
self._use_spot_specified = use_spot is not None
|
162
173
|
self._use_spot = use_spot if use_spot is not None else False
|
163
|
-
self._job_recovery = None
|
174
|
+
self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
|
164
175
|
if job_recovery is not None:
|
165
|
-
if job_recovery
|
166
|
-
|
176
|
+
if isinstance(job_recovery, str):
|
177
|
+
job_recovery = {'strategy': job_recovery}
|
178
|
+
if 'strategy' not in job_recovery:
|
179
|
+
job_recovery['strategy'] = None
|
180
|
+
|
181
|
+
strategy_name = job_recovery['strategy']
|
182
|
+
if strategy_name == 'none':
|
183
|
+
self._job_recovery = None
|
184
|
+
else:
|
185
|
+
if strategy_name is not None:
|
186
|
+
job_recovery['strategy'] = strategy_name.upper()
|
187
|
+
self._job_recovery = job_recovery
|
167
188
|
|
168
189
|
if disk_size is not None:
|
169
190
|
if round(disk_size) != disk_size:
|
@@ -174,8 +195,6 @@ class Resources:
|
|
174
195
|
else:
|
175
196
|
self._disk_size = _DEFAULT_DISK_SIZE_GB
|
176
197
|
|
177
|
-
# self._image_id is a dict of {region: image_id}.
|
178
|
-
# The key is None if the same image_id applies for all regions.
|
179
198
|
self._image_id = image_id
|
180
199
|
if isinstance(image_id, str):
|
181
200
|
self._image_id = {self._region: image_id.strip()}
|
@@ -216,12 +235,25 @@ class Resources:
|
|
216
235
|
|
217
236
|
self._docker_login_config = _docker_login_config
|
218
237
|
|
238
|
+
# TODO(andyl): This ctor param seems to be unused.
|
239
|
+
# We always use `Task.set_resources` and `Resources.copy` to set the
|
240
|
+
# `docker_username_for_runpod`. But to keep the consistency with
|
241
|
+
# `_docker_login_config`, we keep it here.
|
242
|
+
self._docker_username_for_runpod = _docker_username_for_runpod
|
243
|
+
|
219
244
|
self._requires_fuse = _requires_fuse
|
220
245
|
|
246
|
+
self._cluster_config_overrides = _cluster_config_overrides
|
247
|
+
self._cached_repr = None
|
248
|
+
|
221
249
|
self._set_cpus(cpus)
|
222
250
|
self._set_memory(memory)
|
223
251
|
self._set_accelerators(accelerators, accelerator_args)
|
224
252
|
|
253
|
+
def validate(self):
|
254
|
+
"""Validate the resources and infer the missing fields if possible."""
|
255
|
+
self._try_canonicalize_accelerators()
|
256
|
+
self._try_validate_and_set_region_zone()
|
225
257
|
self._try_validate_instance_type()
|
226
258
|
self._try_validate_cpus_mem()
|
227
259
|
self._try_validate_managed_job_attributes()
|
@@ -260,6 +292,8 @@ class Resources:
|
|
260
292
|
>>> sky.Resources(disk_size=100)
|
261
293
|
<Cloud>(disk_size=100)
|
262
294
|
"""
|
295
|
+
if self._cached_repr is not None:
|
296
|
+
return self._cached_repr
|
263
297
|
accelerators = ''
|
264
298
|
accelerator_args = ''
|
265
299
|
if self.accelerators is not None:
|
@@ -319,7 +353,8 @@ class Resources:
|
|
319
353
|
if self.cloud is not None:
|
320
354
|
cloud_str = f'{self.cloud}'
|
321
355
|
|
322
|
-
|
356
|
+
self._cached_repr = f'{cloud_str}({hardware_str})'
|
357
|
+
return self._cached_repr
|
323
358
|
|
324
359
|
@property
|
325
360
|
def repr_with_region_zone(self) -> str:
|
@@ -353,7 +388,7 @@ class Resources:
|
|
353
388
|
return self._instance_type
|
354
389
|
|
355
390
|
@property
|
356
|
-
@
|
391
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
357
392
|
def cpus(self) -> Optional[str]:
|
358
393
|
"""Returns the number of vCPUs that each instance must have.
|
359
394
|
|
@@ -387,8 +422,8 @@ class Resources:
|
|
387
422
|
return self._memory
|
388
423
|
|
389
424
|
@property
|
390
|
-
@
|
391
|
-
def accelerators(self) -> Optional[Dict[str, int]]:
|
425
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
426
|
+
def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
|
392
427
|
"""Returns the accelerators field directly or by inferring.
|
393
428
|
|
394
429
|
For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
|
@@ -415,7 +450,7 @@ class Resources:
|
|
415
450
|
return self._use_spot_specified
|
416
451
|
|
417
452
|
@property
|
418
|
-
def job_recovery(self) -> Optional[str]:
|
453
|
+
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
|
419
454
|
return self._job_recovery
|
420
455
|
|
421
456
|
@property
|
@@ -448,10 +483,20 @@ class Resources:
|
|
448
483
|
return False
|
449
484
|
return self._requires_fuse
|
450
485
|
|
486
|
+
@property
|
487
|
+
def cluster_config_overrides(self) -> Dict[str, Any]:
|
488
|
+
if self._cluster_config_overrides is None:
|
489
|
+
return {}
|
490
|
+
return self._cluster_config_overrides
|
491
|
+
|
451
492
|
@requires_fuse.setter
|
452
493
|
def requires_fuse(self, value: Optional[bool]) -> None:
|
453
494
|
self._requires_fuse = value
|
454
495
|
|
496
|
+
@property
|
497
|
+
def docker_username_for_runpod(self) -> Optional[str]:
|
498
|
+
return self._docker_username_for_runpod
|
499
|
+
|
455
500
|
def _set_cpus(
|
456
501
|
self,
|
457
502
|
cpus: Union[None, int, float, str],
|
@@ -513,7 +558,7 @@ class Resources:
|
|
513
558
|
if memory_gb <= 0:
|
514
559
|
with ux_utils.print_exception_no_traceback():
|
515
560
|
raise ValueError(
|
516
|
-
f'The "
|
561
|
+
f'The "memory" field should be positive. Found: {memory!r}')
|
517
562
|
|
518
563
|
def _set_accelerators(
|
519
564
|
self,
|
@@ -546,36 +591,49 @@ class Resources:
|
|
546
591
|
with ux_utils.print_exception_no_traceback():
|
547
592
|
raise ValueError(parse_error) from None
|
548
593
|
|
549
|
-
# Canonicalize the accelerator names.
|
550
|
-
accelerators = {
|
551
|
-
accelerator_registry.canonicalize_accelerator_name(
|
552
|
-
acc, self._cloud): acc_count
|
553
|
-
for acc, acc_count in accelerators.items()
|
554
|
-
}
|
555
|
-
|
556
594
|
acc, _ = list(accelerators.items())[0]
|
557
595
|
if 'tpu' in acc.lower():
|
558
596
|
if self.cloud is None:
|
559
|
-
|
560
|
-
|
561
|
-
|
597
|
+
if kubernetes_utils.is_tpu_on_gke(acc):
|
598
|
+
self._cloud = clouds.Kubernetes()
|
599
|
+
else:
|
600
|
+
self._cloud = clouds.GCP()
|
601
|
+
assert (self.cloud.is_same_cloud(clouds.GCP()) or
|
602
|
+
self.cloud.is_same_cloud(clouds.Kubernetes())), (
|
603
|
+
'Cloud must be GCP or Kubernetes for TPU '
|
604
|
+
'accelerators.')
|
605
|
+
|
562
606
|
if accelerator_args is None:
|
563
607
|
accelerator_args = {}
|
608
|
+
|
564
609
|
use_tpu_vm = accelerator_args.get('tpu_vm', True)
|
565
|
-
if self.
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
610
|
+
if (self.cloud.is_same_cloud(clouds.GCP()) and
|
611
|
+
not kubernetes_utils.is_tpu_on_gke(acc)):
|
612
|
+
if 'runtime_version' not in accelerator_args:
|
613
|
+
|
614
|
+
def _get_default_runtime_version() -> str:
|
615
|
+
if not use_tpu_vm:
|
616
|
+
return '2.12.0'
|
617
|
+
# TPU V5 requires a newer runtime version.
|
618
|
+
if acc.startswith('tpu-v5'):
|
619
|
+
return 'v2-alpha-tpuv5'
|
620
|
+
# TPU V6e requires a newer runtime version.
|
621
|
+
elif acc.startswith('tpu-v6e'):
|
622
|
+
return 'v2-alpha-tpuv6e'
|
623
|
+
return 'tpu-vm-base'
|
624
|
+
|
625
|
+
accelerator_args['runtime_version'] = (
|
626
|
+
_get_default_runtime_version())
|
627
|
+
logger.info(
|
628
|
+
'Missing runtime_version in accelerator_args, using'
|
629
|
+
f' default ({accelerator_args["runtime_version"]})')
|
630
|
+
|
631
|
+
if self.instance_type is not None and use_tpu_vm:
|
632
|
+
if self.instance_type != 'TPU-VM':
|
633
|
+
with ux_utils.print_exception_no_traceback():
|
634
|
+
raise ValueError(
|
635
|
+
'Cannot specify instance type (got '
|
636
|
+
f'{self.instance_type!r}) for TPU VM.')
|
579
637
|
|
580
638
|
self._accelerators = accelerators
|
581
639
|
self._accelerator_args = accelerator_args
|
@@ -588,15 +646,30 @@ class Resources:
|
|
588
646
|
assert self.is_launchable(), self
|
589
647
|
return self.cloud.need_cleanup_after_preemption_or_failure(self)
|
590
648
|
|
591
|
-
def
|
592
|
-
|
649
|
+
def _try_canonicalize_accelerators(self) -> None:
|
650
|
+
"""Try to canonicalize the accelerators attribute.
|
651
|
+
|
652
|
+
We don't canonicalize accelerators during creation of Resources object
|
653
|
+
because it may check Kubernetes accelerators online. It requires
|
654
|
+
Kubernetes credentials which may not be available locally when a remote
|
655
|
+
API server is used.
|
656
|
+
"""
|
657
|
+
if self._accelerators is None:
|
658
|
+
return
|
659
|
+
self._accelerators = {
|
660
|
+
accelerator_registry.canonicalize_accelerator_name(
|
661
|
+
acc, self._cloud): acc_count
|
662
|
+
for acc, acc_count in self._accelerators.items()
|
663
|
+
}
|
664
|
+
|
665
|
+
def _try_validate_and_set_region_zone(self) -> None:
|
593
666
|
"""Try to validate and set the region and zone attribute.
|
594
667
|
|
595
668
|
Raises:
|
596
669
|
ValueError: if the attributes are invalid.
|
597
670
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
598
671
|
"""
|
599
|
-
if
|
672
|
+
if self._region is None and self._zone is None:
|
600
673
|
return
|
601
674
|
|
602
675
|
if self._cloud is None:
|
@@ -608,13 +681,13 @@ class Resources:
|
|
608
681
|
cloud_to_errors = {}
|
609
682
|
for cloud in enabled_clouds:
|
610
683
|
try:
|
611
|
-
cloud.validate_region_zone(
|
684
|
+
cloud.validate_region_zone(self._region, self._zone)
|
612
685
|
except ValueError as e:
|
613
686
|
cloud_to_errors[repr(cloud)] = e
|
614
687
|
continue
|
615
688
|
valid_clouds.append(cloud)
|
616
689
|
|
617
|
-
if
|
690
|
+
if not valid_clouds:
|
618
691
|
if len(enabled_clouds) == 1:
|
619
692
|
cloud_str = f'for cloud {enabled_clouds[0]}'
|
620
693
|
else:
|
@@ -632,23 +705,24 @@ class Resources:
|
|
632
705
|
table.add_row([str(cloud), reason_str])
|
633
706
|
hint = table.get_string()
|
634
707
|
raise ValueError(
|
635
|
-
f'Invalid (region {
|
636
|
-
f'{cloud_str}. Details:\n{hint}')
|
708
|
+
f'Invalid (region {self._region!r}, zone '
|
709
|
+
f'{self._zone!r}) {cloud_str}. Details:\n{hint}')
|
637
710
|
elif len(valid_clouds) > 1:
|
638
711
|
with ux_utils.print_exception_no_traceback():
|
639
712
|
raise ValueError(
|
640
|
-
f'Cannot infer cloud from (region {
|
641
|
-
f'{
|
642
|
-
f'of the same names: {valid_clouds}. '
|
713
|
+
f'Cannot infer cloud from (region {self._region!r}, '
|
714
|
+
f'zone {self._zone!r}). Multiple enabled clouds '
|
715
|
+
f'have region/zone of the same names: {valid_clouds}. '
|
643
716
|
f'To fix: explicitly specify `cloud`.')
|
644
717
|
logger.debug(f'Cloud is not specified, using {valid_clouds[0]} '
|
645
|
-
f'inferred from region {
|
718
|
+
f'inferred from region {self._region!r} and zone '
|
719
|
+
f'{self._zone!r}')
|
646
720
|
self._cloud = valid_clouds[0]
|
647
721
|
|
648
722
|
# Validate if region and zone exist in the catalog, and set the region
|
649
723
|
# if zone is specified.
|
650
724
|
self._region, self._zone = self._cloud.validate_region_zone(
|
651
|
-
|
725
|
+
self._region, self._zone)
|
652
726
|
|
653
727
|
def get_valid_regions_for_launchable(self) -> List[clouds.Region]:
|
654
728
|
"""Returns a set of `Region`s that can provision this Resources.
|
@@ -726,7 +800,7 @@ class Resources:
|
|
726
800
|
for cloud in enabled_clouds:
|
727
801
|
if cloud.instance_type_exists(self._instance_type):
|
728
802
|
valid_clouds.append(cloud)
|
729
|
-
if
|
803
|
+
if not valid_clouds:
|
730
804
|
if len(enabled_clouds) == 1:
|
731
805
|
cloud_str = f'for cloud {enabled_clouds[0]}'
|
732
806
|
else:
|
@@ -797,14 +871,11 @@ class Resources:
|
|
797
871
|
Raises:
|
798
872
|
ValueError: if the attributes are invalid.
|
799
873
|
"""
|
800
|
-
if self._job_recovery is None:
|
874
|
+
if self._job_recovery is None or self._job_recovery['strategy'] is None:
|
801
875
|
return
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
f'Spot recovery strategy {self._job_recovery} '
|
806
|
-
'is not supported. The strategy should be among '
|
807
|
-
f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
|
876
|
+
# Validate the job recovery strategy
|
877
|
+
registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
|
878
|
+
self._job_recovery['strategy'])
|
808
879
|
|
809
880
|
def extract_docker_image(self) -> Optional[str]:
|
810
881
|
if self.image_id is None:
|
@@ -826,12 +897,6 @@ class Resources:
|
|
826
897
|
|
827
898
|
if self.extract_docker_image() is not None:
|
828
899
|
# TODO(tian): validate the docker image exists / of reasonable size
|
829
|
-
if self.accelerators is not None:
|
830
|
-
for acc in self.accelerators.keys():
|
831
|
-
if acc.lower().startswith('tpu'):
|
832
|
-
with ux_utils.print_exception_no_traceback():
|
833
|
-
raise ValueError(
|
834
|
-
'Docker image is not supported for TPU VM.')
|
835
900
|
if self.cloud is not None:
|
836
901
|
self.cloud.check_features_are_supported(
|
837
902
|
self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE})
|
@@ -920,12 +985,6 @@ class Resources:
|
|
920
985
|
"""
|
921
986
|
if self.ports is None:
|
922
987
|
return
|
923
|
-
if skypilot_config.get_nested(('aws', 'security_group_name'),
|
924
|
-
None) is not None:
|
925
|
-
with ux_utils.print_exception_no_traceback():
|
926
|
-
raise ValueError(
|
927
|
-
'Cannot specify ports when AWS security group name is '
|
928
|
-
'specified.')
|
929
988
|
if self.cloud is not None:
|
930
989
|
self.cloud.check_features_are_supported(
|
931
990
|
self, {clouds.CloudImplementationFeatures.OPEN_PORTS})
|
@@ -956,21 +1015,23 @@ class Resources:
|
|
956
1015
|
"""
|
957
1016
|
if not self._labels:
|
958
1017
|
return
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
#
|
963
|
-
|
964
|
-
|
965
|
-
'Cloud must be specified when labels are provided.')
|
966
|
-
|
967
|
-
# Check if the label key value pairs are valid.
|
1018
|
+
if self.cloud is not None:
|
1019
|
+
validated_clouds = [self.cloud]
|
1020
|
+
else:
|
1021
|
+
# If no specific cloud is set, validate label against ALL clouds.
|
1022
|
+
# The label will be dropped if invalid for any one of the clouds
|
1023
|
+
validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
|
968
1024
|
invalid_table = log_utils.create_table(['Label', 'Reason'])
|
969
1025
|
for key, value in self._labels.items():
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
1026
|
+
for cloud in validated_clouds:
|
1027
|
+
valid, err_msg = cloud.is_label_valid(key, value)
|
1028
|
+
if not valid:
|
1029
|
+
invalid_table.add_row([
|
1030
|
+
f'{key}: {value}',
|
1031
|
+
f'Label rejected due to {cloud}: {err_msg}'
|
1032
|
+
])
|
1033
|
+
break
|
1034
|
+
if invalid_table.rows:
|
974
1035
|
with ux_utils.print_exception_no_traceback():
|
975
1036
|
raise ValueError(
|
976
1037
|
'The following labels are invalid:'
|
@@ -1000,9 +1061,10 @@ class Resources:
|
|
1000
1061
|
def get_spot_str(self) -> str:
|
1001
1062
|
return '[Spot]' if self.use_spot else ''
|
1002
1063
|
|
1003
|
-
def make_deploy_variables(self,
|
1064
|
+
def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
|
1004
1065
|
region: clouds.Region,
|
1005
1066
|
zones: Optional[List[clouds.Zone]],
|
1067
|
+
num_nodes: int,
|
1006
1068
|
dryrun: bool) -> Dict[str, Optional[str]]:
|
1007
1069
|
"""Converts planned sky.Resources to resource variables.
|
1008
1070
|
|
@@ -1011,13 +1073,48 @@ class Resources:
|
|
1011
1073
|
cloud.make_deploy_resources_variables() method, and the cloud-agnostic
|
1012
1074
|
variables are generated by this method.
|
1013
1075
|
"""
|
1014
|
-
|
1015
|
-
|
1076
|
+
# Initial setup commands
|
1077
|
+
initial_setup_commands = []
|
1078
|
+
if (skypilot_config.get_nested(
|
1079
|
+
('nvidia_gpus', 'disable_ecc'),
|
1080
|
+
False,
|
1081
|
+
override_configs=self.cluster_config_overrides) and
|
1082
|
+
self.accelerators is not None):
|
1083
|
+
initial_setup_commands = [constants.DISABLE_GPU_ECC_COMMAND]
|
1084
|
+
|
1016
1085
|
docker_image = self.extract_docker_image()
|
1086
|
+
|
1087
|
+
# Cloud specific variables
|
1088
|
+
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1089
|
+
self, cluster_name, region, zones, num_nodes, dryrun)
|
1090
|
+
|
1091
|
+
# TODO(andyl): Should we print some warnings if users' envs share
|
1092
|
+
# same names with the cloud specific variables, but not enabled
|
1093
|
+
# since it's not on the particular cloud?
|
1094
|
+
|
1095
|
+
# Docker run options
|
1096
|
+
docker_run_options = skypilot_config.get_nested(
|
1097
|
+
('docker', 'run_options'),
|
1098
|
+
default_value=[],
|
1099
|
+
override_configs=self.cluster_config_overrides)
|
1100
|
+
if isinstance(docker_run_options, str):
|
1101
|
+
docker_run_options = [docker_run_options]
|
1102
|
+
# Special accelerator runtime might require additional docker run
|
1103
|
+
# options. e.g., for TPU, we need --privileged.
|
1104
|
+
if 'docker_run_options' in cloud_specific_variables:
|
1105
|
+
docker_run_options.extend(
|
1106
|
+
cloud_specific_variables['docker_run_options'])
|
1107
|
+
if docker_run_options and isinstance(self.cloud, clouds.Kubernetes):
|
1108
|
+
logger.warning(
|
1109
|
+
f'{colorama.Style.DIM}Docker run options are specified, '
|
1110
|
+
'but ignored for Kubernetes: '
|
1111
|
+
f'{" ".join(docker_run_options)}'
|
1112
|
+
f'{colorama.Style.RESET_ALL}')
|
1017
1113
|
return dict(
|
1018
1114
|
cloud_specific_variables,
|
1019
1115
|
**{
|
1020
1116
|
# Docker config
|
1117
|
+
'docker_run_options': docker_run_options,
|
1021
1118
|
# Docker image. The image name used to pull the image, e.g.
|
1022
1119
|
# ubuntu:latest.
|
1023
1120
|
'docker_image': docker_image,
|
@@ -1027,7 +1124,9 @@ class Resources:
|
|
1027
1124
|
constants.DEFAULT_DOCKER_CONTAINER_NAME,
|
1028
1125
|
# Docker login config (if any). This helps pull the image from
|
1029
1126
|
# private registries.
|
1030
|
-
'docker_login_config': self._docker_login_config
|
1127
|
+
'docker_login_config': self._docker_login_config,
|
1128
|
+
# Initial setup commands.
|
1129
|
+
'initial_setup_commands': initial_setup_commands,
|
1031
1130
|
})
|
1032
1131
|
|
1033
1132
|
def get_reservations_available_resources(self) -> Dict[str, int]:
|
@@ -1169,17 +1268,17 @@ class Resources:
|
|
1169
1268
|
def is_empty(self) -> bool:
|
1170
1269
|
"""Is this Resources an empty request (all fields None)?"""
|
1171
1270
|
return all([
|
1172
|
-
self.
|
1271
|
+
self._cloud is None,
|
1173
1272
|
self._instance_type is None,
|
1174
1273
|
self._cpus is None,
|
1175
|
-
self.
|
1176
|
-
self.
|
1177
|
-
self.
|
1274
|
+
self._memory is None,
|
1275
|
+
self._accelerators is None,
|
1276
|
+
self._accelerator_args is None,
|
1178
1277
|
not self._use_spot_specified,
|
1179
|
-
self.
|
1180
|
-
self.
|
1278
|
+
self._disk_size == _DEFAULT_DISK_SIZE_GB,
|
1279
|
+
self._disk_tier is None,
|
1181
1280
|
self._image_id is None,
|
1182
|
-
self.
|
1281
|
+
self._ports is None,
|
1183
1282
|
self._docker_login_config is None,
|
1184
1283
|
])
|
1185
1284
|
|
@@ -1205,11 +1304,16 @@ class Resources:
|
|
1205
1304
|
labels=override.pop('labels', self.labels),
|
1206
1305
|
_docker_login_config=override.pop('_docker_login_config',
|
1207
1306
|
self._docker_login_config),
|
1307
|
+
_docker_username_for_runpod=override.pop(
|
1308
|
+
'_docker_username_for_runpod',
|
1309
|
+
self._docker_username_for_runpod),
|
1208
1310
|
_is_image_managed=override.pop('_is_image_managed',
|
1209
1311
|
self._is_image_managed),
|
1210
1312
|
_requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
|
1313
|
+
_cluster_config_overrides=override.pop(
|
1314
|
+
'_cluster_config_overrides', self._cluster_config_overrides),
|
1211
1315
|
)
|
1212
|
-
assert
|
1316
|
+
assert not override
|
1213
1317
|
return resources
|
1214
1318
|
|
1215
1319
|
def valid_on_region_zones(self, region: str, zones: List[str]) -> bool:
|
@@ -1337,7 +1441,7 @@ class Resources:
|
|
1337
1441
|
def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
|
1338
1442
|
|
1339
1443
|
resources_fields = {}
|
1340
|
-
resources_fields['cloud'] =
|
1444
|
+
resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
|
1341
1445
|
config.pop('cloud', None))
|
1342
1446
|
resources_fields['instance_type'] = config.pop('instance_type', None)
|
1343
1447
|
resources_fields['cpus'] = config.pop('cpus', None)
|
@@ -1364,9 +1468,13 @@ class Resources:
|
|
1364
1468
|
resources_fields['labels'] = config.pop('labels', None)
|
1365
1469
|
resources_fields['_docker_login_config'] = config.pop(
|
1366
1470
|
'_docker_login_config', None)
|
1471
|
+
resources_fields['_docker_username_for_runpod'] = config.pop(
|
1472
|
+
'_docker_username_for_runpod', None)
|
1367
1473
|
resources_fields['_is_image_managed'] = config.pop(
|
1368
1474
|
'_is_image_managed', None)
|
1369
1475
|
resources_fields['_requires_fuse'] = config.pop('_requires_fuse', None)
|
1476
|
+
resources_fields['_cluster_config_overrides'] = config.pop(
|
1477
|
+
'_cluster_config_overrides', None)
|
1370
1478
|
|
1371
1479
|
if resources_fields['cpus'] is not None:
|
1372
1480
|
resources_fields['cpus'] = str(resources_fields['cpus'])
|
@@ -1393,7 +1501,7 @@ class Resources:
|
|
1393
1501
|
add_if_not_none('instance_type', self.instance_type)
|
1394
1502
|
add_if_not_none('cpus', self._cpus)
|
1395
1503
|
add_if_not_none('memory', self.memory)
|
1396
|
-
add_if_not_none('accelerators', self.
|
1504
|
+
add_if_not_none('accelerators', self._accelerators)
|
1397
1505
|
add_if_not_none('accelerator_args', self.accelerator_args)
|
1398
1506
|
|
1399
1507
|
if self._use_spot_specified:
|
@@ -1410,6 +1518,11 @@ class Resources:
|
|
1410
1518
|
if self._docker_login_config is not None:
|
1411
1519
|
config['_docker_login_config'] = dataclasses.asdict(
|
1412
1520
|
self._docker_login_config)
|
1521
|
+
if self._docker_username_for_runpod is not None:
|
1522
|
+
config['_docker_username_for_runpod'] = (
|
1523
|
+
self._docker_username_for_runpod)
|
1524
|
+
add_if_not_none('_cluster_config_overrides',
|
1525
|
+
self._cluster_config_overrides)
|
1413
1526
|
if self._is_image_managed is not None:
|
1414
1527
|
config['_is_image_managed'] = self._is_image_managed
|
1415
1528
|
if self._requires_fuse is not None:
|
@@ -1525,4 +1638,36 @@ class Resources:
|
|
1525
1638
|
if version < 18:
|
1526
1639
|
self._job_recovery = state.pop('_spot_recovery', None)
|
1527
1640
|
|
1641
|
+
if version < 19:
|
1642
|
+
self._cluster_config_overrides = state.pop(
|
1643
|
+
'_cluster_config_overrides', None)
|
1644
|
+
|
1645
|
+
if version < 20:
|
1646
|
+
# Pre-0.7.0, we used 'kubernetes' as the default region for
|
1647
|
+
# Kubernetes clusters. With the introduction of support for
|
1648
|
+
# multiple contexts, we now set the region to the context name.
|
1649
|
+
# Since we do not have information on which context the cluster
|
1650
|
+
# was run in, we default it to the current active context.
|
1651
|
+
legacy_region = clouds.Kubernetes().LEGACY_SINGLETON_REGION
|
1652
|
+
original_cloud = state.get('_cloud', None)
|
1653
|
+
original_region = state.get('_region', None)
|
1654
|
+
if (isinstance(original_cloud, clouds.Kubernetes) and
|
1655
|
+
original_region == legacy_region):
|
1656
|
+
current_context = (
|
1657
|
+
kubernetes_utils.get_current_kube_config_context_name())
|
1658
|
+
state['_region'] = current_context
|
1659
|
+
# Also update the image_id dict if it contains the old region
|
1660
|
+
if isinstance(state['_image_id'], dict):
|
1661
|
+
if legacy_region in state['_image_id']:
|
1662
|
+
state['_image_id'][current_context] = (
|
1663
|
+
state['_image_id'][legacy_region])
|
1664
|
+
del state['_image_id'][legacy_region]
|
1665
|
+
|
1666
|
+
if version < 21:
|
1667
|
+
self._cached_repr = None
|
1668
|
+
|
1669
|
+
if version < 22:
|
1670
|
+
self._docker_username_for_runpod = state.pop(
|
1671
|
+
'_docker_username_for_runpod', None)
|
1672
|
+
|
1528
1673
|
self.__dict__.update(state)
|