skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/execution.py
CHANGED
@@ -3,12 +3,12 @@
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
4
4
|
"""
|
5
5
|
import enum
|
6
|
-
import
|
6
|
+
import typing
|
7
7
|
from typing import List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
import colorama
|
10
10
|
|
11
|
-
import
|
11
|
+
from sky import admin_policy
|
12
12
|
from sky import backends
|
13
13
|
from sky import clouds
|
14
14
|
from sky import global_user_state
|
@@ -16,14 +16,19 @@ from sky import optimizer
|
|
16
16
|
from sky import sky_logging
|
17
17
|
from sky.backends import backend_utils
|
18
18
|
from sky.usage import usage_lib
|
19
|
+
from sky.utils import admin_policy_utils
|
20
|
+
from sky.utils import common
|
19
21
|
from sky.utils import controller_utils
|
20
22
|
from sky.utils import dag_utils
|
21
|
-
from sky.utils import
|
23
|
+
from sky.utils import resources_utils
|
22
24
|
from sky.utils import rich_utils
|
23
|
-
from sky.utils import
|
25
|
+
from sky.utils import status_lib
|
24
26
|
from sky.utils import timeline
|
25
27
|
from sky.utils import ux_utils
|
26
28
|
|
29
|
+
if typing.TYPE_CHECKING:
|
30
|
+
import sky
|
31
|
+
|
27
32
|
logger = sky_logging.init_logger(__name__)
|
28
33
|
|
29
34
|
|
@@ -55,8 +60,9 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str],
|
|
55
60
|
with rich_utils.safe_status('Creating image from source cluster '
|
56
61
|
f'{clone_disk_from!r}'):
|
57
62
|
image_id = original_cloud.create_image_from_cluster(
|
58
|
-
|
59
|
-
|
63
|
+
cluster_name=resources_utils.ClusterName(
|
64
|
+
display_name=clone_disk_from,
|
65
|
+
name_on_cloud=handle.cluster_name_on_cloud),
|
60
66
|
region=handle.launched_resources.region,
|
61
67
|
zone=handle.launched_resources.zone,
|
62
68
|
)
|
@@ -98,7 +104,7 @@ def _execute(
|
|
98
104
|
handle: Optional[backends.ResourceHandle] = None,
|
99
105
|
backend: Optional[backends.Backend] = None,
|
100
106
|
retry_until_up: bool = False,
|
101
|
-
optimize_target:
|
107
|
+
optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
|
102
108
|
stages: Optional[List[Stage]] = None,
|
103
109
|
cluster_name: Optional[str] = None,
|
104
110
|
detach_setup: bool = False,
|
@@ -106,8 +112,10 @@ def _execute(
|
|
106
112
|
idle_minutes_to_autostop: Optional[int] = None,
|
107
113
|
no_setup: bool = False,
|
108
114
|
clone_disk_from: Optional[str] = None,
|
115
|
+
skip_unnecessary_provisioning: bool = False,
|
109
116
|
# Internal only:
|
110
117
|
# pylint: disable=invalid-name
|
118
|
+
_quiet_optimizer: bool = False,
|
111
119
|
_is_launched_by_jobs_controller: bool = False,
|
112
120
|
_is_launched_by_sky_serve_controller: bool = False,
|
113
121
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
@@ -126,8 +134,9 @@ def _execute(
|
|
126
134
|
Note that if errors occur during provisioning/data syncing/setting up,
|
127
135
|
the cluster will not be torn down for debugging purposes.
|
128
136
|
stream_logs: bool; whether to stream all tasks' outputs to the client.
|
129
|
-
handle: Optional[backends.ResourceHandle]; if provided, execution will
|
130
|
-
an existing backend cluster handle instead of
|
137
|
+
handle: Optional[backends.ResourceHandle]; if provided, execution will
|
138
|
+
attempt to use an existing backend cluster handle instead of
|
139
|
+
provisioning a new one.
|
131
140
|
backend: Backend; backend to use for executing the tasks. Defaults to
|
132
141
|
CloudVmRayBackend()
|
133
142
|
retry_until_up: bool; whether to retry the provisioning until the cluster
|
@@ -148,6 +157,11 @@ def _execute(
|
|
148
157
|
idle_minutes_to_autostop: int; if provided, the cluster will be set to
|
149
158
|
autostop after this many minutes of idleness.
|
150
159
|
no_setup: bool; whether to skip setup commands or not when (re-)launching.
|
160
|
+
clone_disk_from: Optional[str]; if set, clone the disk from the specified
|
161
|
+
cluster.
|
162
|
+
skip_unnecessary_provisioning: bool; if True, compare the calculated
|
163
|
+
cluster config to the current cluster's config. If they match, shortcut
|
164
|
+
provisioning even if we have Stage.PROVISION.
|
151
165
|
|
152
166
|
Returns:
|
153
167
|
job_id: Optional[int]; the job ID of the submitted job. None if the
|
@@ -156,21 +170,35 @@ def _execute(
|
|
156
170
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
157
171
|
if dryrun.
|
158
172
|
"""
|
173
|
+
|
159
174
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
175
|
+
for task in dag.tasks:
|
176
|
+
if task.storage_mounts is not None:
|
177
|
+
for storage in task.storage_mounts.values():
|
178
|
+
# Ensure the storage is constructed.
|
179
|
+
storage.construct()
|
180
|
+
dag, _ = admin_policy_utils.apply(
|
181
|
+
dag,
|
182
|
+
request_options=admin_policy.RequestOptions(
|
183
|
+
cluster_name=cluster_name,
|
184
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
185
|
+
down=down,
|
186
|
+
dryrun=dryrun,
|
187
|
+
))
|
160
188
|
assert len(dag) == 1, f'We support 1 task for now. {dag}'
|
161
189
|
task = dag.tasks[0]
|
162
190
|
|
163
191
|
if any(r.job_recovery is not None for r in task.resources):
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
192
|
+
logger.warning(
|
193
|
+
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
194
|
+
'but is launched as an unmanaged job. It will be ignored.'
|
195
|
+
'To enable job recovery, use managed jobs: sky jobs launch.'
|
196
|
+
f'{colorama.Style.RESET_ALL}')
|
168
197
|
|
169
198
|
cluster_exists = False
|
170
199
|
if cluster_name is not None:
|
171
|
-
|
172
|
-
|
173
|
-
cluster_exists = existing_handle is not None
|
200
|
+
cluster_record = global_user_state.get_cluster_from_name(cluster_name)
|
201
|
+
cluster_exists = cluster_record is not None
|
174
202
|
# TODO(woosuk): If the cluster exists, print a warning that
|
175
203
|
# `cpus` and `memory` are not used as a job scheduling constraint,
|
176
204
|
# unlike `gpus`.
|
@@ -206,7 +234,8 @@ def _execute(
|
|
206
234
|
'(after all jobs finish).'
|
207
235
|
f'{colorama.Style.RESET_ALL}')
|
208
236
|
idle_minutes_to_autostop = 1
|
209
|
-
|
237
|
+
if Stage.DOWN in stages:
|
238
|
+
stages.remove(Stage.DOWN)
|
210
239
|
if idle_minutes_to_autostop >= 0:
|
211
240
|
requested_features.add(
|
212
241
|
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
|
@@ -238,8 +267,8 @@ def _execute(
|
|
238
267
|
bold = colorama.Style.BRIGHT
|
239
268
|
reset = colorama.Style.RESET_ALL
|
240
269
|
logger.info(
|
241
|
-
f'{yellow}Launching
|
242
|
-
f'automatically recover from preemptions.
|
270
|
+
f'{yellow}Launching a spot job that does not '
|
271
|
+
f'automatically recover from preemptions. To '
|
243
272
|
'get automatic recovery, use managed job instead: '
|
244
273
|
f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} '
|
245
274
|
f'{bold}sky.jobs.launch(){reset}.')
|
@@ -253,7 +282,15 @@ def _execute(
|
|
253
282
|
# no-credential machine should not enter optimize(), which
|
254
283
|
# would directly error out ('No cloud is enabled...'). Fix
|
255
284
|
# by moving `sky check` checks out of optimize()?
|
256
|
-
|
285
|
+
controller = controller_utils.Controllers.from_name(
|
286
|
+
cluster_name)
|
287
|
+
if controller is not None:
|
288
|
+
logger.info(
|
289
|
+
f'Choosing resources for {controller.value.name}...'
|
290
|
+
)
|
291
|
+
dag = optimizer.Optimizer.optimize(dag,
|
292
|
+
minimize=optimize_target,
|
293
|
+
quiet=_quiet_optimizer)
|
257
294
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
258
295
|
assert task.best_resources is not None, task
|
259
296
|
|
@@ -267,13 +304,18 @@ def _execute(
|
|
267
304
|
|
268
305
|
try:
|
269
306
|
if Stage.PROVISION in stages:
|
270
|
-
|
271
|
-
handle
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
307
|
+
assert handle is None or skip_unnecessary_provisioning, (
|
308
|
+
'Provisioning requested, but handle is already set. PROVISION '
|
309
|
+
'should be excluded from stages or '
|
310
|
+
'skip_unecessary_provisioning should be set. ')
|
311
|
+
handle = backend.provision(
|
312
|
+
task,
|
313
|
+
task.best_resources,
|
314
|
+
dryrun=dryrun,
|
315
|
+
stream_logs=stream_logs,
|
316
|
+
cluster_name=cluster_name,
|
317
|
+
retry_until_up=retry_until_up,
|
318
|
+
skip_unnecessary_provisioning=skip_unnecessary_provisioning)
|
277
319
|
|
278
320
|
if handle is None:
|
279
321
|
assert dryrun, ('If not dryrun, handle must be set or '
|
@@ -281,11 +323,18 @@ def _execute(
|
|
281
323
|
logger.info('Dryrun finished.')
|
282
324
|
return None, None
|
283
325
|
|
284
|
-
|
285
|
-
|
286
|
-
|
326
|
+
do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
|
327
|
+
task.workdir is not None)
|
328
|
+
do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
|
329
|
+
(task.file_mounts is not None or
|
330
|
+
task.storage_mounts is not None))
|
331
|
+
if do_workdir or do_file_mounts:
|
332
|
+
logger.info(ux_utils.starting_message('Syncing files.'))
|
333
|
+
|
334
|
+
if do_workdir:
|
335
|
+
backend.sync_workdir(handle, task.workdir)
|
287
336
|
|
288
|
-
if
|
337
|
+
if do_file_mounts:
|
289
338
|
backend.sync_file_mounts(handle, task.file_mounts,
|
290
339
|
task.storage_mounts)
|
291
340
|
|
@@ -318,23 +367,6 @@ def _execute(
|
|
318
367
|
backend.teardown_ephemeral_storage(task)
|
319
368
|
backend.teardown(handle, terminate=True)
|
320
369
|
finally:
|
321
|
-
controller = controller_utils.Controllers.from_name(cluster_name)
|
322
|
-
if controller is None and not _is_launched_by_sky_serve_controller:
|
323
|
-
# UX: print live clusters to make users aware (to save costs).
|
324
|
-
#
|
325
|
-
# Don't print if this job is launched by the jobs controller,
|
326
|
-
# because managed jobs are serverless, there can be many of them,
|
327
|
-
# and users tend to continuously monitor managed jobs using `sky
|
328
|
-
# job queue`. Also don't print if this job is a skyserve controller
|
329
|
-
# job or launched by a skyserve controller job, because the
|
330
|
-
# redirect for this subprocess.run won't success and it will
|
331
|
-
# pollute the controller logs.
|
332
|
-
#
|
333
|
-
# Disable the usage collection for this status command.
|
334
|
-
env = dict(os.environ,
|
335
|
-
**{env_options.Options.DISABLE_LOGGING.value: '1'})
|
336
|
-
subprocess_utils.run(
|
337
|
-
'sky status --no-show-managed-jobs --no-show-services', env=env)
|
338
370
|
print()
|
339
371
|
print('\x1b[?25h', end='') # Show cursor.
|
340
372
|
return job_id, handle
|
@@ -351,19 +383,19 @@ def launch(
|
|
351
383
|
down: bool = False,
|
352
384
|
stream_logs: bool = True,
|
353
385
|
backend: Optional[backends.Backend] = None,
|
354
|
-
optimize_target:
|
355
|
-
detach_setup: bool = False,
|
356
|
-
detach_run: bool = False,
|
386
|
+
optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
|
357
387
|
no_setup: bool = False,
|
358
388
|
clone_disk_from: Optional[str] = None,
|
389
|
+
fast: bool = False,
|
359
390
|
# Internal only:
|
360
391
|
# pylint: disable=invalid-name
|
392
|
+
_quiet_optimizer: bool = False,
|
361
393
|
_is_launched_by_jobs_controller: bool = False,
|
362
394
|
_is_launched_by_sky_serve_controller: bool = False,
|
363
395
|
_disable_controller_check: bool = False,
|
364
396
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
365
397
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
366
|
-
"""
|
398
|
+
"""Launches a cluster or task.
|
367
399
|
|
368
400
|
The task's setup and run commands are executed under the task's workdir
|
369
401
|
(when specified, it is synced to remote cluster). The task undergoes job
|
@@ -373,6 +405,16 @@ def launch(
|
|
373
405
|
usage) a sky.Dag. In the latter case, currently it must contain a single
|
374
406
|
task; support for pipelines/general DAGs is in experimental branches.
|
375
407
|
|
408
|
+
Example:
|
409
|
+
.. code-block:: python
|
410
|
+
|
411
|
+
import sky
|
412
|
+
task = sky.Task(run='echo hello SkyPilot')
|
413
|
+
task.set_resources(
|
414
|
+
sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
|
415
|
+
sky.launch(task, cluster_name='my-cluster')
|
416
|
+
|
417
|
+
|
376
418
|
Args:
|
377
419
|
task: sky.Task, or sky.Dag (experimental; 1-task only) to launch.
|
378
420
|
cluster_name: name of the cluster to create/reuse. If None,
|
@@ -384,7 +426,7 @@ def launch(
|
|
384
426
|
cluster's job queue. Idleness gets reset whenever setting-up/
|
385
427
|
running/pending jobs are found in the job queue. Setting this
|
386
428
|
flag is equivalent to running
|
387
|
-
``sky.launch(
|
429
|
+
``sky.launch(...)`` and then
|
388
430
|
``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
|
389
431
|
will not be autostopped.
|
390
432
|
down: Tear down the cluster after all jobs finish (successfully or
|
@@ -398,27 +440,12 @@ def launch(
|
|
398
440
|
(CloudVMRayBackend).
|
399
441
|
optimize_target: target to optimize for. Choices: OptimizeTarget.COST,
|
400
442
|
OptimizeTarget.TIME.
|
401
|
-
detach_setup: If True, run setup in non-interactive mode as part of the
|
402
|
-
job itself. You can safely ctrl-c to detach from logging, and it
|
403
|
-
will not interrupt the setup process. To see the logs again after
|
404
|
-
detaching, use `sky logs`. To cancel setup, cancel the job via
|
405
|
-
`sky cancel`. Useful for long-running setup
|
406
|
-
commands.
|
407
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
408
|
-
function and do not stream execution logs.
|
409
443
|
no_setup: if True, do not re-run setup commands.
|
410
444
|
clone_disk_from: [Experimental] if set, clone the disk from the
|
411
445
|
specified cluster. This is useful to migrate the cluster to a
|
412
446
|
different availability zone or region.
|
413
|
-
|
414
|
-
|
415
|
-
.. code-block:: python
|
416
|
-
|
417
|
-
import sky
|
418
|
-
task = sky.Task(run='echo hello SkyPilot')
|
419
|
-
task.set_resources(
|
420
|
-
sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
|
421
|
-
sky.launch(task, cluster_name='my-cluster')
|
447
|
+
fast: [Experimental] If the cluster is already up and available,
|
448
|
+
skip provisioning and setup steps.
|
422
449
|
|
423
450
|
Raises:
|
424
451
|
exceptions.ClusterOwnerIdentityMismatchError: if the cluster is
|
@@ -448,26 +475,78 @@ def launch(
|
|
448
475
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
449
476
|
if dryrun.
|
450
477
|
"""
|
478
|
+
|
451
479
|
entrypoint = task
|
480
|
+
entrypoint.validate()
|
452
481
|
if not _disable_controller_check:
|
453
482
|
controller_utils.check_cluster_name_not_controller(
|
454
483
|
cluster_name, operation_str='sky.launch')
|
455
484
|
|
485
|
+
handle = None
|
486
|
+
stages = None
|
487
|
+
skip_unnecessary_provisioning = False
|
488
|
+
# Check if cluster exists and we are doing fast provisioning
|
489
|
+
if fast and cluster_name is not None:
|
490
|
+
cluster_status, maybe_handle = (
|
491
|
+
backend_utils.refresh_cluster_status_handle(cluster_name))
|
492
|
+
if cluster_status == status_lib.ClusterStatus.INIT:
|
493
|
+
# If the cluster is INIT, it may be provisioning. We want to prevent
|
494
|
+
# concurrent calls from queueing up many sequential reprovision
|
495
|
+
# attempts. Since provisioning will hold the cluster status lock, we
|
496
|
+
# wait to hold that lock by force refreshing the status. This will
|
497
|
+
# block until the cluster finishes provisioning, then correctly see
|
498
|
+
# that it is UP.
|
499
|
+
# TODO(cooperc): If multiple processes launched in parallel see that
|
500
|
+
# the cluster is STOPPED or does not exist, they will still all try
|
501
|
+
# to provision it, since we do not hold the lock continuously from
|
502
|
+
# the status check until the provision call. Fixing this requires a
|
503
|
+
# bigger refactor.
|
504
|
+
cluster_status, maybe_handle = (
|
505
|
+
backend_utils.refresh_cluster_status_handle(
|
506
|
+
cluster_name,
|
507
|
+
force_refresh_statuses=[
|
508
|
+
# If the cluster is INIT, we want to try to grab the
|
509
|
+
# status lock, which should block until provisioning is
|
510
|
+
# finished.
|
511
|
+
status_lib.ClusterStatus.INIT,
|
512
|
+
],
|
513
|
+
# Wait indefinitely to obtain the lock, so that we don't
|
514
|
+
# have multiple processes launching the same cluster at
|
515
|
+
# once.
|
516
|
+
cluster_status_lock_timeout=-1,
|
517
|
+
))
|
518
|
+
if cluster_status == status_lib.ClusterStatus.UP:
|
519
|
+
handle = maybe_handle
|
520
|
+
stages = [
|
521
|
+
# Provisioning will be short-circuited if the existing
|
522
|
+
# cluster config hash matches the calculated one.
|
523
|
+
Stage.PROVISION,
|
524
|
+
Stage.SYNC_WORKDIR,
|
525
|
+
Stage.SYNC_FILE_MOUNTS,
|
526
|
+
Stage.PRE_EXEC,
|
527
|
+
Stage.EXEC,
|
528
|
+
Stage.DOWN,
|
529
|
+
]
|
530
|
+
skip_unnecessary_provisioning = True
|
531
|
+
|
456
532
|
return _execute(
|
457
533
|
entrypoint=entrypoint,
|
458
534
|
dryrun=dryrun,
|
459
535
|
down=down,
|
460
536
|
stream_logs=stream_logs,
|
461
|
-
handle=
|
537
|
+
handle=handle,
|
462
538
|
backend=backend,
|
463
539
|
retry_until_up=retry_until_up,
|
464
540
|
optimize_target=optimize_target,
|
541
|
+
stages=stages,
|
465
542
|
cluster_name=cluster_name,
|
466
|
-
detach_setup=
|
467
|
-
detach_run=
|
543
|
+
detach_setup=True,
|
544
|
+
detach_run=True,
|
468
545
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
469
546
|
no_setup=no_setup,
|
470
547
|
clone_disk_from=clone_disk_from,
|
548
|
+
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
549
|
+
_quiet_optimizer=_quiet_optimizer,
|
471
550
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
472
551
|
_is_launched_by_sky_serve_controller=
|
473
552
|
_is_launched_by_sky_serve_controller,
|
@@ -482,10 +561,9 @@ def exec( # pylint: disable=redefined-builtin
|
|
482
561
|
down: bool = False,
|
483
562
|
stream_logs: bool = True,
|
484
563
|
backend: Optional[backends.Backend] = None,
|
485
|
-
detach_run: bool = False,
|
486
564
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
487
565
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
488
|
-
"""
|
566
|
+
"""Executes a task on an existing cluster.
|
489
567
|
|
490
568
|
This function performs two actions:
|
491
569
|
|
@@ -520,12 +598,11 @@ def exec( # pylint: disable=redefined-builtin
|
|
520
598
|
stream_logs: if True, show the logs in the terminal.
|
521
599
|
backend: backend to use. If None, use the default backend
|
522
600
|
(CloudVMRayBackend).
|
523
|
-
detach_run: if True, detach from logging once the task has been
|
524
|
-
submitted.
|
525
601
|
|
526
602
|
Raises:
|
527
|
-
ValueError: if the specified cluster
|
528
|
-
|
603
|
+
ValueError: if the specified cluster is not in UP status.
|
604
|
+
sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
|
605
|
+
exist.
|
529
606
|
sky.exceptions.NotSupportedError: if the specified cluster is a
|
530
607
|
controller that does not support this operation.
|
531
608
|
|
@@ -537,11 +614,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
537
614
|
if dryrun.
|
538
615
|
"""
|
539
616
|
entrypoint = task
|
540
|
-
|
541
|
-
logger.warning(
|
542
|
-
f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is '
|
543
|
-
'deprecated. Pass sky.Task instead.'
|
544
|
-
f'{colorama.Style.RESET_ALL}')
|
617
|
+
entrypoint.validate(workdir_only=True)
|
545
618
|
controller_utils.check_cluster_name_not_controller(cluster_name,
|
546
619
|
operation_str='sky.exec')
|
547
620
|
|
@@ -562,5 +635,5 @@ def exec( # pylint: disable=redefined-builtin
|
|
562
635
|
Stage.EXEC,
|
563
636
|
],
|
564
637
|
cluster_name=cluster_name,
|
565
|
-
detach_run=
|
638
|
+
detach_run=True,
|
566
639
|
)
|