skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py
ADDED
@@ -0,0 +1,302 @@
|
|
1
|
+
"""SDK functions for managed jobs."""
|
2
|
+
import json
|
3
|
+
import typing
|
4
|
+
from typing import Dict, List, Optional, Union
|
5
|
+
import webbrowser
|
6
|
+
|
7
|
+
import click
|
8
|
+
import requests
|
9
|
+
|
10
|
+
from sky import sky_logging
|
11
|
+
from sky.client import common as client_common
|
12
|
+
from sky.client import sdk
|
13
|
+
from sky.server import common as server_common
|
14
|
+
from sky.server.requests import payloads
|
15
|
+
from sky.skylet import constants
|
16
|
+
from sky.usage import usage_lib
|
17
|
+
from sky.utils import common_utils
|
18
|
+
from sky.utils import dag_utils
|
19
|
+
|
20
|
+
if typing.TYPE_CHECKING:
|
21
|
+
import io
|
22
|
+
|
23
|
+
import sky
|
24
|
+
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def launch(
    task: Union['sky.Task', 'sky.Dag'],
    name: Optional[str] = None,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
) -> server_common.RequestId:
    """Launches a managed job.

    Please refer to sky.cli.job_launch for documentation.

    The entrypoint is converted to a DAG and validated through the client SDK
    before anything is sent. The DAG is then dumped to a YAML string and
    POSTed to the API server's ``/jobs/launch`` endpoint.

    Args:
        task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
          managed job.
        name: Name of the managed job.
        _need_confirmation: (Internal only) Whether to show a confirmation
          prompt before launching the job. When set, an optimize request is
          issued and streamed first, then the user is asked to confirm.

    Returns:
        The request ID of the launch request.

    Request Returns:
        job_id (Optional[int]): Job ID for the managed job
        controller_handle (Optional[ResourceHandle]): ResourceHandle of the
          controller

    Request Raises:
        ValueError: cluster does not exist. Or, the entrypoint is not a valid
          chain dag.
        sky.exceptions.NotSupportedError: the feature is not supported.
    """
    dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(dag)
    if _need_confirmation:
        request_id = sdk.optimize(dag)
        sdk.stream_and_get(request_id)
        # abort=True makes click raise click.Abort when the user declines, so
        # nothing below runs in that case.
        click.confirm(f'Launching a managed job {dag.name!r}. Proceed?',
                      default=True,
                      abort=True,
                      show_default=True)

    dag = client_common.upload_mounts_to_api_server(dag)
    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
    body = payloads.JobsLaunchBody(
        task=dag_str,
        name=name,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/jobs/launch',
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no limit on reads.
        timeout=(5, None),
    )
    return server_common.get_request_id(response)
|
83
|
+
|
84
|
+
|
85
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def queue(refresh: bool,
          skip_finished: bool = False,
          all_users: bool = False) -> server_common.RequestId:
    """Requests the statuses of managed jobs from the API server.

    Please refer to sky.cli.job_queue for documentation.

    Args:
        refresh: Whether to restart the jobs controller if it is stopped.
        skip_finished: Whether to skip finished jobs.
        all_users: Whether to show all users' jobs.

    Returns:
        The request ID of the queue request.

    Request Returns:
        job_records (List[Dict[str, Any]]): A list of dicts, with each dict
          containing the information of a job.

          .. code-block:: python

            [
              {
                'job_id': (int) job id,
                'job_name': (str) job name,
                'resources': (str) resources of the job,
                'submitted_at': (float) timestamp of submission,
                'end_at': (float) timestamp of end,
                'duration': (float) duration in seconds,
                'recovery_count': (int) Number of retries,
                'status': (sky.jobs.ManagedJobStatus) of the job,
                'cluster_resources': (str) resources of the cluster,
                'region': (str) region of the cluster,
              }
            ]

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
          does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
    payload = payloads.JobsQueueBody(refresh=refresh,
                                     skip_finished=skip_finished,
                                     all_users=all_users)
    endpoint = f'{server_common.get_server_url()}/jobs/queue'
    # Serialize via the model's JSON dump, then parse it back into plain
    # Python objects for `requests` to send.
    json_payload = json.loads(payload.model_dump_json())
    # 5s connect timeout, unbounded read timeout.
    resp = requests.post(endpoint, json=json_payload, timeout=(5, None))
    return server_common.get_request_id(response=resp)
|
139
|
+
|
140
|
+
|
141
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def cancel(
    name: Optional[str] = None,
    job_ids: Optional[List[int]] = None,
    all: bool = False,  # pylint: disable=redefined-builtin
    all_users: bool = False,
) -> server_common.RequestId:
    """Submits a request to cancel managed jobs.

    Please refer to sky.cli.job_cancel for documentation.

    Args:
        name: Name of the managed job to cancel.
        job_ids: IDs of the managed jobs to cancel.
        all: Whether to cancel all managed jobs.
        all_users: Whether to cancel all managed jobs from all users.

    Returns:
        The request ID of the cancel request.

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
        RuntimeError: failed to cancel the job.
    """
    payload = payloads.JobsCancelBody(name=name,
                                      job_ids=job_ids,
                                      all=all,
                                      all_users=all_users)
    endpoint = f'{server_common.get_server_url()}/jobs/cancel'
    json_payload = json.loads(payload.model_dump_json())
    # 5s connect timeout, unbounded read timeout.
    resp = requests.post(endpoint, json=json_payload, timeout=(5, None))
    return server_common.get_request_id(response=resp)
|
178
|
+
|
179
|
+
|
180
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def tail_logs(name: Optional[str] = None,
              job_id: Optional[int] = None,
              follow: bool = True,
              controller: bool = False,
              refresh: bool = False,
              output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Streams the logs of a managed job.

    You can provide either a job name or a job ID to tail logs. If both are not
    provided, the logs of the latest job will be shown.

    Args:
        name: Name of the managed job to tail logs.
        job_id: ID of the managed job to tail logs.
        follow: Whether to follow the logs.
        controller: Whether to tail logs from the jobs controller.
        refresh: Whether to restart the jobs controller if it is stopped.
        output_stream: The stream to write the logs to. If None, print to the
          console.

    Request Raises:
        ValueError: invalid arguments.
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
    """
    payload = payloads.JobsLogsBody(name=name,
                                    job_id=job_id,
                                    follow=follow,
                                    controller=controller,
                                    refresh=refresh)
    endpoint = f'{server_common.get_server_url()}/jobs/logs'
    json_payload = json.loads(payload.model_dump_json())
    # stream=True defers downloading the response body, so it can be consumed
    # incrementally; the read timeout is unbounded for the same reason.
    resp = requests.post(endpoint,
                         json=json_payload,
                         stream=True,
                         timeout=(5, None))
    sdk.stream_response(server_common.get_request_id(resp), resp,
                        output_stream)
|
221
|
+
|
222
|
+
|
223
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def download_logs(
        name: Optional[str],
        job_id: Optional[int],
        refresh: bool,
        controller: bool,
        local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[int, str]:
    """Downloads the logs of managed jobs to the local machine.

    Please refer to sky.cli.job_logs for documentation.

    Args:
        name: Name of the managed job to sync down logs.
        job_id: ID of the managed job to sync down logs.
        refresh: Whether to restart the jobs controller if it is stopped.
        controller: Whether to sync down logs from the jobs controller.
        local_dir: Local directory to sync down logs.

    Returns:
        A dictionary mapping job ID to the local path.

    Request Raises:
        ValueError: invalid arguments.
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
    """
    payload = payloads.JobsDownloadLogsBody(name=name,
                                            job_id=job_id,
                                            refresh=refresh,
                                            controller=controller,
                                            local_dir=local_dir)
    endpoint = f'{server_common.get_server_url()}/jobs/download_logs'
    json_payload = json.loads(payload.model_dump_json())
    resp = requests.post(endpoint, json=json_payload, timeout=(5, None))

    # The request result maps each job ID to a path on the API server side.
    remote_paths_by_job = sdk.stream_and_get(
        server_common.get_request_id(resp))
    # Fetch those paths; the helper returns a remote-path -> local-path map.
    local_path_of = client_common.download_logs_from_api_server(
        remote_paths_by_job.values())

    local_paths_by_job = {}
    for jid, remote_path in remote_paths_by_job.items():
        local_paths_by_job[jid] = local_path_of[remote_path]
    return local_paths_by_job
|
270
|
+
|
271
|
+
|
272
|
+
# Deprecated `spot_*` aliases for the managed jobs SDK functions. Each one is
# produced by `common_utils.deprecated_function`, which is given the current
# function, its new public name, the deprecated name, and the version in which
# the alias is to be removed.
spot_launch = common_utils.deprecated_function(
    launch,
    name='sky.jobs.launch',
    deprecated_name='spot_launch',
    removing_version='0.8.0',
    override_argument={'use_spot': True},
)
spot_queue = common_utils.deprecated_function(
    queue,
    name='sky.jobs.queue',
    deprecated_name='spot_queue',
    removing_version='0.8.0',
)
spot_cancel = common_utils.deprecated_function(
    cancel,
    name='sky.jobs.cancel',
    deprecated_name='spot_cancel',
    removing_version='0.8.0',
)
spot_tail_logs = common_utils.deprecated_function(
    tail_logs,
    name='sky.jobs.tail_logs',
    deprecated_name='spot_tail_logs',
    removing_version='0.8.0',
)
|
291
|
+
|
292
|
+
|
293
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def dashboard() -> None:
    """Opens the managed jobs dashboard of the API server in a web browser.

    The URL is the API server URL followed by ``/jobs/dashboard``, with the
    current user hash passed as the ``user_hash`` query parameter.
    """
    uid = common_utils.get_user_hash()
    base_url = server_common.get_server_url()
    url = f'{base_url}/jobs/dashboard?user_hash={uid}'
    logger.info(f'Opening dashboard in browser: {url}')
    webbrowser.open(url)
|
sky/jobs/constants.py
CHANGED
@@ -1,27 +1,65 @@
|
|
1
1
|
"""Constants used for Managed Jobs."""
|
2
|
+
from typing import Dict, Union
|
3
|
+
|
4
|
+
from sky.skylet import constants as skylet_constants
|
2
5
|
|
3
6
|
JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
|
4
7
|
JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
|
8
|
+
JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
|
5
9
|
|
6
10
|
JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
7
11
|
|
8
12
|
# Resources as a dict for the jobs controller.
|
9
|
-
# Use
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
13
|
+
# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
|
14
|
+
# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
|
15
|
+
# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
|
16
|
+
# Concurrency limits are set based on profiling. 4x num vCPUs is the launch
|
17
|
+
# parallelism limit, and memory / 350MB is the limit to concurrently running
|
18
|
+
# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
|
15
19
|
# We use 50 GB disk size to reduce the cost.
|
16
|
-
CONTROLLER_RESOURCES
|
20
|
+
# Default resource request for the jobs controller: at least 4 vCPUs, a
# memory request of '8x', and a disk size of 50.
CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = dict(
    cpus='4+',
    memory='8x',
    disk_size=50,
)
|
17
25
|
|
26
|
+
# TODO(zhwu): This is no longer accurate, after #4592, which increases the
|
27
|
+
# length of user hash appended to the cluster name from 4 to 8 chars. This makes
|
28
|
+
# the cluster name on GCP being wrapped twice. However, we cannot directly
|
29
|
+
# update this constant, because the job cluster cleanup and many other logic
|
30
|
+
# in managed jobs depends on this constant, i.e., updating this constant will
|
31
|
+
# break backward compatibility and existing jobs.
|
32
|
+
#
|
18
33
|
# Max length of the cluster name for GCP is 35, the user hash to be attached is
|
19
|
-
# 4+1 chars, and we assume the maximum length of the job id is
|
20
|
-
# length of the cluster name prefix is 25
|
21
|
-
# long and truncated twice during the
|
34
|
+
# 4(now 8)+1 chars, and we assume the maximum length of the job id is
|
35
|
+
# 4(now 8)+1, so the max length of the cluster name prefix is 25(should be 21
|
36
|
+
# now) to avoid the cluster name being too long and truncated twice during the
|
37
|
+
# cluster creation.
|
22
38
|
JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
23
39
|
|
24
40
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
25
41
|
# change for the jobs/utils, we need to bump this version and update
|
26
42
|
# jobs.utils.ManagedJobCodeGen to handle the version update.
|
27
|
-
MANAGED_JOBS_VERSION =
|
43
|
+
MANAGED_JOBS_VERSION = 2
|
44
|
+
|
45
|
+
# The command for setting up the jobs dashboard on the controller. It first
|
46
|
+
# checks if the systemd services are available, and if not (e.g., Kubernetes
|
47
|
+
# containers may not have systemd), it starts the dashboard manually.
|
48
|
+
DASHBOARD_SETUP_CMD = (
    # Use systemd only when `systemctl` exists AND a user-level systemd
    # instance responds to `systemctl --user show`.
    # NOTE(review): `&>` is a bash redirection; this presumably runs under
    # bash -- confirm against the caller.
    'if command -v systemctl &>/dev/null && systemctl --user show &>/dev/null; '
    'then '
    ' systemctl --user daemon-reload; '
    ' systemctl --user enable --now skypilot-dashboard; '
    'else '
    ' echo "Systemd services not found. Starting SkyPilot dashboard '
    'manually."; '
    # Kill any old dashboard processes. `|| true` keeps the overall command
    # from failing when the pipeline exits non-zero (e.g. nothing to kill).
    ' ps aux | grep -v nohup | grep -v grep | '
    ' grep -- \'-m sky.jobs.dashboard.dashboard\' | awk \'{print $2}\' | '
    ' xargs kill > /dev/null 2>&1 || true;'
    # Launch the dashboard in the background if not already running. Its
    # output is appended to ~/.sky/job-dashboard.log.
    ' (ps aux | grep -v nohup | grep -v grep | '
    ' grep -q -- \'-m sky.jobs.dashboard.dashboard\') || '
    f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
    '>> ~/.sky/job-dashboard.log 2>&1 &); '
    'fi')
|