skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/core.py
DELETED
@@ -1,330 +0,0 @@
|
|
1
|
-
"""SDK functions for managed jobs."""
|
2
|
-
import os
|
3
|
-
import tempfile
|
4
|
-
from typing import Any, Dict, List, Optional, Union
|
5
|
-
import uuid
|
6
|
-
|
7
|
-
import colorama
|
8
|
-
|
9
|
-
import sky
|
10
|
-
from sky import backends
|
11
|
-
from sky import exceptions
|
12
|
-
from sky import sky_logging
|
13
|
-
from sky import status_lib
|
14
|
-
from sky import task as task_lib
|
15
|
-
from sky.backends import backend_utils
|
16
|
-
from sky.clouds.service_catalog import common as service_catalog_common
|
17
|
-
from sky.jobs import constants as managed_job_constants
|
18
|
-
from sky.jobs import utils as managed_job_utils
|
19
|
-
from sky.skylet import constants as skylet_constants
|
20
|
-
from sky.usage import usage_lib
|
21
|
-
from sky.utils import common_utils
|
22
|
-
from sky.utils import controller_utils
|
23
|
-
from sky.utils import dag_utils
|
24
|
-
from sky.utils import rich_utils
|
25
|
-
from sky.utils import subprocess_utils
|
26
|
-
from sky.utils import ux_utils
|
27
|
-
|
28
|
-
|
29
|
-
@usage_lib.entrypoint
|
30
|
-
def launch(
|
31
|
-
task: Union['sky.Task', 'sky.Dag'],
|
32
|
-
name: Optional[str] = None,
|
33
|
-
stream_logs: bool = True,
|
34
|
-
detach_run: bool = False,
|
35
|
-
retry_until_up: bool = False,
|
36
|
-
) -> None:
|
37
|
-
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
38
|
-
"""Launch a managed job.
|
39
|
-
|
40
|
-
Please refer to sky.cli.job_launch for documentation.
|
41
|
-
|
42
|
-
Args:
|
43
|
-
task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
|
44
|
-
managed job.
|
45
|
-
name: Name of the managed job.
|
46
|
-
detach_run: Whether to detach the run.
|
47
|
-
|
48
|
-
Raises:
|
49
|
-
ValueError: cluster does not exist. Or, the entrypoint is not a valid
|
50
|
-
chain dag.
|
51
|
-
sky.exceptions.NotSupportedError: the feature is not supported.
|
52
|
-
"""
|
53
|
-
entrypoint = task
|
54
|
-
dag_uuid = str(uuid.uuid4().hex[:4])
|
55
|
-
|
56
|
-
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
57
|
-
if not dag.is_chain():
|
58
|
-
with ux_utils.print_exception_no_traceback():
|
59
|
-
raise ValueError('Only single-task or chain DAG is '
|
60
|
-
f'allowed for job_launch. Dag: {dag}')
|
61
|
-
|
62
|
-
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
63
|
-
|
64
|
-
task_names = set()
|
65
|
-
for task_ in dag.tasks:
|
66
|
-
if task_.name in task_names:
|
67
|
-
with ux_utils.print_exception_no_traceback():
|
68
|
-
raise ValueError(
|
69
|
-
f'Task name {task_.name!r} is duplicated in the DAG. '
|
70
|
-
'Either change task names to be unique, or specify the DAG '
|
71
|
-
'name only and comment out the task names (so that they '
|
72
|
-
'will be auto-generated) .')
|
73
|
-
task_names.add(task_.name)
|
74
|
-
|
75
|
-
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
76
|
-
|
77
|
-
for task_ in dag.tasks:
|
78
|
-
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
79
|
-
task_, path='jobs')
|
80
|
-
|
81
|
-
with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
|
82
|
-
mode='w') as f:
|
83
|
-
dag_utils.dump_chain_dag_to_yaml(dag, f.name)
|
84
|
-
controller = controller_utils.Controllers.JOBS_CONTROLLER
|
85
|
-
controller_name = controller.value.cluster_name
|
86
|
-
prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
|
87
|
-
remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
|
88
|
-
remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
|
89
|
-
controller_resources = controller_utils.get_controller_resources(
|
90
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
91
|
-
task_resources=sum([list(t.resources) for t in dag.tasks], []))
|
92
|
-
|
93
|
-
vars_to_fill = {
|
94
|
-
'remote_user_yaml_path': remote_user_yaml_path,
|
95
|
-
'user_yaml_path': f.name,
|
96
|
-
'jobs_controller': controller_name,
|
97
|
-
# Note: actual cluster name will be <task.name>-<managed job ID>
|
98
|
-
'dag_name': dag.name,
|
99
|
-
'retry_until_up': retry_until_up,
|
100
|
-
'remote_user_config_path': remote_user_config_path,
|
101
|
-
'modified_catalogs':
|
102
|
-
service_catalog_common.get_modified_catalog_file_mounts(),
|
103
|
-
**controller_utils.shared_controller_vars_to_fill(
|
104
|
-
controller_utils.Controllers.JOBS_CONTROLLER,
|
105
|
-
remote_user_config_path=remote_user_config_path,
|
106
|
-
),
|
107
|
-
}
|
108
|
-
|
109
|
-
yaml_path = os.path.join(
|
110
|
-
managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
|
111
|
-
f'{name}-{dag_uuid}.yaml')
|
112
|
-
common_utils.fill_template(
|
113
|
-
managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
|
114
|
-
vars_to_fill,
|
115
|
-
output_path=yaml_path)
|
116
|
-
controller_task = task_lib.Task.from_yaml(yaml_path)
|
117
|
-
controller_task.set_resources(controller_resources)
|
118
|
-
|
119
|
-
controller_task.managed_job_dag = dag
|
120
|
-
assert len(controller_task.resources) == 1, controller_task
|
121
|
-
|
122
|
-
sky_logging.print(
|
123
|
-
f'{colorama.Fore.YELLOW}'
|
124
|
-
f'Launching managed job {dag.name!r} from jobs controller...'
|
125
|
-
f'{colorama.Style.RESET_ALL}')
|
126
|
-
sky_logging.print('Launching jobs controller...')
|
127
|
-
sky.launch(task=controller_task,
|
128
|
-
stream_logs=stream_logs,
|
129
|
-
cluster_name=controller_name,
|
130
|
-
detach_run=detach_run,
|
131
|
-
idle_minutes_to_autostop=skylet_constants.
|
132
|
-
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
|
133
|
-
retry_until_up=True,
|
134
|
-
_disable_controller_check=True)
|
135
|
-
|
136
|
-
|
137
|
-
@usage_lib.entrypoint
|
138
|
-
def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
|
139
|
-
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
140
|
-
"""Get statuses of managed jobs.
|
141
|
-
|
142
|
-
Please refer to sky.cli.job_queue for documentation.
|
143
|
-
|
144
|
-
Returns:
|
145
|
-
[
|
146
|
-
{
|
147
|
-
'job_id': int,
|
148
|
-
'job_name': str,
|
149
|
-
'resources': str,
|
150
|
-
'submitted_at': (float) timestamp of submission,
|
151
|
-
'end_at': (float) timestamp of end,
|
152
|
-
'duration': (float) duration in seconds,
|
153
|
-
'recovery_count': (int) Number of retries,
|
154
|
-
'status': (sky.jobs.ManagedJobStatus) of the job,
|
155
|
-
'cluster_resources': (str) resources of the cluster,
|
156
|
-
'region': (str) region of the cluster,
|
157
|
-
}
|
158
|
-
]
|
159
|
-
Raises:
|
160
|
-
sky.exceptions.ClusterNotUpError: the jobs controller is not up or
|
161
|
-
does not exist.
|
162
|
-
RuntimeError: if failed to get the managed jobs with ssh.
|
163
|
-
"""
|
164
|
-
jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
|
165
|
-
stopped_message = ''
|
166
|
-
if not refresh:
|
167
|
-
stopped_message = 'No in-progress managed jobs.'
|
168
|
-
try:
|
169
|
-
handle = backend_utils.is_controller_accessible(
|
170
|
-
controller=jobs_controller_type, stopped_message=stopped_message)
|
171
|
-
except exceptions.ClusterNotUpError as e:
|
172
|
-
if not refresh:
|
173
|
-
raise
|
174
|
-
handle = None
|
175
|
-
controller_status = e.cluster_status
|
176
|
-
|
177
|
-
if refresh and handle is None:
|
178
|
-
sky_logging.print(f'{colorama.Fore.YELLOW}'
|
179
|
-
'Restarting controller for latest status...'
|
180
|
-
f'{colorama.Style.RESET_ALL}')
|
181
|
-
|
182
|
-
rich_utils.force_update_status(
|
183
|
-
'[cyan] Checking managed jobs - restarting '
|
184
|
-
'controller[/]')
|
185
|
-
handle = sky.start(jobs_controller_type.value.cluster_name)
|
186
|
-
controller_status = status_lib.ClusterStatus.UP
|
187
|
-
rich_utils.force_update_status('[cyan] Checking managed jobs[/]')
|
188
|
-
|
189
|
-
assert handle is not None, (controller_status, refresh)
|
190
|
-
|
191
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
192
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
193
|
-
|
194
|
-
code = managed_job_utils.ManagedJobCodeGen.get_job_table()
|
195
|
-
returncode, job_table_payload, stderr = backend.run_on_head(
|
196
|
-
handle,
|
197
|
-
code,
|
198
|
-
require_outputs=True,
|
199
|
-
stream_logs=False,
|
200
|
-
separate_stderr=True)
|
201
|
-
|
202
|
-
try:
|
203
|
-
subprocess_utils.handle_returncode(returncode,
|
204
|
-
code,
|
205
|
-
'Failed to fetch managed jobs',
|
206
|
-
job_table_payload + stderr,
|
207
|
-
stream_logs=False)
|
208
|
-
except exceptions.CommandError as e:
|
209
|
-
raise RuntimeError(str(e)) from e
|
210
|
-
|
211
|
-
jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
|
212
|
-
if skip_finished:
|
213
|
-
# Filter out the finished jobs. If a multi-task job is partially
|
214
|
-
# finished, we will include all its tasks.
|
215
|
-
non_finished_tasks = list(
|
216
|
-
filter(lambda job: not job['status'].is_terminal(), jobs))
|
217
|
-
non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
|
218
|
-
jobs = list(
|
219
|
-
filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
|
220
|
-
return jobs
|
221
|
-
|
222
|
-
|
223
|
-
@usage_lib.entrypoint
|
224
|
-
# pylint: disable=redefined-builtin
|
225
|
-
def cancel(name: Optional[str] = None,
|
226
|
-
job_ids: Optional[List[int]] = None,
|
227
|
-
all: bool = False) -> None:
|
228
|
-
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
229
|
-
"""Cancel managed jobs.
|
230
|
-
|
231
|
-
Please refer to sky.cli.job_cancel for documentation.
|
232
|
-
|
233
|
-
Raises:
|
234
|
-
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
235
|
-
RuntimeError: failed to cancel the job.
|
236
|
-
"""
|
237
|
-
job_ids = [] if job_ids is None else job_ids
|
238
|
-
handle = backend_utils.is_controller_accessible(
|
239
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
240
|
-
stopped_message='All managed jobs should have finished.')
|
241
|
-
|
242
|
-
job_id_str = ','.join(map(str, job_ids))
|
243
|
-
if sum([len(job_ids) > 0, name is not None, all]) != 1:
|
244
|
-
argument_str = f'job_ids={job_id_str}' if len(job_ids) > 0 else ''
|
245
|
-
argument_str += f' name={name}' if name is not None else ''
|
246
|
-
argument_str += ' all' if all else ''
|
247
|
-
with ux_utils.print_exception_no_traceback():
|
248
|
-
raise ValueError('Can only specify one of JOB_IDS or name or all. '
|
249
|
-
f'Provided {argument_str!r}.')
|
250
|
-
|
251
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
252
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
253
|
-
if all:
|
254
|
-
code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
|
255
|
-
elif job_ids:
|
256
|
-
code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
|
257
|
-
else:
|
258
|
-
assert name is not None, (job_ids, name, all)
|
259
|
-
code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
|
260
|
-
# The stderr is redirected to stdout
|
261
|
-
returncode, stdout, _ = backend.run_on_head(handle,
|
262
|
-
code,
|
263
|
-
require_outputs=True,
|
264
|
-
stream_logs=False)
|
265
|
-
try:
|
266
|
-
subprocess_utils.handle_returncode(returncode, code,
|
267
|
-
'Failed to cancel managed job',
|
268
|
-
stdout)
|
269
|
-
except exceptions.CommandError as e:
|
270
|
-
with ux_utils.print_exception_no_traceback():
|
271
|
-
raise RuntimeError(e.error_msg) from e
|
272
|
-
|
273
|
-
sky_logging.print(stdout)
|
274
|
-
if 'Multiple jobs found with name' in stdout:
|
275
|
-
with ux_utils.print_exception_no_traceback():
|
276
|
-
raise RuntimeError(
|
277
|
-
'Please specify the job ID instead of the job name.')
|
278
|
-
|
279
|
-
|
280
|
-
@usage_lib.entrypoint
|
281
|
-
def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
282
|
-
controller: bool) -> None:
|
283
|
-
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
284
|
-
"""Tail logs of managed jobs.
|
285
|
-
|
286
|
-
Please refer to sky.cli.job_logs for documentation.
|
287
|
-
|
288
|
-
Raises:
|
289
|
-
ValueError: invalid arguments.
|
290
|
-
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
291
|
-
"""
|
292
|
-
# TODO(zhwu): Automatically restart the jobs controller
|
293
|
-
jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
|
294
|
-
handle = backend_utils.is_controller_accessible(
|
295
|
-
controller=jobs_controller_type,
|
296
|
-
stopped_message=(
|
297
|
-
'Please restart the jobs controller with '
|
298
|
-
f'`sky start {jobs_controller_type.value.cluster_name}`.'))
|
299
|
-
|
300
|
-
if name is not None and job_id is not None:
|
301
|
-
raise ValueError('Cannot specify both name and job_id.')
|
302
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
303
|
-
assert isinstance(backend, backends.CloudVmRayBackend), backend
|
304
|
-
|
305
|
-
backend.tail_managed_job_logs(handle,
|
306
|
-
job_id=job_id,
|
307
|
-
job_name=name,
|
308
|
-
follow=follow,
|
309
|
-
controller=controller)
|
310
|
-
|
311
|
-
|
312
|
-
spot_launch = common_utils.deprecated_function(
|
313
|
-
launch,
|
314
|
-
name='sky.jobs.launch',
|
315
|
-
deprecated_name='spot_launch',
|
316
|
-
removing_version='0.8.0',
|
317
|
-
override_argument={'use_spot': True})
|
318
|
-
spot_queue = common_utils.deprecated_function(queue,
|
319
|
-
name='sky.jobs.queue',
|
320
|
-
deprecated_name='spot_queue',
|
321
|
-
removing_version='0.8.0')
|
322
|
-
spot_cancel = common_utils.deprecated_function(cancel,
|
323
|
-
name='sky.jobs.cancel',
|
324
|
-
deprecated_name='spot_cancel',
|
325
|
-
removing_version='0.8.0')
|
326
|
-
spot_tail_logs = common_utils.deprecated_function(
|
327
|
-
tail_logs,
|
328
|
-
name='sky.jobs.tail_logs',
|
329
|
-
deprecated_name='spot_tail_logs',
|
330
|
-
removing_version='0.8.0')
|
sky/skylet/providers/azure/azure-vm-template.json
DELETED
@@ -1,301 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
3
|
-
"contentVersion": "1.0.0.0",
|
4
|
-
"parameters": {
|
5
|
-
"vmName": {
|
6
|
-
"type": "string",
|
7
|
-
"metadata": {
|
8
|
-
"description": "The name of you Virtual Machine."
|
9
|
-
}
|
10
|
-
},
|
11
|
-
"adminUsername": {
|
12
|
-
"type": "string",
|
13
|
-
"metadata": {
|
14
|
-
"description": "Username for the Virtual Machine."
|
15
|
-
}
|
16
|
-
},
|
17
|
-
"publicKey": {
|
18
|
-
"type": "securestring",
|
19
|
-
"metadata": {
|
20
|
-
"description": "SSH Key for the Virtual Machine"
|
21
|
-
}
|
22
|
-
},
|
23
|
-
"imagePublisher": {
|
24
|
-
"type": "string",
|
25
|
-
"metadata": {
|
26
|
-
"description": "The publisher of the VM image"
|
27
|
-
}
|
28
|
-
},
|
29
|
-
"imageOffer": {
|
30
|
-
"type": "string",
|
31
|
-
"metadata": {
|
32
|
-
"description": "The offer of the VM image"
|
33
|
-
}
|
34
|
-
},
|
35
|
-
"imageSku": {
|
36
|
-
"type": "string",
|
37
|
-
"metadata": {
|
38
|
-
"description": "The sku of the VM image"
|
39
|
-
}
|
40
|
-
},
|
41
|
-
"imageVersion": {
|
42
|
-
"type": "string",
|
43
|
-
"metadata": {
|
44
|
-
"description": "The version of the VM image"
|
45
|
-
}
|
46
|
-
},
|
47
|
-
"vmSize": {
|
48
|
-
"type": "string",
|
49
|
-
"metadata": {
|
50
|
-
"description": "The size of the VM"
|
51
|
-
}
|
52
|
-
},
|
53
|
-
"vmTags": {
|
54
|
-
"type": "object",
|
55
|
-
"metadata": {
|
56
|
-
"description": "Tags for the VM"
|
57
|
-
}
|
58
|
-
},
|
59
|
-
"vmCount": {
|
60
|
-
"type": "int",
|
61
|
-
"metadata": {
|
62
|
-
"description": "Number of VMs to deploy"
|
63
|
-
}
|
64
|
-
},
|
65
|
-
"provisionPublicIp": {
|
66
|
-
"type": "bool",
|
67
|
-
"defaultValue": true,
|
68
|
-
"metadata": {
|
69
|
-
"description": "If true creates a public ip"
|
70
|
-
}
|
71
|
-
},
|
72
|
-
"priority": {
|
73
|
-
"type": "string",
|
74
|
-
"defaultValue": "Regular",
|
75
|
-
"metadata": {
|
76
|
-
"description": "Specifies the priority for the virtual machine."
|
77
|
-
}
|
78
|
-
},
|
79
|
-
"billingProfile": {
|
80
|
-
"type": "object",
|
81
|
-
"defaultValue": {},
|
82
|
-
"metadata": {
|
83
|
-
"description": "Specifies the maximum price to pay for Azure Spot VM."
|
84
|
-
}
|
85
|
-
},
|
86
|
-
"osDiskSizeGB": {
|
87
|
-
"type": "int",
|
88
|
-
"metadata": {
|
89
|
-
"description": "OS disk size in GBs."
|
90
|
-
}
|
91
|
-
},
|
92
|
-
"msi": {
|
93
|
-
"type": "string",
|
94
|
-
"metadata": {
|
95
|
-
"description": "Managed service identity resource id."
|
96
|
-
}
|
97
|
-
},
|
98
|
-
"nsg": {
|
99
|
-
"type": "string",
|
100
|
-
"metadata": {
|
101
|
-
"description": "Network security group resource id."
|
102
|
-
}
|
103
|
-
},
|
104
|
-
"subnet": {
|
105
|
-
"type": "string",
|
106
|
-
"metadata": {
|
107
|
-
"descriptions": "Subnet resource id."
|
108
|
-
}
|
109
|
-
},
|
110
|
-
"osDiskTier": {
|
111
|
-
"type": "string",
|
112
|
-
"allowedValues": [
|
113
|
-
"Premium_LRS",
|
114
|
-
"StandardSSD_LRS",
|
115
|
-
"Standard_LRS"
|
116
|
-
],
|
117
|
-
"metadata": {
|
118
|
-
"description": "OS disk tier."
|
119
|
-
}
|
120
|
-
},
|
121
|
-
"cloudInitSetupCommands": {
|
122
|
-
"type": "string",
|
123
|
-
"metadata": {
|
124
|
-
"description": "Base64 encoded cloud-init setup commands."
|
125
|
-
}
|
126
|
-
}
|
127
|
-
},
|
128
|
-
"variables": {
|
129
|
-
"location": "[resourceGroup().location]",
|
130
|
-
"networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
|
131
|
-
"networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
|
132
|
-
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
|
133
|
-
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
|
134
|
-
"publicIpAddressName": "[concat(parameters('vmName'), '-ip')]"
|
135
|
-
},
|
136
|
-
"resources": [
|
137
|
-
{
|
138
|
-
"type": "Microsoft.Network/networkInterfaces",
|
139
|
-
"apiVersion": "2020-06-01",
|
140
|
-
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
|
141
|
-
"location": "[variables('location')]",
|
142
|
-
"dependsOn": [
|
143
|
-
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
|
144
|
-
],
|
145
|
-
"copy": {
|
146
|
-
"name": "NICPublicCopy",
|
147
|
-
"count": "[parameters('vmCount')]"
|
148
|
-
},
|
149
|
-
"properties": {
|
150
|
-
"ipConfigurations": [
|
151
|
-
{
|
152
|
-
"name": "[variables('networkIpConfig')]",
|
153
|
-
"properties": {
|
154
|
-
"subnet": {
|
155
|
-
"id": "[parameters('subnet')]"
|
156
|
-
},
|
157
|
-
"privateIPAllocationMethod": "Dynamic",
|
158
|
-
"publicIpAddress": {
|
159
|
-
"id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
|
160
|
-
}
|
161
|
-
}
|
162
|
-
}
|
163
|
-
],
|
164
|
-
"networkSecurityGroup": {
|
165
|
-
"id": "[parameters('nsg')]"
|
166
|
-
}
|
167
|
-
},
|
168
|
-
"condition": "[parameters('provisionPublicIp')]"
|
169
|
-
},
|
170
|
-
{
|
171
|
-
"type": "Microsoft.Network/networkInterfaces",
|
172
|
-
"apiVersion": "2020-06-01",
|
173
|
-
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
|
174
|
-
"location": "[variables('location')]",
|
175
|
-
"copy": {
|
176
|
-
"name": "NICPrivateCopy",
|
177
|
-
"count": "[parameters('vmCount')]"
|
178
|
-
},
|
179
|
-
"properties": {
|
180
|
-
"ipConfigurations": [
|
181
|
-
{
|
182
|
-
"name": "[variables('networkIpConfig')]",
|
183
|
-
"properties": {
|
184
|
-
"subnet": {
|
185
|
-
"id": "[parameters('subnet')]"
|
186
|
-
},
|
187
|
-
"privateIPAllocationMethod": "Dynamic"
|
188
|
-
}
|
189
|
-
}
|
190
|
-
],
|
191
|
-
"networkSecurityGroup": {
|
192
|
-
"id": "[parameters('nsg')]"
|
193
|
-
}
|
194
|
-
},
|
195
|
-
"condition": "[not(parameters('provisionPublicIp'))]"
|
196
|
-
},
|
197
|
-
{
|
198
|
-
"type": "Microsoft.Network/publicIpAddresses",
|
199
|
-
"apiVersion": "2019-02-01",
|
200
|
-
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
|
201
|
-
"location": "[variables('location')]",
|
202
|
-
"properties": {
|
203
|
-
"publicIpAllocationMethod": "Static",
|
204
|
-
"publicIPAddressVersion": "IPv4"
|
205
|
-
},
|
206
|
-
"copy": {
|
207
|
-
"name": "PublicIpCopy",
|
208
|
-
"count": "[parameters('vmCount')]"
|
209
|
-
},
|
210
|
-
"sku": {
|
211
|
-
"name": "Basic",
|
212
|
-
"tier": "Regional"
|
213
|
-
},
|
214
|
-
"condition": "[parameters('provisionPublicIp')]"
|
215
|
-
},
|
216
|
-
{
|
217
|
-
"type": "Microsoft.Compute/virtualMachines",
|
218
|
-
"apiVersion": "2019-03-01",
|
219
|
-
"name": "[concat(parameters('vmName'), copyIndex())]",
|
220
|
-
"location": "[variables('location')]",
|
221
|
-
"dependsOn": [
|
222
|
-
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
|
223
|
-
],
|
224
|
-
"copy": {
|
225
|
-
"name": "VmCopy",
|
226
|
-
"count": "[parameters('vmCount')]"
|
227
|
-
},
|
228
|
-
"tags": "[parameters('vmTags')]",
|
229
|
-
"properties": {
|
230
|
-
"hardwareProfile": {
|
231
|
-
"vmSize": "[parameters('vmSize')]"
|
232
|
-
},
|
233
|
-
"storageProfile": {
|
234
|
-
"osDisk": {
|
235
|
-
"createOption": "fromImage",
|
236
|
-
"managedDisk": {
|
237
|
-
"storageAccountType": "[parameters('osDiskTier')]"
|
238
|
-
},
|
239
|
-
"diskSizeGB": "[parameters('osDiskSizeGB')]"
|
240
|
-
},
|
241
|
-
"imageReference": {
|
242
|
-
"publisher": "[parameters('imagePublisher')]",
|
243
|
-
"offer": "[parameters('imageOffer')]",
|
244
|
-
"sku": "[parameters('imageSku')]",
|
245
|
-
"version": "[parameters('imageVersion')]"
|
246
|
-
}
|
247
|
-
},
|
248
|
-
"networkProfile": {
|
249
|
-
"networkInterfaces": [
|
250
|
-
{
|
251
|
-
"id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
|
252
|
-
}
|
253
|
-
]
|
254
|
-
},
|
255
|
-
"osProfile": {
|
256
|
-
"computerName": "[concat(parameters('vmName'), copyIndex())]",
|
257
|
-
"adminUsername": "[parameters('adminUsername')]",
|
258
|
-
"adminPassword": "[parameters('publicKey')]",
|
259
|
-
"linuxConfiguration": {
|
260
|
-
"disablePasswordAuthentication": true,
|
261
|
-
"ssh": {
|
262
|
-
"publicKeys": [
|
263
|
-
{
|
264
|
-
"path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
|
265
|
-
"keyData": "[parameters('publicKey')]"
|
266
|
-
}
|
267
|
-
]
|
268
|
-
}
|
269
|
-
},
|
270
|
-
"customData": "[parameters('cloudInitSetupCommands')]"
|
271
|
-
},
|
272
|
-
"priority": "[parameters('priority')]",
|
273
|
-
"billingProfile": "[parameters('billingProfile')]"
|
274
|
-
},
|
275
|
-
"identity": {
|
276
|
-
"type": "UserAssigned",
|
277
|
-
"userAssignedIdentities": {
|
278
|
-
"[parameters('msi')]": {
|
279
|
-
}
|
280
|
-
}
|
281
|
-
}
|
282
|
-
}
|
283
|
-
],
|
284
|
-
"outputs": {
|
285
|
-
"publicIp": {
|
286
|
-
"type": "array",
|
287
|
-
"copy": {
|
288
|
-
"count": "[parameters('vmCount')]",
|
289
|
-
"input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
|
290
|
-
},
|
291
|
-
"condition": "[parameters('provisionPublicIp')]"
|
292
|
-
},
|
293
|
-
"privateIp": {
|
294
|
-
"type": "array",
|
295
|
-
"copy": {
|
296
|
-
"count": "[parameters('vmCount')]",
|
297
|
-
"input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
|
298
|
-
}
|
299
|
-
}
|
300
|
-
}
|
301
|
-
}
|