skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/subprocess_utils.py
CHANGED
@@ -2,21 +2,26 @@
|
|
2
2
|
from multiprocessing import pool
|
3
3
|
import os
|
4
4
|
import random
|
5
|
+
import resource
|
6
|
+
import shlex
|
5
7
|
import subprocess
|
6
8
|
import time
|
7
|
-
from typing import Any, Callable,
|
9
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
8
10
|
|
9
11
|
import colorama
|
10
12
|
import psutil
|
11
13
|
|
12
14
|
from sky import exceptions
|
13
15
|
from sky import sky_logging
|
16
|
+
from sky.skylet import constants
|
14
17
|
from sky.skylet import log_lib
|
15
18
|
from sky.utils import timeline
|
16
19
|
from sky.utils import ux_utils
|
17
20
|
|
18
21
|
logger = sky_logging.init_logger(__name__)
|
19
22
|
|
23
|
+
_fd_limit_warning_shown = False
|
24
|
+
|
20
25
|
|
21
26
|
@timeline.event
|
22
27
|
def run(cmd, **kwargs):
|
@@ -42,27 +47,86 @@ def run_no_outputs(cmd, **kwargs):
|
|
42
47
|
**kwargs)
|
43
48
|
|
44
49
|
|
45
|
-
def
|
46
|
-
|
50
|
+
def _get_thread_multiplier(cloud_str: Optional[str] = None) -> int:
|
51
|
+
# If using Kubernetes, we use 4x the number of cores.
|
52
|
+
if cloud_str and cloud_str.lower() == 'kubernetes':
|
53
|
+
return 4
|
54
|
+
return 1
|
55
|
+
|
56
|
+
|
57
|
+
def get_max_workers_for_file_mounts(common_file_mounts: Dict[str, str],
                                    cloud_str: Optional[str] = None) -> int:
    """Estimate how many file-mount syncs can safely run in parallel.

    The estimate is bounded by two things: the soft limit on open file
    descriptors of the current process, and the thread count returned by
    get_parallel_threads(cloud_str).

    Args:
        common_file_mounts: Mapping whose values are local source paths.
            Directory sources raise the assumed per-sync descriptor cost.
        cloud_str: Cloud name forwarded to get_parallel_threads().

    Returns:
        The number of workers to use; always at least 1.
    """
    global _fd_limit_warning_shown
    fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    # RLIM_INFINITY can be reported as -1; it must not be mistaken for a
    # tiny limit, which would warn spuriously and force a single worker.
    fd_unlimited = fd_limit == resource.RLIM_INFINITY

    # Raise warning for low fd_limit (only once)
    if not fd_unlimited and fd_limit < 1024 and not _fd_limit_warning_shown:
        logger.warning(
            f'Open file descriptor limit ({fd_limit}) is low. File sync to '
            'remote clusters may be slow. Consider increasing the limit using '
            '`ulimit -n <number>` or modifying system limits.')
        _fd_limit_warning_shown = True

    fd_per_rsync = 5
    for src in common_file_mounts.values():
        if not os.path.isdir(src):
            continue
        try:
            num_entries = len(os.listdir(src))
        except OSError:
            # Unreadable or concurrently removed directory: this is only a
            # heuristic, so keep the current estimate instead of failing.
            continue
        # Assume that each file/folder under src takes 5 file descriptors
        # on average.
        fd_per_rsync = max(fd_per_rsync, num_entries * 5)

    # At least 1 worker, and avoid too many workers overloading the system.
    num_threads = get_parallel_threads(cloud_str)
    if fd_unlimited:
        # No descriptor budget to respect; only the thread cap applies.
        max_workers = num_threads
    else:
        # Reserve some file descriptors for the system and other processes
        fd_reserve = 100
        max_workers = (fd_limit - fd_reserve) // fd_per_rsync
        max_workers = min(max(max_workers, 1), num_threads)
    logger.debug(f'Using {max_workers} workers for file mounts.')
    return max_workers
|
86
|
+
|
87
|
+
|
88
|
+
def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
    """Returns the number of threads to use for parallel execution.

    The base count is ``max(4, cpu_count - 1)``, where ``cpu_count`` falls
    back to 1 when it cannot be determined. The base count is then scaled
    by ``_get_thread_multiplier(cloud_str)``.

    Args:
        cloud_str: Name of the target cloud; it only influences the
            multiplier applied to the base count.

    Returns:
        The thread count; never less than 4.
    """
    detected_cpus = os.cpu_count() or 1
    base_threads = max(4, detected_cpus - 1)
    return base_threads * _get_thread_multiplier(cloud_str)
|
51
98
|
|
52
99
|
|
53
|
-
def run_in_parallel(func: Callable,
|
100
|
+
def run_in_parallel(func: Callable,
                    args: List[Any],
                    num_threads: Optional[int] = None) -> List[Any]:
    """Run a function in parallel on a list of arguments.

    Args:
        func: The function to run in parallel. It is called with one
            element of ``args`` at a time.
        args: List of arguments to pass to func. It must support ``len()``
            and indexing.
        num_threads: Number of threads to use. If None, uses
            get_parallel_threads(). The pool never gets more threads than
            there are arguments.

    Returns:
        A list of the return values of the function func, in the same order
        as the arguments.

    Raises:
        Exception: The first exception encountered.
    """
    # Short-circuit for short lists: no pool is needed for 0 or 1 items.
    if len(args) == 0:
        return []
    if len(args) == 1:
        return [func(args[0])]

    processes = (num_threads
                 if num_threads is not None else get_parallel_threads())
    # Extra workers beyond the number of items would only sit idle, so do
    # not pay for starting them.
    processes = min(processes, len(args))

    with pool.ThreadPool(processes=processes) as p:
        # imap yields results in input order; the list must be fully
        # consumed before the pool is torn down on exit from the block.
        return list(p.imap(func, args))
|
66
130
|
|
67
131
|
|
68
132
|
def handle_returncode(returncode: int,
|
@@ -77,8 +141,9 @@ def handle_returncode(returncode: int,
|
|
77
141
|
command: The command that was run.
|
78
142
|
error_msg: The error message to print.
|
79
143
|
stderr: The stderr of the command.
|
144
|
+
stream_logs: Whether to stream logs.
|
80
145
|
"""
|
81
|
-
echo = logger.error if stream_logs else
|
146
|
+
echo = logger.error if stream_logs else logger.debug
|
82
147
|
if returncode != 0:
|
83
148
|
if stderr is not None:
|
84
149
|
echo(stderr)
|
@@ -92,9 +157,9 @@ def handle_returncode(returncode: int,
|
|
92
157
|
stderr)
|
93
158
|
|
94
159
|
|
95
|
-
def kill_children_processes(
|
96
|
-
|
97
|
-
|
160
|
+
def kill_children_processes(parent_pids: Optional[Union[
|
161
|
+
int, List[Optional[int]]]] = None,
|
162
|
+
force: bool = False) -> None:
|
98
163
|
"""Kill children processes recursively.
|
99
164
|
|
100
165
|
We need to kill the children, so that
|
@@ -104,41 +169,57 @@ def kill_children_processes(
|
|
104
169
|
etc. while we are cleaning up the clusters.
|
105
170
|
|
106
171
|
Args:
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
172
|
+
parent_pids: Optional PIDs of a series of processes. The processes and
|
173
|
+
their children will be killed. If a list of PID is specified, it is
|
174
|
+
killed by the order in the list. This is for guaranteeing the order
|
175
|
+
of cleaning up and suppress flaky errors.
|
176
|
+
force: bool, send SIGKILL if force, otherwise, use SIGTERM for
|
177
|
+
gracefully kill the process.
|
112
178
|
"""
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
179
|
+
if isinstance(parent_pids, int):
|
180
|
+
parent_pids = [parent_pids]
|
181
|
+
|
182
|
+
def kill(proc: psutil.Process):
|
183
|
+
if not proc.is_running():
|
184
|
+
# Skip if the process is not running.
|
185
|
+
return
|
186
|
+
logger.debug(f'Killing process {proc.pid}')
|
187
|
+
try:
|
188
|
+
if force:
|
189
|
+
proc.kill()
|
190
|
+
else:
|
191
|
+
proc.terminate()
|
192
|
+
proc.wait(timeout=10)
|
193
|
+
except psutil.NoSuchProcess:
|
194
|
+
# The child process may have already been terminated.
|
195
|
+
pass
|
196
|
+
except psutil.TimeoutExpired:
|
197
|
+
logger.debug(
|
198
|
+
f'Process {proc.pid} did not terminate after 10 seconds')
|
199
|
+
# Attempt to force kill if the normal termination fails
|
200
|
+
if not force:
|
201
|
+
logger.debug(f'Force killing process {proc.pid}')
|
202
|
+
proc.kill()
|
203
|
+
proc.wait(timeout=5) # Shorter timeout after force kill
|
204
|
+
|
205
|
+
parent_processes = []
|
206
|
+
if parent_pids is None:
|
207
|
+
parent_processes = [psutil.Process()]
|
208
|
+
else:
|
209
|
+
for pid in parent_pids:
|
122
210
|
try:
|
123
|
-
|
124
|
-
process.kill()
|
125
|
-
else:
|
126
|
-
process.terminate()
|
211
|
+
process = psutil.Process(pid)
|
127
212
|
except psutil.NoSuchProcess:
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
parent_process = psutil.Process()
|
132
|
-
for child in parent_process.children(recursive=True):
|
133
|
-
if child.pid in first_pid_to_kill:
|
134
|
-
pid_to_proc[child.pid] = child
|
135
|
-
else:
|
136
|
-
child_processes.append(child)
|
213
|
+
continue
|
214
|
+
parent_processes.append(process)
|
137
215
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
216
|
+
for parent_process in parent_processes:
|
217
|
+
child_processes = parent_process.children(recursive=True)
|
218
|
+
if parent_pids is not None:
|
219
|
+
kill(parent_process)
|
220
|
+
logger.debug(f'Killing child processes: {child_processes}')
|
221
|
+
for child in child_processes:
|
222
|
+
kill(child)
|
142
223
|
|
143
224
|
|
144
225
|
def run_with_retries(
|
@@ -187,3 +268,88 @@ def run_with_retries(
|
|
187
268
|
continue
|
188
269
|
break
|
189
270
|
return returncode, stdout, stderr
|
271
|
+
|
272
|
+
|
273
|
+
def kill_process_daemon(process_pid: int) -> None:
    """Start a daemon as a safety net to kill the process.

    Launches ``subprocess_daemon.py`` (located next to ``log_lib``) with the
    current process as ``--parent-pid``, ``process_pid`` as ``--proc-pid``
    and a snapshot of the descendants of ``process_pid`` as
    ``--initial-children``. The daemon's stdin, stdout and stderr are all
    connected to the null device.

    Args:
        process_pid: The PID of the process to kill.
    """
    # Get initial children list. If the process is already gone, the
    # snapshot is simply empty.
    initial_children: List[int] = []
    try:
        target = psutil.Process(process_pid)
        initial_children = [
            child.pid for child in target.children(recursive=True)
        ]
    except psutil.NoSuchProcess:
        initial_children = []

    skylet_dir = os.path.dirname(os.path.abspath(log_lib.__file__))
    daemon_script = os.path.join(skylet_dir, 'subprocess_daemon.py')
    parent_pid = os.getpid()
    python_path = subprocess.check_output(constants.SKY_GET_PYTHON_PATH_CMD,
                                          shell=True,
                                          stderr=subprocess.DEVNULL,
                                          encoding='utf-8').strip()
    # We pass the initial children list to avoid the race condition where
    # the process_pid is terminated before the daemon starts and gets the
    # children list.
    children_csv = ','.join(str(pid) for pid in initial_children)
    daemon_cmd = [
        python_path,
        daemon_script,
        '--parent-pid',
        str(parent_pid),
        '--proc-pid',
        str(process_pid),
        '--initial-children',
        children_csv,
    ]

    # We do not need to set `start_new_session=True` here, as the
    # daemon script will detach itself from the parent process with
    # fork to avoid being killed by parent process. See the reason we
    # daemonize the process in `sky/skylet/subprocess_daemon.py`.
    subprocess.Popen(
        daemon_cmd,
        # Disable input
        stdin=subprocess.DEVNULL,
        # Suppress output
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
320
|
+
|
321
|
+
|
322
|
+
def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
    """Launch a new process that will not be a child of the current process.

    This will launch bash in a new session, which will launch the given cmd.
    This will ensure that cmd is in its own process tree, and once bash exits,
    will not be an ancestor of the current process. This is useful for job
    launching.

    Args:
        cmd: Shell command line to run via ``bash -c``.
        log_output: File that receives the command's stdout and stderr. A
            leading ``~`` is expanded; the rest of the path is passed to the
            shell literally, so spaces and shell metacharacters are safe.

    Returns:
        The pid of the launched cmd.

    Raises:
        subprocess.CalledProcessError: If the wrapping shell command exits
            with a non-zero status.
    """
    # Use nohup to ensure the job driver process is a separate process tree,
    # instead of being a child of the current process. This is important to
    # avoid a chain of driver processes (job driver can call schedule_step() to
    # submit new jobs, and the new job can also call schedule_step()
    # recursively).
    #
    # echo $! will output the PID of the last background process started in the
    # current shell, so we can retrieve it and record in the DB.
    #
    # TODO(zhwu): A more elegant solution is to use another daemon process to be
    # in charge of starting these driver processes, instead of starting them in
    # the current process.
    #
    # The log path is quoted just like cmd: unquoted, a path containing
    # whitespace or shell metacharacters would corrupt the redirection or
    # inject extra shell syntax. Quoting disables the shell's own tilde
    # expansion, so `~` is expanded here first.
    quoted_log_output = shlex.quote(os.path.expanduser(log_output))
    wrapped_cmd = (f'nohup bash -c {shlex.quote(cmd)} '
                   f'</dev/null >{quoted_log_output} 2>&1 & echo $!')
    proc = subprocess.run(wrapped_cmd,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          stdin=subprocess.DEVNULL,
                          start_new_session=True,
                          check=True,
                          shell=True,
                          text=True)
    # Get the PID of the detached process
    return int(proc.stdout.strip())
|
sky/utils/timeline.py
CHANGED
@@ -9,6 +9,7 @@ import json
|
|
9
9
|
import os
|
10
10
|
import threading
|
11
11
|
import time
|
12
|
+
import traceback
|
12
13
|
from typing import Callable, Optional, Union
|
13
14
|
|
14
15
|
import filelock
|
@@ -48,8 +49,9 @@ class Event:
|
|
48
49
|
'ph': 'B',
|
49
50
|
'ts': f'{time.time() * 10 ** 6: .3f}',
|
50
51
|
})
|
52
|
+
event_begin['args'] = {'stack': '\n'.join(traceback.format_stack())}
|
51
53
|
if self._message is not None:
|
52
|
-
event_begin['args']
|
54
|
+
event_begin['args']['message'] = self._message
|
53
55
|
_events.append(event_begin)
|
54
56
|
|
55
57
|
def end(self):
|
@@ -77,11 +79,11 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
|
|
77
79
|
class FileLockEvent:
|
78
80
|
"""Serve both as a file lock and event for the lock."""
|
79
81
|
|
80
|
-
def __init__(self, lockfile: Union[str, os.PathLike]):
|
82
|
+
def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
|
81
83
|
self._lockfile = lockfile
|
82
|
-
|
83
|
-
|
84
|
-
self._lock = filelock.FileLock(self._lockfile)
|
84
|
+
os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
|
85
|
+
exist_ok=True)
|
86
|
+
self._lock = filelock.FileLock(self._lockfile, timeout)
|
85
87
|
self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
|
86
88
|
|
87
89
|
def acquire(self):
|
@@ -116,7 +118,10 @@ class FileLockEvent:
|
|
116
118
|
return wrapper
|
117
119
|
|
118
120
|
|
119
|
-
def
|
121
|
+
def save_timeline():
|
122
|
+
file_path = os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
|
123
|
+
if not file_path:
|
124
|
+
return
|
120
125
|
json_output = {
|
121
126
|
'traceEvents': _events,
|
122
127
|
'displayTimeUnit': 'ms',
|
@@ -130,4 +135,4 @@ def _save_timeline(file_path: str):
|
|
130
135
|
|
131
136
|
|
132
137
|
if os.environ.get('SKYPILOT_TIMELINE_FILE_PATH'):
|
133
|
-
atexit.register(
|
138
|
+
atexit.register(save_timeline)
|
sky/utils/ux_utils.py
CHANGED
@@ -1,21 +1,42 @@
|
|
1
1
|
"""Utility functions for UX."""
|
2
2
|
import contextlib
|
3
|
+
import enum
|
4
|
+
import os
|
3
5
|
import sys
|
4
6
|
import traceback
|
5
|
-
|
7
|
+
import typing
|
8
|
+
from typing import Callable, Optional, Union
|
6
9
|
|
10
|
+
import colorama
|
7
11
|
import rich.console as rich_console
|
8
12
|
|
9
13
|
from sky import sky_logging
|
14
|
+
from sky.skylet import constants
|
10
15
|
from sky.utils import common_utils
|
11
|
-
|
12
|
-
|
16
|
+
|
17
|
+
if typing.TYPE_CHECKING:
|
18
|
+
import pathlib
|
13
19
|
|
14
20
|
console = rich_console.Console()
|
15
21
|
|
22
|
+
INDENT_SYMBOL = f'{colorama.Style.DIM}├── {colorama.Style.RESET_ALL}'
|
23
|
+
INDENT_LAST_SYMBOL = f'{colorama.Style.DIM}└── {colorama.Style.RESET_ALL}'
|
24
|
+
|
25
|
+
# Console formatting constants
|
26
|
+
BOLD = '\033[1m'
|
27
|
+
RESET_BOLD = '\033[0m'
|
28
|
+
|
29
|
+
# Log path hint in the spinner during launching
|
30
|
+
_LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
|
31
|
+
'{log_path}'
|
32
|
+
f'{colorama.Style.RESET_ALL}')
|
33
|
+
_LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
|
34
|
+
'{log_path}'
|
35
|
+
f'{colorama.Style.RESET_ALL}')
|
36
|
+
|
16
37
|
|
17
38
|
def console_newline():
|
18
|
-
"""
|
39
|
+
"""Prints a newline to the console using rich.
|
19
40
|
|
20
41
|
Useful when catching exceptions inside console.status()
|
21
42
|
"""
|
@@ -38,19 +59,15 @@ def print_exception_no_traceback():
|
|
38
59
|
if error():
|
39
60
|
raise ValueError('...')
|
40
61
|
"""
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
|
46
|
-
sys.tracebacklimit = 0
|
47
|
-
yield
|
48
|
-
sys.tracebacklimit = original_tracelimit
|
62
|
+
original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
|
63
|
+
sys.tracebacklimit = 0
|
64
|
+
yield
|
65
|
+
sys.tracebacklimit = original_tracelimit
|
49
66
|
|
50
67
|
|
51
68
|
@contextlib.contextmanager
|
52
69
|
def enable_traceback():
|
53
|
-
"""
|
70
|
+
"""Reverts the effect of print_exception_no_traceback().
|
54
71
|
|
55
72
|
This is used for usage_lib to collect the full traceback.
|
56
73
|
"""
|
@@ -61,7 +78,7 @@ def enable_traceback():
|
|
61
78
|
|
62
79
|
|
63
80
|
class RedirectOutputForProcess:
|
64
|
-
"""
|
81
|
+
"""Redirects stdout and stderr to a file.
|
65
82
|
|
66
83
|
This class enabled output redirect for multiprocessing.Process.
|
67
84
|
Example usage:
|
@@ -99,6 +116,142 @@ class RedirectOutputForProcess:
|
|
99
116
|
except Exception as e: # pylint: disable=broad-except
|
100
117
|
logger.error(f'Failed to run {self.func.__name__}. '
|
101
118
|
f'Details: {common_utils.format_exception(e)}')
|
102
|
-
with
|
119
|
+
with enable_traceback():
|
103
120
|
logger.error(f' Traceback:\n{traceback.format_exc()}')
|
104
121
|
raise
|
122
|
+
|
123
|
+
|
124
|
+
def log_path_hint(log_path: Union[str, 'pathlib.Path'],
                  is_local: bool = False) -> str:
    """Gets the log path hint for the given log path.

    Args:
        log_path: Path of the log file.
        is_local: If True, format the path with the local hint template.
            Otherwise strip a leading ``constants.SKY_LOGS_DIRECTORY`` and
            any leading path separators, then use the `sky api logs -l`
            hint template.

    Returns:
        The formatted hint string.
    """
    log_path = str(log_path)

    def _is_under(directory: str, path: str) -> bool:
        # Match whole path components only, so that e.g. `/home/user2/x`
        # is not treated as living under `/home/user`.
        return path == directory or path.startswith(directory + os.path.sep)

    # Abbreviate the home directory to `~`. An empty value (home is the
    # filesystem root) is skipped: it would match every absolute path.
    expanded_home = os.path.expanduser('~').rstrip(os.path.sep)
    if expanded_home and _is_under(expanded_home, log_path):
        log_path = '~' + log_path[len(expanded_home):]
    if is_local:
        return _LOG_PATH_HINT_LOCAL.format(log_path=log_path)
    logs_dir = constants.SKY_LOGS_DIRECTORY.rstrip(os.path.sep)
    if _is_under(logs_dir, log_path):
        log_path = log_path[len(logs_dir):]
    log_path = log_path.lstrip(os.path.sep)
    return _LOG_PATH_HINT.format(log_path=log_path)
|
137
|
+
|
138
|
+
|
139
|
+
def starting_message(message: str) -> str:
    """Format ``message`` as a "step is starting" line.

    Args:
        message: Text to show after the gear glyph.

    Returns:
        A style reset sequence, the gear glyph, a space and ``message``.
    """
    # The style is reset up front: a previous dimmed spinner that overflowed
    # a narrow terminal can otherwise leave its color active.
    reset = colorama.Style.RESET_ALL
    return f'{reset}⚙︎ {message}'
|
145
|
+
|
146
|
+
|
147
|
+
def finishing_message(message: str,
                      log_path: Optional[Union[str, 'pathlib.Path']] = None,
                      is_local: bool = False,
                      follow_up_message: Optional[str] = None) -> str:
    """Format ``message`` as a green "step finished" line.

    Args:
        message: The main message to be displayed.
        log_path: The log path to be displayed in the message. When None,
            no log hint is appended.
        is_local: Whether the log path is local or on remote API server.
        follow_up_message: A message to be displayed after the main message.
            The follow up message is not colored.

    Returns:
        The formatted line, followed by a space and the log path hint when
        ``log_path`` is given.
    """
    trailer = '' if follow_up_message is None else follow_up_message
    # The style is reset up front: a previous dimmed spinner that overflowed
    # a narrow terminal can otherwise leave its color active.
    reset = colorama.Style.RESET_ALL
    headline = f'{reset}{colorama.Fore.GREEN}✓ {message}{reset}{trailer}'
    if log_path is not None:
        hint = log_path_hint(log_path, is_local)
        return f'{headline} {hint}'
    return headline
|
171
|
+
|
172
|
+
|
173
|
+
def error_message(message: str,
                  log_path: Optional[Union[str, 'pathlib.Path']] = None,
                  is_local: bool = False) -> str:
    """Format the line reporting that a step failed.

    Args:
        message: The error text, shown uncoloured after a red cross mark.
        log_path: Optional log location; when given, a path hint built by
            ``log_path_hint`` is appended after a single space.
        is_local: Passed through to ``log_path_hint`` together with
            ``log_path``.

    Returns:
        The formatted error line, with the path hint appended when
        ``log_path`` is not None.
    """
    # Lead with a reset: a dimmed spinner line that overflowed in a narrow
    # terminal may otherwise leave its colour active for this message.
    reset = colorama.Style.RESET_ALL
    headline = f'{reset}{colorama.Fore.RED}⨯{reset} {message}'

    if log_path is not None:
        hint = log_path_hint(log_path, is_local)
        return f'{headline} {hint}'
    return headline
|
186
|
+
|
187
|
+
|
188
|
+
def retry_message(message: str) -> str:
    """Format the line announcing that a step is being retried.

    Args:
        message: Text shown uncoloured after a yellow retry arrow.

    Returns:
        The formatted retry line.
    """
    # Lead with a reset: a dimmed spinner line that overflowed in a narrow
    # terminal may otherwise leave its colour active for this message.
    reset = colorama.Style.RESET_ALL
    return f'{reset}{colorama.Fore.YELLOW}↺{reset} {message}'
|
195
|
+
|
196
|
+
|
197
|
+
def spinner_message(message: str,
                    log_path: Optional[Union[str, 'pathlib.Path']] = None,
                    is_local: bool = False) -> str:
    """Format the text displayed next to a spinner.

    Args:
        message: Text to wrap in ``[bold cyan]...[/]`` tags (presumably Rich
            console markup; confirm against the spinner implementation).
        log_path: Optional log location; when given, a path hint built by
            ``log_path_hint`` is appended after a single space.
        is_local: Passed through to ``log_path_hint`` together with
            ``log_path``.

    Returns:
        The tagged message, with the path hint appended when ``log_path`` is
        not None.
    """
    styled = f'[bold cyan]{message}[/]'
    if log_path is not None:
        hint = log_path_hint(log_path, is_local)
        return f'{styled} {hint}'
    return styled
|
206
|
+
|
207
|
+
|
208
|
+
class CommandHintType(enum.Enum):
    """Selects which set of follow-up commands ``command_hint_messages`` lists."""

    # Hints for a job on a named cluster (sky cancel / logs / queue / ...).
    CLUSTER_JOB = 'cluster_job'
    # Hints for a managed job (sky jobs cancel / logs / queue / ...).
    MANAGED_JOB = 'managed_job'
|
211
|
+
|
212
|
+
|
213
|
+
def command_hint_messages(hint_type: CommandHintType,
                          job_id: Optional[str] = None,
                          cluster_name: Optional[str] = None) -> str:
    """Build the "Useful Commands" hint text for a job or cluster.

    Args:
        hint_type: Which set of commands to list.
        job_id: Job identifier interpolated into the commands. For
            ``CLUSTER_JOB`` the job section is omitted when this is None;
            for ``MANAGED_JOB`` it is always interpolated as given.
        cluster_name: Cluster name interpolated into the ``CLUSTER_JOB``
            commands; not used for ``MANAGED_JOB``.

    Returns:
        A multi-line string that starts with a newline and the
        "Useful Commands" header.

    Raises:
        ValueError: If ``hint_type`` is neither ``CLUSTER_JOB`` nor
            ``MANAGED_JOB``.
    """
    if hint_type == CommandHintType.MANAGED_JOB:
        return (f'\n📋 Useful Commands'
                f'\nManaged Job ID: {job_id}'
                f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
                f'{BOLD}sky jobs cancel {job_id}{RESET_BOLD}'
                f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
                f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
                f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
                f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
                f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
                f'{BOLD}sky jobs queue{RESET_BOLD}'
                f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
                f'{BOLD}sky jobs dashboard{RESET_BOLD}')

    if hint_type == CommandHintType.CLUSTER_JOB:
        sections = ['\n📋 Useful Commands']

        # Job-level commands only make sense when a job id is known.
        if job_id is not None:
            sections.append(
                f'\nJob ID: {job_id}'
                f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
                f'{BOLD}sky cancel {cluster_name} {job_id}{RESET_BOLD}'
                f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
                f'{BOLD}sky logs {cluster_name} {job_id}{RESET_BOLD}'
                f'\n{INDENT_LAST_SYMBOL}To view job queue:\t\t'
                f'{BOLD}sky queue {cluster_name}{RESET_BOLD}')

        # Cluster-level commands are always listed.
        sections.append(f'\nCluster name: {cluster_name}'
                        f'\n{INDENT_SYMBOL}To log into the head VM:\t'
                        f'{BOLD}ssh {cluster_name}'
                        f'{RESET_BOLD}'
                        f'\n{INDENT_SYMBOL}To submit a job:'
                        f'\t\t{BOLD}sky exec {cluster_name} yaml_file'
                        f'{RESET_BOLD}'
                        f'\n{INDENT_SYMBOL}To stop the cluster:'
                        f'\t{BOLD}sky stop {cluster_name}'
                        f'{RESET_BOLD}'
                        f'\n{INDENT_LAST_SYMBOL}To teardown the cluster:'
                        f'\t{BOLD}sky down {cluster_name}'
                        f'{RESET_BOLD}')
        return ''.join(sections)

    raise ValueError(f'Invalid hint type: {hint_type}')
|