skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
This is a remote utility module that provides logging functionality.
|
4
4
|
"""
|
5
|
+
import collections
|
5
6
|
import copy
|
6
7
|
import io
|
7
8
|
import multiprocessing.pool
|
@@ -12,7 +13,8 @@ import sys
|
|
12
13
|
import tempfile
|
13
14
|
import textwrap
|
14
15
|
import time
|
15
|
-
from typing import Dict, Iterator, List, Optional,
|
16
|
+
from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
|
17
|
+
Tuple, Union)
|
16
18
|
|
17
19
|
import colorama
|
18
20
|
|
@@ -21,13 +23,19 @@ from sky.skylet import constants
|
|
21
23
|
from sky.skylet import job_lib
|
22
24
|
from sky.utils import log_utils
|
23
25
|
from sky.utils import subprocess_utils
|
26
|
+
from sky.utils import ux_utils
|
24
27
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
SKY_LOG_WAITING_GAP_SECONDS = 1
|
29
|
+
SKY_LOG_WAITING_MAX_RETRY = 5
|
30
|
+
SKY_LOG_TAILING_GAP_SECONDS = 0.2
|
31
|
+
# Peek the head of the lines to check if we need to start
|
32
|
+
# streaming when tail > 0.
|
33
|
+
PEEK_HEAD_LINES_FOR_START_STREAM = 20
|
28
34
|
|
29
35
|
logger = sky_logging.init_logger(__name__)
|
30
36
|
|
37
|
+
LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
|
38
|
+
|
31
39
|
|
32
40
|
class _ProcessingArgs:
|
33
41
|
"""Arguments for processing logs."""
|
@@ -170,53 +178,19 @@ def run_with_log(
|
|
170
178
|
if process_stream:
|
171
179
|
stdout_arg = subprocess.PIPE
|
172
180
|
stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
|
181
|
+
# Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
|
182
|
+
# the terminal output when typing in the terminal that starts the API
|
183
|
+
# server.
|
184
|
+
stdin = kwargs.pop('stdin', subprocess.DEVNULL)
|
173
185
|
with subprocess.Popen(cmd,
|
174
186
|
stdout=stdout_arg,
|
175
187
|
stderr=stderr_arg,
|
176
188
|
start_new_session=True,
|
177
189
|
shell=shell,
|
190
|
+
stdin=stdin,
|
178
191
|
**kwargs) as proc:
|
179
192
|
try:
|
180
|
-
|
181
|
-
# open a new subprocess to gracefully kill the proc, SIGTERM
|
182
|
-
# and then SIGKILL the process group.
|
183
|
-
# Adapted from ray/dashboard/modules/job/job_manager.py#L154
|
184
|
-
parent_pid = os.getpid()
|
185
|
-
daemon_script = os.path.join(
|
186
|
-
os.path.dirname(os.path.abspath(job_lib.__file__)),
|
187
|
-
'subprocess_daemon.py')
|
188
|
-
if not hasattr(constants, 'SKY_GET_PYTHON_PATH_CMD'):
|
189
|
-
# Backward compatibility: for cluster started before #3326, this
|
190
|
-
# constant does not exist. Since we generate the job script
|
191
|
-
# in backends.cloud_vm_ray_backend with inspect, so the
|
192
|
-
# the lates `run_with_log` will be used, but the `constants` is
|
193
|
-
# not updated. We fallback to `python3` in this case.
|
194
|
-
# TODO(zhwu): remove this after 0.7.0.
|
195
|
-
python_path = 'python3'
|
196
|
-
else:
|
197
|
-
python_path = subprocess.check_output(
|
198
|
-
constants.SKY_GET_PYTHON_PATH_CMD,
|
199
|
-
shell=True,
|
200
|
-
stderr=subprocess.DEVNULL,
|
201
|
-
encoding='utf-8').strip()
|
202
|
-
daemon_cmd = [
|
203
|
-
python_path,
|
204
|
-
daemon_script,
|
205
|
-
'--parent-pid',
|
206
|
-
str(parent_pid),
|
207
|
-
'--proc-pid',
|
208
|
-
str(proc.pid),
|
209
|
-
]
|
210
|
-
|
211
|
-
subprocess.Popen(
|
212
|
-
daemon_cmd,
|
213
|
-
start_new_session=True,
|
214
|
-
# Suppress output
|
215
|
-
stdout=subprocess.DEVNULL,
|
216
|
-
stderr=subprocess.DEVNULL,
|
217
|
-
# Disable input
|
218
|
-
stdin=subprocess.DEVNULL,
|
219
|
-
)
|
193
|
+
subprocess_utils.kill_process_daemon(proc.pid)
|
220
194
|
stdout = ''
|
221
195
|
stderr = ''
|
222
196
|
|
@@ -263,6 +237,9 @@ def make_task_bash_script(codegen: str,
|
|
263
237
|
# set -a is used for exporting all variables functions to the environment
|
264
238
|
# so that bash `user_script` can access `conda activate`. Detail: #436.
|
265
239
|
# Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html # pylint: disable=line-too-long
|
240
|
+
# DEACTIVATE_SKY_REMOTE_PYTHON_ENV: Deactivate the SkyPilot runtime env, as
|
241
|
+
# the ray cluster is started within the runtime env, which may cause the
|
242
|
+
# user program to run in that env as well.
|
266
243
|
# PYTHONUNBUFFERED is used to disable python output buffering.
|
267
244
|
script = [
|
268
245
|
textwrap.dedent(f"""\
|
@@ -271,6 +248,7 @@ def make_task_bash_script(codegen: str,
|
|
271
248
|
set -a
|
272
249
|
. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true
|
273
250
|
set +a
|
251
|
+
{constants.DEACTIVATE_SKY_REMOTE_PYTHON_ENV}
|
274
252
|
export PYTHONUNBUFFERED=1
|
275
253
|
cd {constants.SKY_REMOTE_WORKDIR}"""),
|
276
254
|
]
|
@@ -316,21 +294,16 @@ def run_bash_command_with_log(bash_command: str,
|
|
316
294
|
# Need this `-i` option to make sure `source ~/.bashrc` work.
|
317
295
|
inner_command = f'/bin/bash -i {script_path}'
|
318
296
|
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
log_path,
|
325
|
-
stream_logs=stream_logs,
|
326
|
-
with_ray=with_ray,
|
327
|
-
# Disable input to avoid blocking.
|
328
|
-
stdin=subprocess.DEVNULL,
|
329
|
-
shell=True)
|
297
|
+
return run_with_log(inner_command,
|
298
|
+
log_path,
|
299
|
+
stream_logs=stream_logs,
|
300
|
+
with_ray=with_ray,
|
301
|
+
shell=True)
|
330
302
|
|
331
303
|
|
332
304
|
def _follow_job_logs(file,
|
333
305
|
job_id: int,
|
306
|
+
start_streaming: bool,
|
334
307
|
start_streaming_at: str = '') -> Iterator[str]:
|
335
308
|
"""Yield each line from a file as they are written.
|
336
309
|
|
@@ -339,7 +312,6 @@ def _follow_job_logs(file,
|
|
339
312
|
# No need to lock the status here, as the while loop can handle
|
340
313
|
# the older status.
|
341
314
|
status = job_lib.get_status_no_lock(job_id)
|
342
|
-
start_streaming = False
|
343
315
|
wait_last_logs = True
|
344
316
|
while True:
|
345
317
|
tmp = file.readline()
|
@@ -366,21 +338,58 @@ def _follow_job_logs(file,
|
|
366
338
|
]:
|
367
339
|
if wait_last_logs:
|
368
340
|
# Wait all the logs are printed before exit.
|
369
|
-
time.sleep(1 +
|
341
|
+
time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
|
370
342
|
wait_last_logs = False
|
371
343
|
continue
|
372
344
|
status_str = status.value if status is not None else 'None'
|
373
|
-
print(
|
345
|
+
print(ux_utils.finishing_message(
|
346
|
+
f'Job finished (status: {status_str}).'),
|
347
|
+
flush=True)
|
374
348
|
return
|
375
349
|
|
376
|
-
time.sleep(
|
350
|
+
time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
|
377
351
|
status = job_lib.get_status_no_lock(job_id)
|
378
352
|
|
379
353
|
|
354
|
+
def _peek_head_lines(log_file: TextIO) -> List[str]:
|
355
|
+
"""Peek the head of the file."""
|
356
|
+
lines = [
|
357
|
+
log_file.readline() for _ in range(PEEK_HEAD_LINES_FOR_START_STREAM)
|
358
|
+
]
|
359
|
+
# Reset the file pointer to the beginning
|
360
|
+
log_file.seek(0, os.SEEK_SET)
|
361
|
+
return [line for line in lines if line]
|
362
|
+
|
363
|
+
|
364
|
+
def _should_stream_the_whole_tail_lines(head_lines_of_log_file: List[str],
|
365
|
+
tail_lines: Deque[str],
|
366
|
+
start_stream_at: str) -> bool:
|
367
|
+
"""Check if the entire tail lines should be streamed."""
|
368
|
+
# See comment:
|
369
|
+
# https://github.com/skypilot-org/skypilot/pull/4241#discussion_r1833611567
|
370
|
+
# for more details.
|
371
|
+
# Case 1: If start_stream_at is found at the head of the tail lines,
|
372
|
+
# we should not stream the whole tail lines.
|
373
|
+
for index, line in enumerate(tail_lines):
|
374
|
+
if index >= PEEK_HEAD_LINES_FOR_START_STREAM:
|
375
|
+
break
|
376
|
+
if start_stream_at in line:
|
377
|
+
return False
|
378
|
+
# Case 2: If start_stream_at is found at the head of log file, but not at
|
379
|
+
# the tail lines, we need to stream the whole tail lines.
|
380
|
+
for line in head_lines_of_log_file:
|
381
|
+
if start_stream_at in line:
|
382
|
+
return True
|
383
|
+
# Case 3: If start_stream_at is not at the head, and not found at the tail
|
384
|
+
# lines, we should not stream the whole tail lines.
|
385
|
+
return False
|
386
|
+
|
387
|
+
|
380
388
|
def tail_logs(job_id: Optional[int],
|
381
389
|
log_dir: Optional[str],
|
382
390
|
managed_job_id: Optional[int] = None,
|
383
|
-
follow: bool = True
|
391
|
+
follow: bool = True,
|
392
|
+
tail: int = 0) -> None:
|
384
393
|
"""Tail the logs of a job.
|
385
394
|
|
386
395
|
Args:
|
@@ -389,6 +398,8 @@ def tail_logs(job_id: Optional[int],
|
|
389
398
|
managed_job_id: The managed job id (for logging info only to avoid
|
390
399
|
confusion).
|
391
400
|
follow: Whether to follow the logs or print the logs so far and exit.
|
401
|
+
tail: The number of lines to display from the end of the log file,
|
402
|
+
if 0, print all lines.
|
392
403
|
"""
|
393
404
|
if job_id is None:
|
394
405
|
# This only happens when job_lib.get_latest_job_id() returns None,
|
@@ -405,8 +416,6 @@ def tail_logs(job_id: Optional[int],
|
|
405
416
|
return
|
406
417
|
logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
|
407
418
|
f'{managed_job_id}.')
|
408
|
-
logger.info(f'{colorama.Fore.YELLOW}Start streaming logs for {job_str}.'
|
409
|
-
f'{colorama.Style.RESET_ALL}')
|
410
419
|
log_path = os.path.join(log_dir, 'run.log')
|
411
420
|
log_path = os.path.expanduser(log_path)
|
412
421
|
|
@@ -419,18 +428,20 @@ def tail_logs(job_id: Optional[int],
|
|
419
428
|
retry_cnt += 1
|
420
429
|
if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
|
421
430
|
break
|
422
|
-
if retry_cnt >=
|
431
|
+
if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
|
423
432
|
print(
|
424
433
|
f'{colorama.Fore.RED}ERROR: Logs for '
|
425
434
|
f'{job_str} (status: {status.value}) does not exist '
|
426
435
|
f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
|
427
436
|
return
|
428
|
-
print(f'INFO: Waiting {
|
437
|
+
print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
|
429
438
|
'to be written...')
|
430
|
-
time.sleep(
|
439
|
+
time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
|
431
440
|
status = job_lib.update_job_status([job_id], silent=True)[0]
|
432
441
|
|
433
|
-
start_stream_at =
|
442
|
+
start_stream_at = LOG_FILE_START_STREAMING_AT
|
443
|
+
# Explicitly declare the type to avoid mypy warning.
|
444
|
+
lines: Iterable[str] = []
|
434
445
|
if follow and status in [
|
435
446
|
job_lib.JobStatus.SETTING_UP,
|
436
447
|
job_lib.JobStatus.PENDING,
|
@@ -441,19 +452,48 @@ def tail_logs(job_id: Optional[int],
|
|
441
452
|
with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
|
442
453
|
# Using `_follow` instead of `tail -f` to streaming the whole
|
443
454
|
# log and creating a new process for tail.
|
455
|
+
start_streaming = False
|
456
|
+
if tail > 0:
|
457
|
+
head_lines_of_log_file = _peek_head_lines(log_file)
|
458
|
+
lines = collections.deque(log_file, maxlen=tail)
|
459
|
+
start_streaming = _should_stream_the_whole_tail_lines(
|
460
|
+
head_lines_of_log_file, lines, start_stream_at)
|
461
|
+
for line in lines:
|
462
|
+
if start_stream_at in line:
|
463
|
+
start_streaming = True
|
464
|
+
if start_streaming:
|
465
|
+
print(line, end='')
|
466
|
+
# Flush the last n lines
|
467
|
+
print(end='', flush=True)
|
468
|
+
# Now, the cursor is at the end of the last lines
|
469
|
+
# if tail > 0
|
444
470
|
for line in _follow_job_logs(log_file,
|
445
471
|
job_id=job_id,
|
472
|
+
start_streaming=start_streaming,
|
446
473
|
start_streaming_at=start_stream_at):
|
447
474
|
print(line, end='', flush=True)
|
448
475
|
else:
|
449
476
|
try:
|
450
|
-
|
451
|
-
with open(log_path, 'r', encoding='utf-8') as
|
452
|
-
|
477
|
+
start_streaming = False
|
478
|
+
with open(log_path, 'r', encoding='utf-8') as log_file:
|
479
|
+
if tail > 0:
|
480
|
+
# If tail > 0, we need to read the last n lines.
|
481
|
+
# We use double ended queue to rotate the last n lines.
|
482
|
+
head_lines_of_log_file = _peek_head_lines(log_file)
|
483
|
+
lines = collections.deque(log_file, maxlen=tail)
|
484
|
+
start_streaming = _should_stream_the_whole_tail_lines(
|
485
|
+
head_lines_of_log_file, lines, start_stream_at)
|
486
|
+
else:
|
487
|
+
lines = log_file
|
488
|
+
for line in lines:
|
453
489
|
if start_stream_at in line:
|
454
|
-
|
455
|
-
if
|
490
|
+
start_streaming = True
|
491
|
+
if start_streaming:
|
456
492
|
print(line, end='', flush=True)
|
493
|
+
status_str = status.value if status is not None else 'None'
|
494
|
+
print(ux_utils.finishing_message(
|
495
|
+
f'Job finished (status: {status_str}).'),
|
496
|
+
flush=True)
|
457
497
|
except FileNotFoundError:
|
458
498
|
print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
|
459
499
|
f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
|
sky/skylet/log_lib.pyi
CHANGED
@@ -13,6 +13,12 @@ from sky.skylet import constants as constants
|
|
13
13
|
from sky.skylet import job_lib as job_lib
|
14
14
|
from sky.utils import log_utils as log_utils
|
15
15
|
|
16
|
+
SKY_LOG_WAITING_GAP_SECONDS: int = ...
|
17
|
+
SKY_LOG_WAITING_MAX_RETRY: int = ...
|
18
|
+
SKY_LOG_TAILING_GAP_SECONDS: float = ...
|
19
|
+
LOG_FILE_START_STREAMING_AT: str = ...
|
20
|
+
|
21
|
+
|
16
22
|
class _ProcessingArgs:
|
17
23
|
log_path: str
|
18
24
|
stream_logs: bool
|
@@ -25,7 +25,7 @@ def docker_start_cmds(
|
|
25
25
|
docker_cmd,
|
26
26
|
):
|
27
27
|
"""Generating docker start command without --rm.
|
28
|
-
|
28
|
+
|
29
29
|
The code is borrowed from `ray.autoscaler._private.docker`.
|
30
30
|
|
31
31
|
Changes we made:
|
@@ -65,8 +65,8 @@ def docker_start_cmds(
|
|
65
65
|
'--cap-add=SYS_ADMIN',
|
66
66
|
'--device=/dev/fuse',
|
67
67
|
'--security-opt=apparmor:unconfined',
|
68
|
+
'--entrypoint=/bin/bash',
|
68
69
|
image,
|
69
|
-
'bash',
|
70
70
|
]
|
71
71
|
return ' '.join(docker_run)
|
72
72
|
|
@@ -159,19 +159,17 @@ class SkyDockerCommandRunner(DockerCommandRunner):
|
|
159
159
|
return True
|
160
160
|
|
161
161
|
# SkyPilot: Docker login if user specified a private docker registry.
|
162
|
-
if
|
162
|
+
if 'docker_login_config' in self.docker_config:
|
163
163
|
# TODO(tian): Maybe support a command to get the login password?
|
164
|
-
docker_login_config: docker_utils.DockerLoginConfig =
|
165
|
-
|
164
|
+
docker_login_config: docker_utils.DockerLoginConfig = (
|
165
|
+
self.docker_config['docker_login_config'])
|
166
166
|
self._run_with_retry(
|
167
167
|
f'{self.docker_cmd} login --username '
|
168
168
|
f'{docker_login_config.username} --password '
|
169
169
|
f'{docker_login_config.password} {docker_login_config.server}')
|
170
170
|
# We automatically add the server prefix to the image name if
|
171
171
|
# the user did not add it.
|
172
|
-
|
173
|
-
if not specific_image.startswith(server_prefix):
|
174
|
-
specific_image = f'{server_prefix}{specific_image}'
|
172
|
+
specific_image = docker_login_config.format_image(specific_image)
|
175
173
|
|
176
174
|
if self.docker_config.get('pull_before_run', True):
|
177
175
|
assert specific_image, ('Image must be included in config if '
|
@@ -377,7 +377,7 @@ class IBMVPCNodeProvider(NodeProvider):
|
|
377
377
|
node["id"], nic_id
|
378
378
|
).get_result()
|
379
379
|
floating_ips = res["floating_ips"]
|
380
|
-
if
|
380
|
+
if not floating_ips:
|
381
381
|
# not adding a node that's yet/failed to
|
382
382
|
# to get a floating ip provisioned
|
383
383
|
continue
|
@@ -485,7 +485,7 @@ class IBMVPCNodeProvider(NodeProvider):
|
|
485
485
|
"""Returns instance (node) information matching the specified name"""
|
486
486
|
|
487
487
|
instances_data = self.ibm_vpc_client.list_instances(name=name).get_result()
|
488
|
-
if
|
488
|
+
if instances_data["instances"]:
|
489
489
|
return instances_data["instances"][0]
|
490
490
|
return None
|
491
491
|
|
@@ -107,20 +107,28 @@ class ZoneConfig:
|
|
107
107
|
for item in subnet_contents
|
108
108
|
if item['subnetState'] == 'ACTIVE' and item["vpcId"] == vpc
|
109
109
|
]
|
110
|
-
if
|
110
|
+
if subnet_list:
|
111
111
|
vpc_subnets[vpc] = subnet_list
|
112
112
|
|
113
113
|
return vpc_subnets
|
114
114
|
|
115
115
|
def _get_vm_init_script(self, ssh_public_key):
|
116
116
|
|
117
|
+
import subprocess
|
117
118
|
init_script_content = self._get_default_config_cmd(
|
118
119
|
) + self._get_ssh_key_gen_cmd(ssh_public_key)
|
120
|
+
init_script_content_string = f'"{init_script_content}"'
|
121
|
+
command = f'echo {init_script_content_string} | base64'
|
122
|
+
result = subprocess.run(command,
|
123
|
+
shell=True,
|
124
|
+
capture_output=True,
|
125
|
+
text=True)
|
126
|
+
init_script_content_base64 = result.stdout
|
119
127
|
return {
|
120
|
-
"encodingType": "
|
128
|
+
"encodingType": "base64",
|
121
129
|
"initialScriptShell": "bash",
|
122
130
|
"initialScriptType": "text",
|
123
|
-
"initialScriptContent":
|
131
|
+
"initialScriptContent": init_script_content_base64
|
124
132
|
}
|
125
133
|
|
126
134
|
def _get_ssh_key_gen_cmd(self, ssh_public_key):
|
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
|
|
180
180
|
metadata['tags'] = instance_info['tags']
|
181
181
|
# TODO(ewzeng): The internal ip is hard to get, so set it to the
|
182
182
|
# external ip as a hack. This should be changed in the future.
|
183
|
-
# https://docs.lambdalabs.com/cloud/learn-private-ip-address
|
183
|
+
# https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
|
184
184
|
metadata['internal_ip'] = vm['ip']
|
185
185
|
metadata['external_ip'] = vm['external_ip']
|
186
186
|
return metadata
|
@@ -259,7 +259,7 @@ class SCPNodeProvider(NodeProvider):
|
|
259
259
|
for sg in sg_contents
|
260
260
|
if sg["securityGroupId"] == sg_id
|
261
261
|
]
|
262
|
-
if
|
262
|
+
if sg and sg[0] == "ACTIVE":
|
263
263
|
break
|
264
264
|
time.sleep(5)
|
265
265
|
|
@@ -282,16 +282,16 @@ class SCPNodeProvider(NodeProvider):
|
|
282
282
|
for sg in sg_contents
|
283
283
|
if sg["securityGroupId"] == sg_id
|
284
284
|
]
|
285
|
-
if
|
285
|
+
if not sg:
|
286
286
|
break
|
287
287
|
|
288
288
|
def _refresh_security_group(self, vms):
|
289
|
-
if
|
289
|
+
if vms:
|
290
290
|
return
|
291
291
|
# remove security group if vm does not exist
|
292
292
|
keys = self.metadata.keys()
|
293
293
|
security_group_id = self.metadata[
|
294
|
-
keys[0]]['creation']['securityGroupId'] if
|
294
|
+
keys[0]]['creation']['securityGroupId'] if keys else None
|
295
295
|
if security_group_id:
|
296
296
|
try:
|
297
297
|
self._del_security_group(security_group_id)
|
@@ -308,7 +308,7 @@ class SCPNodeProvider(NodeProvider):
|
|
308
308
|
for vm in vm_contents
|
309
309
|
if vm["virtualServerId"] == vm_id
|
310
310
|
]
|
311
|
-
if
|
311
|
+
if not vms:
|
312
312
|
break
|
313
313
|
|
314
314
|
def _del_firwall_rules(self, firewall_id, rule_ids):
|
@@ -391,7 +391,7 @@ class SCPNodeProvider(NodeProvider):
|
|
391
391
|
return None, None, None, None
|
392
392
|
|
393
393
|
def _undo_funcs(self, undo_func_list):
|
394
|
-
while
|
394
|
+
while undo_func_list:
|
395
395
|
func = undo_func_list.pop()
|
396
396
|
func()
|
397
397
|
|
@@ -468,7 +468,7 @@ class SCPNodeProvider(NodeProvider):
|
|
468
468
|
|
469
469
|
zone_config = ZoneConfig(self.scp_client, node_config)
|
470
470
|
vpc_subnets = zone_config.get_vcp_subnets()
|
471
|
-
if
|
471
|
+
if not vpc_subnets:
|
472
472
|
raise SCPError("This region/zone does not have available VPCs.")
|
473
473
|
|
474
474
|
instance_config = zone_config.bootstrap_instance_config(node_config)
|
sky/skylet/skylet.py
CHANGED
@@ -20,11 +20,13 @@ EVENTS = [
|
|
20
20
|
# The managed job update event should be after the job update event.
|
21
21
|
# Otherwise, the abnormal managed job status update will be delayed
|
22
22
|
# until the next job update event.
|
23
|
-
events.
|
23
|
+
events.ManagedJobEvent(),
|
24
24
|
# This is for monitoring controller job status. If it becomes
|
25
25
|
# unhealthy, this event will correctly update the controller
|
26
26
|
# status to CONTROLLER_FAILED.
|
27
27
|
events.ServiceUpdateEvent(),
|
28
|
+
# Report usage heartbeat every 10 minutes.
|
29
|
+
events.UsageHeartbeatReportEvent(),
|
28
30
|
]
|
29
31
|
|
30
32
|
while True:
|
sky/skylet/subprocess_daemon.py
CHANGED
@@ -1,20 +1,57 @@
|
|
1
1
|
"""Sky subprocess daemon.
|
2
|
-
|
3
2
|
Wait for parent_pid to exit, then SIGTERM (or SIGKILL if needed) the child
|
4
3
|
processes of proc_pid.
|
5
4
|
"""
|
6
|
-
|
7
5
|
import argparse
|
6
|
+
import os
|
8
7
|
import sys
|
9
8
|
import time
|
10
9
|
|
11
10
|
import psutil
|
12
11
|
|
13
|
-
if __name__ == '__main__':
|
14
12
|
|
13
|
+
def daemonize():
|
14
|
+
"""Detaches the process from its parent process with double-forking.
|
15
|
+
|
16
|
+
This detachment is crucial in the context of SkyPilot and Ray job. When
|
17
|
+
'sky cancel' is executed, it uses Ray's stop job API to terminate the job.
|
18
|
+
Without daemonization, this subprocess_daemon process will still be a child
|
19
|
+
of the parent process which would be terminated along with the parent
|
20
|
+
process, ray::task or the cancel request for jobs, which is launched with
|
21
|
+
Ray job. Daemonization ensures this process survives the 'sky cancel'
|
22
|
+
command, allowing it to prevent orphaned processes of Ray job.
|
23
|
+
"""
|
24
|
+
# First fork: Creates a child process identical to the parent
|
25
|
+
if os.fork() > 0:
|
26
|
+
# Parent process exits, allowing the child to run independently
|
27
|
+
sys.exit()
|
28
|
+
|
29
|
+
# Continues to run from first forked child process.
|
30
|
+
# Detach from parent environment.
|
31
|
+
os.setsid()
|
32
|
+
|
33
|
+
# Second fork: Creates a grandchild process
|
34
|
+
if os.fork() > 0:
|
35
|
+
# First child exits, orphaning the grandchild
|
36
|
+
sys.exit()
|
37
|
+
# Continues execution in the grandchild process
|
38
|
+
# This process is now fully detached from the original parent and terminal
|
39
|
+
|
40
|
+
|
41
|
+
if __name__ == '__main__':
|
42
|
+
daemonize()
|
15
43
|
parser = argparse.ArgumentParser()
|
16
44
|
parser.add_argument('--parent-pid', type=int, required=True)
|
17
45
|
parser.add_argument('--proc-pid', type=int, required=True)
|
46
|
+
parser.add_argument(
|
47
|
+
'--initial-children',
|
48
|
+
type=str,
|
49
|
+
default='',
|
50
|
+
help=(
|
51
|
+
'Comma-separated list of initial children PIDs. This is to guard '
|
52
|
+
'against the case where the target process has already terminated, '
|
53
|
+
'while the children are still running.'),
|
54
|
+
)
|
18
55
|
args = parser.parse_args()
|
19
56
|
|
20
57
|
process = None
|
@@ -25,32 +62,47 @@ if __name__ == '__main__':
|
|
25
62
|
except psutil.NoSuchProcess:
|
26
63
|
pass
|
27
64
|
|
28
|
-
|
29
|
-
|
65
|
+
# Initialize children list from arguments
|
66
|
+
children = []
|
67
|
+
if args.initial_children:
|
68
|
+
for pid in args.initial_children.split(','):
|
69
|
+
try:
|
70
|
+
child = psutil.Process(int(pid))
|
71
|
+
children.append(child)
|
72
|
+
except (psutil.NoSuchProcess, ValueError):
|
73
|
+
pass
|
30
74
|
|
31
|
-
if parent_process is not None:
|
32
|
-
# Wait for either parent or target process to exit
|
75
|
+
if process is not None and parent_process is not None:
|
76
|
+
# Wait for either parent or target process to exit
|
33
77
|
while process.is_running() and parent_process.is_running():
|
78
|
+
try:
|
79
|
+
tmp_children = process.children(recursive=True)
|
80
|
+
if tmp_children:
|
81
|
+
children = tmp_children
|
82
|
+
except psutil.NoSuchProcess:
|
83
|
+
pass
|
34
84
|
time.sleep(1)
|
35
85
|
|
36
|
-
|
37
|
-
|
38
|
-
children.
|
39
|
-
|
86
|
+
if process is not None:
|
87
|
+
# Kill the target process first to avoid having more children, or fail
|
88
|
+
# the process due to the children being defunct.
|
89
|
+
children = [process] + children
|
90
|
+
|
91
|
+
if not children:
|
40
92
|
sys.exit()
|
41
93
|
|
42
|
-
for
|
94
|
+
for child in children:
|
43
95
|
try:
|
44
|
-
|
96
|
+
child.terminate()
|
45
97
|
except psutil.NoSuchProcess:
|
46
|
-
|
98
|
+
continue
|
47
99
|
|
48
100
|
# Wait 30s for the processes to exit gracefully.
|
49
101
|
time.sleep(30)
|
50
102
|
|
51
103
|
# SIGKILL if they're still running.
|
52
|
-
for
|
104
|
+
for child in children:
|
53
105
|
try:
|
54
|
-
|
106
|
+
child.kill()
|
55
107
|
except psutil.NoSuchProcess:
|
56
|
-
|
108
|
+
continue
|