skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/data/mounting_utils.py
CHANGED
@@ -7,6 +7,7 @@ import textwrap
 from typing import Optional
 
 from sky import exceptions
+from sky import skypilot_config
 from sky.skylet import constants
 from sky.utils import command_runner
 
@@ -223,7 +224,10 @@ def get_gcs_mount_cmd(bucket_name: str,
     """Returns a command to mount a GCS bucket using gcsfuse."""
     bucket_sub_path_arg = f'--only-dir {_bucket_sub_path} '\
             if _bucket_sub_path else ''
-    […]
+    log_file = '$(mktemp -t gcsfuse.XXXX.log)'
+    mount_cmd = (f'gcsfuse --log-file {log_file} '
+                 '--debug_fuse_errors '
+                 '-o allow_other '
                  '--implicit-dirs '
                  f'--stat-cache-capacity {_STAT_CACHE_CAPACITY} '
                  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
@@ -470,6 +474,13 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
                       f'{hashed_mount_path}.log')
     create_log_cmd = (f'mkdir -p {constants.RCLONE_MOUNT_CACHED_LOG_DIR} && '
                       f'touch {log_file_path}')
+
+    # Check if sequential upload is enabled via config.
+    # Default is False (parallel uploads for better performance).
+    sequential_upload = skypilot_config.get_nested(
+        ('data', 'mount_cached', 'sequential_upload'), False)
+    transfers_flag = '--transfers 1 ' if sequential_upload else ''
+
     # when mounting multiple directories with vfs cache mode, it's handled by
     # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is
     # not necessary to specify separate cache directories.
@@ -488,13 +499,11 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
         # interval allows for faster detection of new or updated files on the
         # remote, but increases the frequency of metadata lookups.
         '--allow-other --vfs-cache-mode full --dir-cache-time 10s '
-        # '--transfers 1' guarantees the files written at the local mount point
-        # to be uploaded to the backend storage in the order of creation.
         # '--vfs-cache-poll-interval' specifies the frequency of how often
         # rclone checks the local mount point for stale objects in cache.
        # '--vfs-write-back' defines the time to write files on remote storage
         # after last use of the file in local mountpoint.
-        '-- […]
+        f'{transfers_flag}--vfs-cache-poll-interval 10s --vfs-write-back 1s '
         # Have rclone evict files if the cache size exceeds 10G.
         # This is to prevent cache from growing too large and
         # using up all the disk space. Note that files that opened
@@ -502,6 +511,9 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
         '--vfs-cache-max-size 10G '
         # give each mount its own cache directory
         f'--cache-dir {constants.RCLONE_CACHE_DIR}/{hashed_mount_path} '
+        # Use a faster fingerprint algorithm to detect changes in files.
+        # Recommended by rclone documentation for buckets like s3.
+        '--vfs-fast-fingerprint '
         # This command produces children processes, which need to be
         # detached from the current process's terminal. The command doesn't
         # produce any output, so we aren't dropping any logs.
@@ -646,8 +658,35 @@ def get_mounting_script(
       else
        echo "No goofys log file found in /tmp"
       fi
+    elif [ "$MOUNT_BINARY" = "gcsfuse" ]; then
+      echo "Looking for gcsfuse log files..."
+      # Find gcsfuse log files in /tmp (created by mktemp -t gcsfuse.XXXX.log)
+      GCSFUSE_LOGS=$(ls -t /tmp/gcsfuse.*.log 2>/dev/null | head -1)
+      if [ -n "$GCSFUSE_LOGS" ]; then
+        echo "=== GCSFuse log file contents ==="
+        cat "$GCSFUSE_LOGS"
+        echo "=== End of gcsfuse log file ==="
+      else
+        echo "No gcsfuse log file found in /tmp"
+      fi
+    elif [ "$MOUNT_BINARY" = "rclone" ]; then
+      echo "Looking for rclone log files..."
+      # Find rclone log files in ~/.sky/rclone_log/ (for MOUNT_CACHED mode)
+      RCLONE_LOG_DIR={constants.RCLONE_MOUNT_CACHED_LOG_DIR}
+      if [ -d "$RCLONE_LOG_DIR" ]; then
+        RCLONE_LOGS=$(ls -t "$RCLONE_LOG_DIR"/*.log 2>/dev/null | head -1)
+        if [ -n "$RCLONE_LOGS" ]; then
+          echo "=== Rclone log file contents ==="
+          tail -50 "$RCLONE_LOGS"
+          echo "=== End of rclone log file ==="
+        else
+          echo "No rclone log file found in $RCLONE_LOG_DIR"
+        fi
+      else
+        echo "Rclone log directory $RCLONE_LOG_DIR not found"
+      fi
     fi
-    # TODO(kevin): Print logs from […]
+    # TODO(kevin): Print logs from blobfuse2, etc too for observability.
     exit $MOUNT_EXIT_CODE
   fi
   echo "Mounting done."
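
For context, the new ('data', 'mount_cached', 'sequential_upload') key toggles whether rclone MOUNT_CACHED uploads are serialized. Below is a minimal standalone sketch of that decision; the helper is hypothetical and only mirrors the flag selection shown in the hunk above, where the package itself reads the key through skypilot_config.get_nested:

from typing import Any, Dict


def rclone_transfer_flags(config: Dict[str, Any]) -> str:
    """Mirror of the new flag choice: '--transfers 1' only when
    data.mount_cached.sequential_upload is true (default: parallel)."""
    sequential_upload = (config.get('data', {}).get('mount_cached', {}).get(
        'sequential_upload', False))
    # Sequential uploads preserve write order at the cost of throughput.
    return '--transfers 1 ' if sequential_upload else ''


# Enabling the knob yields the ordering-preserving flag; the default is ''.
print(rclone_transfer_flags({'data': {'mount_cached': {'sequential_upload': True}}}))
print(rclone_transfer_flags({}))
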
sky/global_user_state.py
CHANGED
@@ -16,7 +16,7 @@ import re
 import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 import uuid
 
 import sqlalchemy
@@ -1020,8 +1020,46 @@ async def cluster_event_retention_daemon():
         await asyncio.sleep(sleep_amount)
 
 
-[…]
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[False],
+    limit: Optional[int] = ...,
+) -> List[str]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[True],
+    limit: Optional[int] = ...,
+) -> List[Dict[str, Union[str, int]]]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = ...,
+    limit: Optional[int] = ...,
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
+    ...
+
+
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = False,
+    limit: Optional[int] = None
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
     """Returns the cluster events for the cluster.
 
     Args:
@@ -1030,22 +1068,44 @@ def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
         cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
             is specified.
         event_type: Type of the event.
+        include_timestamps: If True, returns list of dicts with 'reason' and
+            'transitioned_at' fields. If False, returns list of reason strings.
+        limit: If specified, returns at most this many events (most recent).
+            If None, returns all events.
+
+    Returns:
+        If include_timestamps is False: List of reason strings.
+        If include_timestamps is True: List of dicts with 'reason' and
+        'transitioned_at' (unix timestamp) fields.
+        Events are ordered from oldest to newest.
     """
     assert _SQLALCHEMY_ENGINE is not None
 
-    […]
+    cluster_hash = _resolve_cluster_hash(cluster_hash, cluster_name)
+    if cluster_hash is None:
+        raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if limit is not None:
+            # To get the most recent N events in ASC order, we use a subquery:
+            # 1. Get most recent N events (ORDER BY DESC LIMIT N)
+            # 2. Re-order them by ASC
+            subquery = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.desc()).limit(
+                        limit).subquery()
+            rows = session.query(subquery).order_by(
+                subquery.c.transitioned_at.asc()).all()
+        else:
+            rows = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.asc()).all()
+
+        if include_timestamps:
+            return [{
+                'reason': row.reason,
+                'transitioned_at': row.transitioned_at
+            } for row in rows]
         return [row.reason for row in rows]
 
 
@@ -1537,6 +1597,38 @@ def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
         return row.cluster_hash
 
 
+def _resolve_cluster_hash(cluster_hash: Optional[str] = None,
+                          cluster_name: Optional[str] = None) -> Optional[str]:
+    """Resolve cluster_hash from either cluster_hash or cluster_name.
+
+    Validates that exactly one of cluster_hash or cluster_name is provided,
+    then resolves cluster_name to cluster_hash if needed.
+
+    Args:
+        cluster_hash: Direct cluster hash, if known.
+        cluster_name: Cluster name to resolve to hash.
+
+    Returns:
+        The cluster_hash string, or None if cluster_name was provided but
+        the cluster doesn't exist.
+
+    Raises:
+        ValueError: If both or neither of cluster_hash/cluster_name are
+            provided.
+    """
+    if cluster_hash is not None and cluster_name is not None:
+        raise ValueError(f'Cannot specify both cluster_hash ({cluster_hash}) '
+                         f'and cluster_name ({cluster_name})')
+
+    if cluster_hash is None and cluster_name is None:
+        raise ValueError('Must specify either cluster_hash or cluster_name')
+
+    if cluster_name is not None:
+        return _get_hash_for_existing_cluster(cluster_name)
+
+    return cluster_hash
+
+
 @_init_db
 @metrics_lib.time_me
 def get_launched_resources_from_cluster_hash(
@@ -2241,7 +2333,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
             rows = session.query(volume_table).all()
         else:
            rows = session.query(volume_table).filter_by(
-                is_ephemeral=is_ephemeral).all()
+                is_ephemeral=int(is_ephemeral)).all()
     records = []
     for row in rows:
         records.append({
@@ -2253,7 +2345,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
             'last_attached_at': row.last_attached_at,
             'last_use': row.last_use,
             'status': status_lib.VolumeStatus[row.status],
-            'is_ephemeral': row.is_ephemeral,
+            'is_ephemeral': bool(row.is_ephemeral),
         })
     return records
 
@@ -2316,7 +2408,7 @@ def add_volume(
             last_attached_at=last_attached_at,
             last_use=last_use,
             status=status.value,
-            is_ephemeral=is_ephemeral,
+            is_ephemeral=int(is_ephemeral),
         )
         do_update_stmt = insert_stmnt.on_conflict_do_nothing()
         session.execute(do_update_stmt)
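
A short usage sketch of the reworked get_cluster_events API above. Illustrative only: it assumes an initialized state database, and event_type is passed in abstractly since ClusterEventType is defined outside this diff:

from typing import Dict, List, Union

from sky import global_user_state


def recent_event_reasons(cluster_name: str, event_type) -> List[str]:
    # include_timestamps=False resolves to the List[str] overload: just the
    # reason strings, oldest to newest, capped to the 20 most recent events.
    return global_user_state.get_cluster_events(cluster_name,
                                                None,
                                                event_type,
                                                include_timestamps=False,
                                                limit=20)


def recent_events_with_times(
        cluster_name: str, event_type) -> List[Dict[str, Union[str, int]]]:
    # include_timestamps=True resolves to the dict overload: each entry has
    # 'reason' plus 'transitioned_at' (unix timestamp).
    return global_user_state.get_cluster_events(cluster_name,
                                                None,
                                                event_type,
                                                include_timestamps=True,
                                                limit=20)
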
sky/jobs/client/sdk.py
CHANGED
@@ -7,6 +7,7 @@ import click
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.backends import backend_utils
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.schemas.api import responses
@@ -100,9 +101,13 @@ def launch(
             pool_statuses = sdk.get(pool_status_request_id)
             if not pool_statuses:
                 raise click.UsageError(f'Pool {pool!r} not found.')
-            resources […]
+            # Show the job's requested resources, not the pool worker
+            # resources
+            job_resources_str = backend_utils.get_task_resources_str(
+                dag.tasks[0], is_managed_job=True)
+            click.secho(
+                f'Use resources from pool {pool!r}: {job_resources_str}.',
+                fg='green')
         if num_jobs is not None:
             job_identity = f'{num_jobs} managed jobs'
             prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
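
The message change above prints the job's own resource request rather than the pool worker's. A rough sketch of what that string is built from (assumes a locally constructed task; the exact formatting comes from get_task_resources_str):

import sky
from sky.backends import backend_utils

# A managed job whose requested resources differ from the pool workers'.
task = sky.Task(run='python train.py')
task.set_resources(sky.Resources(cpus='4+', memory='16+'))

job_resources_str = backend_utils.get_task_resources_str(task,
                                                         is_managed_job=True)
# With this change the CLI echoes the job's request, e.g.:
print(f"Use resources from pool 'my-pool': {job_resources_str}.")
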
sky/jobs/controller.py
CHANGED
@@ -2,6 +2,7 @@
 """
 import asyncio
 import io
+import json
 import os
 import pathlib
 import resource
@@ -11,7 +12,7 @@ import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set
+from typing import Dict, List, Optional, Set
 
 import dotenv
 
@@ -31,6 +32,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.server import plugins
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -43,11 +45,16 @@ from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils.plugin_extensions import ExternalClusterFailure
+from sky.utils.plugin_extensions import ExternalFailureSource
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger('sky.jobs.controller')
 
@@ -236,6 +243,64 @@ class JobController:
         await context_utils.to_thread(managed_job_utils.terminate_cluster,
                                       cluster_name)
 
+    async def _get_job_exit_codes(
+        self, job_id: Optional[int],
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+    ) -> Optional[list]:
+        """Retrieve exit codes from the remote cluster.
+
+        Args:
+            job_id: The job ID on the remote cluster.
+            handle: The handle to the cluster.
+
+        Returns:
+            List of exit codes, or None if not available.
+        """
+        try:
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    request = jobsv1_pb2.GetJobExitCodesRequest()
+                    if job_id is not None:
+                        request.job_id = job_id
+
+                    response = await context_utils.to_thread(
+                        backend_utils.invoke_skylet_with_retries,
+                        lambda: cloud_vm_ray_backend.SkyletClient(
+                            handle.get_grpc_channel()).get_job_exit_codes(
+                                request))
+
+                    exit_codes = list(
+                        response.exit_codes) if response.exit_codes else None
+                    return exit_codes
+                except exceptions.SkyletMethodNotImplementedError:
+                    # Fall back to legacy if RPC not implemented
+                    use_legacy = True
+
+            if use_legacy:
+                # Use existing SSH-based code generation
+                code = job_lib.JobLibCodeGen.get_job_exit_codes(job_id)
+
+                returncode, stdout, stderr = await context_utils.to_thread(
+                    self._backend.run_on_head,
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+
+                if returncode != 0:
+                    logger.debug(f'Failed to retrieve exit codes: {stderr}')
+                    return None
+
+                exit_codes = json.loads(stdout.strip())
+                return exit_codes
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to retrieve job exit codes: {e}')
+            return None
+        return None
+
     async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.
 
@@ -334,6 +399,12 @@ class JobController:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
 
+        # Get full_resources_json using get_resource_config which handles
+        # heterogeneous resource configurations (any_of/ordered).
+        full_resources_json = None
+        if task.resources:
+            full_resources_json = task.get_resource_config()
+
         await managed_job_state.set_starting_async(
             self._job_id,
             task_id,
@@ -342,9 +413,12 @@ class JobController:
             resources_str=resources_str,
             specs={
                 'max_restarts_on_errors':
-                    self._strategy_executor.max_restarts_on_errors
+                    self._strategy_executor.max_restarts_on_errors,
+                'recover_on_exit_codes':
+                    self._strategy_executor.recover_on_exit_codes
             },
-            callback_func=callback_func)
+            callback_func=callback_func,
+            full_resources_json=full_resources_json)
         logger.info(f'Submitted managed job {self._job_id} '
                     f'(task: {task_id}, name: {task.name!r}); '
                     f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
@@ -365,9 +439,8 @@ class JobController:
             launch_time = time.time() - launch_start
            logger.info(f'Cluster launch completed in {launch_time:.2f}s')
             assert remote_job_submitted_at is not None, remote_job_submitted_at
-            […]
-        else:
+        job_id_on_pool_cluster: Optional[int] = None
+        if self._pool:
             # Update the cluster name when using pool.
             cluster_name, job_id_on_pool_cluster = (
                 await
@@ -411,6 +484,8 @@ class JobController:
            except KeyError:
                 pass
 
+        transient_job_check_error_start_time = None
+        job_check_backoff = None
         while True:
             status_check_count += 1
 
@@ -462,19 +537,38 @@ class JobController:
             # recovering, we will set the job status to None, which will force
             # enter the recovering logic.
             job_status = None
+            transient_job_check_error_reason = None
             if not force_transit_to_recovering:
                 try:
-                    job_status = await […]
+                    job_status, transient_job_check_error_reason = await (
+                        managed_job_utils.get_job_status(
+                            self._backend,
+                            cluster_name,
+                            job_id=job_id_on_pool_cluster,
+                        ))
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
                         f'Exception: {common_utils.format_exception(fetch_e)}\n'
                         f'Traceback: {traceback.format_exc()}')
 
+            # When job status check fails, we need to retry to avoid false alarm
+            # for job failure, as it could be a transient error for
+            # communication issue.
+            if transient_job_check_error_reason is not None:
+                logger.info(
+                    'Potential transient error when fetching the job '
+                    f'status. Reason: {transient_job_check_error_reason}.\n'
+                    'Check cluster status to determine if the job is '
+                    'preempted or failed.')
+                if transient_job_check_error_start_time is None:
+                    transient_job_check_error_start_time = time.time()
+                    job_check_backoff = common_utils.Backoff(
+                        initial_backoff=1, max_backoff_factor=5)
+            else:
+                transient_job_check_error_start_time = None
+                job_check_backoff = None
+
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 logger.info(f'Task {task_id} succeeded! '
                             'Getting end time and cleaning up')
@@ -550,15 +644,16 @@ class JobController:
 
             # Pull the actual cluster status from the cloud provider to
             # determine whether the cluster is preempted or failed.
-            # […]
-            (cluster_status, […]
+            # NOTE: Some failures may not be reflected in the cluster status
+            # depending on the cloud, which can also cause failure of the job.
+            # Plugins can report such failures via ExternalFailureSource.
+            # TODO(cooperc): do we need to add this to asyncio thread?
+            (cluster_status, handle) = await context_utils.to_thread(
+                backend_utils.refresh_cluster_status_handle,
+                cluster_name,
+                force_refresh_statuses=set(status_lib.ClusterStatus))
+
+            external_failures: Optional[List[ExternalClusterFailure]] = None
             if cluster_status != status_lib.ClusterStatus.UP:
                 # The cluster is (partially) preempted or failed. It can be
                 # down, INIT or STOPPED, based on the interruption behavior of
@@ -569,6 +664,15 @@ class JobController:
                 logger.info(
                     f'Cluster is preempted or failed{cluster_status_str}. '
                     'Recovering...')
+                if ExternalFailureSource.is_registered():
+                    cluster_failures = await context_utils.to_thread(
+                        ExternalFailureSource.get, cluster_name=cluster_name)
+                    if cluster_failures:
+                        logger.info(
+                            f'Detected cluster failures: {cluster_failures}')
+                        external_failures = (
+                            ExternalClusterFailure.from_failure_list(
+                                cluster_failures))
             else:
                 if job_status is not None and not job_status.is_terminal():
                     # The multi-node job is still running, continue monitoring.
@@ -612,18 +716,37 @@ class JobController:
                             'can be caused by the job taking too much memory '
                            'or other resources. Try adding more memory, CPU, '
                             f'or disk in your job definition. {failure_reason}')
+
+                    # Retrieve exit codes from the failed job
+                    exit_codes = await self._get_job_exit_codes(
+                        job_id_on_pool_cluster, handle)
+
                     should_restart_on_failure = (
-                        self._strategy_executor.should_restart_on_failure())
+                        self._strategy_executor.should_restart_on_failure(
+                            exit_codes=exit_codes))
                     if should_restart_on_failure:
                         max_restarts = (
                             self._strategy_executor.max_restarts_on_errors)
-                        […]
-                            f'Retry the job as max_restarts_on_errors is '
-                            f'set to {max_restarts}. '
+                        exit_code_msg = (
+                            '(Retry the job as '
+                            f'max_restarts_on_errors is set to {max_restarts}. '
                             f'[{self._strategy_executor.restart_cnt_on_failure}'
-                            f'/{max_restarts}]')
+                            f'/{max_restarts}])')
+                        if (exit_codes and
+                                self._strategy_executor.recover_on_exit_codes):
+                            recover_codes = (
+                                self._strategy_executor.recover_on_exit_codes)
+                            matching_codes = [
+                                c for c in exit_codes if c in recover_codes
+                            ]
+                            if matching_codes:
+                                exit_code_msg = (
+                                    f'(Exit code(s) {matching_codes} matched '
+                                    'recover_on_exit_codes '
+                                    f'[{recover_codes}])')
+                        logger.info(
+                            'User program crashed '
+                            f'({managed_job_status.value}). {exit_code_msg}')
                     else:
                         logger.info(
                             f'Task {task_id} failed and will not be retried')
@@ -655,9 +778,42 @@ class JobController:
            # job status. Try to recover the job (will not restart the
             # cluster, if the cluster is healthy).
             assert job_status is None, job_status
-            […]
+            if transient_job_check_error_reason is not None:
+                assert (transient_job_check_error_start_time
+                        is not None), (
+                            transient_job_check_error_start_time,
+                            transient_job_check_error_reason)
+                assert job_check_backoff is not None, (
+                    job_check_backoff, transient_job_check_error_reason)
+                elapsed = time.time(
+                ) - transient_job_check_error_start_time
+                if (elapsed < managed_job_utils.
+                        JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS):
+                    remaining_timeout = (
+                        managed_job_utils.
+                        JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS -
+                        elapsed)
+                    backoff_time = min(
+                        job_check_backoff.current_backoff(),
+                        remaining_timeout)
+                    logger.info(
+                        'Failed to fetch the job status while the '
+                        'cluster is healthy. Retrying to avoid false'
+                        'alarm for job failure. Retrying in '
+                        f'{backoff_time:.1f} seconds...')
+                    await asyncio.sleep(backoff_time)
+                    continue
+                else:
+                    logger.info(
+                        'Failed to fetch the job status after retrying '
+                        f'for {elapsed:.1f} seconds. Try to recover '
+                        'the job by restarting the job/cluster.')
+            else:
+                logger.info(
+                    'Failed to fetch the job status due to '
+                    'unrecoverable error. Try to recover the job by'
+                    ' restarting the job/cluster.')
+
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources
@@ -688,7 +844,9 @@ class JobController:
                 job_id=self._job_id,
                 task_id=task_id,
                 force_transit_to_recovering=force_transit_to_recovering,
-                callback_func=callback_func)
+                callback_func=callback_func,
+                external_failures=external_failures,
+            )
 
             recovered_time = await self._strategy_executor.recover()
 
@@ -1183,6 +1341,8 @@ async def main(controller_uuid: str):
 
     context_utils.hijack_sys_attrs()
 
+    plugins.load_plugins(plugins.ExtensionContext())
+
     controller = ControllerManager(controller_uuid)
 
     # Will happen multiple times, who cares though
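
In the hunks above, the matched exit codes select which retry message is logged, while the restart decision itself is delegated to should_restart_on_failure(exit_codes=...) in recovery_strategy, which is not shown here. A condensed, standalone restatement of the message selection (not the controller's actual code path):

from typing import List, Optional


def build_retry_message(exit_codes: Optional[List[int]],
                        recover_codes: Optional[List[int]],
                        restart_cnt: int, max_restarts: int) -> str:
    # Default: report progress against the max_restarts_on_errors budget.
    msg = ('(Retry the job as max_restarts_on_errors is set to '
           f'{max_restarts}. [{restart_cnt}/{max_restarts}])')
    # If any observed exit code matches recover_on_exit_codes, report that
    # match instead, mirroring the controller's logging above.
    if exit_codes and recover_codes:
        matching = [c for c in exit_codes if c in recover_codes]
        if matching:
            msg = (f'(Exit code(s) {matching} matched recover_on_exit_codes '
                   f'[{recover_codes}])')
    return msg


print(build_retry_message([137], [137], restart_cnt=1, max_restarts=3))
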