skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED

@@ -10,7 +10,7 @@ import logging
 import os
 import traceback
 import typing
-from typing import Optional, Set
+from typing import List, Optional, Set

 from sky import backends
 from sky import dag as dag_lib
@@ -30,6 +30,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import env_options
+from sky.utils import instance_links as instance_links_utils
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -74,6 +75,7 @@ class StrategyExecutor:
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         """Initialize the strategy executor.

@@ -87,6 +89,8 @@ class StrategyExecutor:
             starting: Set of job IDs that are currently starting.
             starting_lock: Lock to synchronize starting jobs.
             starting_signal: Condition to signal when a job can start.
+            recover_on_exit_codes: List of exit codes that should trigger
+                recovery regardless of max_restarts_on_errors limit.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -99,6 +103,7 @@ class StrategyExecutor:
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.recover_on_exit_codes = recover_on_exit_codes or []
         self.job_id = job_id
         self.task_id = task_id
         self.pool = pool
@@ -123,6 +128,9 @@ class StrategyExecutor:
     ) -> 'StrategyExecutor':
         """Create a strategy from a task."""

+        # TODO(cooperc): Consider defaulting to FAILOVER if using k8s with a
+        # single context, since there are not multiple clouds/regions to
+        # failover through.
         resource_list = list(task.resources)
         job_recovery = resource_list[0].job_recovery
         for resource in resource_list:
@@ -144,16 +152,26 @@ class StrategyExecutor:
             job_recovery_name: Optional[str] = name
             max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
                                                       0)
+            recover_exit_codes = job_recovery.pop('recover_on_exit_codes', None)
+            # Normalize single integer to list
+            recover_on_exit_codes: Optional[List[int]] = None
+            if isinstance(recover_exit_codes, int):
+                recover_on_exit_codes = [recover_exit_codes]
+            elif isinstance(recover_exit_codes, list):
+                recover_on_exit_codes = [
+                    int(code) for code in recover_exit_codes
+                ]
         else:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
+            recover_on_exit_codes = None
         job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
                                      pool, starting, starting_lock,
-                                     starting_signal)
+                                     starting_signal, recover_on_exit_codes)

     async def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -275,19 +293,25 @@ class StrategyExecutor:
                 break

             try:
-                status =
+                status, transient_error_reason = (
+                    await managed_job_utils.get_job_status(
+                        self.backend,
+                        self.cluster_name,
+                        job_id=self.job_id_on_pool_cluster))
             except Exception as e:  # pylint: disable=broad-except
+                transient_error_reason = common_utils.format_exception(e)
                 # If any unexpected error happens, retry the job checking
                 # loop.
                 # Note: the CommandError is already handled in the
                 # get_job_status, so it should not happen here.
                 # TODO(zhwu): log the unexpected error to usage collection
                 # for future debugging.
-                logger.info(
+                logger.info('Unexpected exception during fetching job status: '
+                            f'{common_utils.format_exception(e)}')
+                continue
+            if transient_error_reason is not None:
+                logger.info('Transient error when fetching the job status: '
+                            f'{transient_error_reason}')
                 continue

             # Check the job status until it is not in initialized status
@@ -444,9 +468,16 @@ class StrategyExecutor:
                     raise
                 logger.info('Managed job cluster launched.')
             else:
+                # Get task resources from DAG for resource-aware
+                # scheduling.
+                task_resources = None
+                if self.dag.tasks:
+                    task = self.dag.tasks[self.task_id]
+                    task_resources = task.resources
+
                 self.cluster_name = await (context_utils.to_thread(
                     serve_utils.get_next_cluster_name, self.pool,
-                    self.job_id))
+                    self.job_id, task_resources))
                 if self.cluster_name is None:
                     raise exceptions.NoClusterLaunchedError(
                         'No cluster name found in the pool.')
@@ -537,6 +568,52 @@ class StrategyExecutor:
                     # At this point, a sky.launch() has succeeded. Cluster
                     # may be UP (no preemption since) or DOWN (newly
                     # preempted).
+                    # Auto-populate instance links if cluster is on a real
+                    # cloud
+                    if self.cluster_name is not None and self.pool is None:
+                        try:
+                            handle = await context_utils.to_thread(
+                                global_user_state.
+                                get_handle_from_cluster_name,
+                                self.cluster_name)
+                            if (handle is not None and hasattr(
+                                    handle, 'cached_cluster_info') and
+                                    handle.cached_cluster_info is not None):
+                                cluster_info = handle.cached_cluster_info
+                                instance_links = (instance_links_utils.
+                                                  generate_instance_links(
+                                                      cluster_info,
+                                                      self.cluster_name))
+                                if instance_links:
+                                    # Store instance links directly in
+                                    # database
+                                    await state.update_links_async(
+                                        self.job_id, self.task_id,
+                                        instance_links)
+                                    logger.debug(
+                                        f'Auto-populated instance links: '
+                                        f'{instance_links}')
+                                else:
+                                    logger.debug('Failed to generate '
+                                                 'instance links')
+                            else:
+                                logger.debug(
+                                    'Cluster handle not found or '
+                                    'cached cluster info is None so'
+                                    'not populating instance links')
+                        except Exception as e:  # pylint: disable=broad-except
+                            # Don't fail the launch if we can't generate
+                            # links
+                            logger.debug(
+                                'Failed to auto-populate instance links: '
+                                f'{e}')
+                    else:
+                        if self.pool:
+                            logger.debug('Not populating instance links '
+                                         'since the cluster is for a pool')
+                        else:
+                            logger.debug('Not populating instance links '
+                                         'since the cluster name is None')
                     job_submitted_at = await (
                         self._wait_until_job_starts_on_cluster())
                     if job_submitted_at is not None:
@@ -589,15 +666,35 @@ class StrategyExecutor:
             # NoClusterLaunchedError.
             assert False, 'Unreachable'

-    def should_restart_on_failure(self
+    def should_restart_on_failure(self,
+                                  exit_codes: Optional[List[int]] = None
+                                  ) -> bool:
         """Increments counter & checks if job should be restarted on a failure.

+        Args:
+            exit_codes: List of exit codes from the failed job. If any exit code
+                matches recover_on_exit_codes, recovery will be triggered
+                regardless of max_restarts_on_errors limit.
+
         Returns:
             True if the job should be restarted, otherwise False.
         """
+        # Check if any exit code matches the configured recover_on_exit_codes
+        # This triggers recovery without incrementing the counter
+        if exit_codes and self.recover_on_exit_codes:
+            for exit_code in exit_codes:
+                if exit_code in self.recover_on_exit_codes:
+                    logger.info(f'Exit code {exit_code} matched '
+                                'recover_on_exit_codes, triggering recovery')
+                    return True
+
+        # Otherwise, check the max_restarts_on_errors counter
         self.restart_cnt_on_failure += 1
        if self.restart_cnt_on_failure > self.max_restarts_on_errors:
             return False
+        logger.info(f'Restart count {self.restart_cnt_on_failure} '
+                    'is less than max_restarts_on_errors, '
+                    'restarting job')
         return True


@@ -620,10 +717,11 @@ class FailoverStrategyExecutor(StrategyExecutor):
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
                          job_id, task_id, pool, starting, starting_lock,
-                         starting_signal)
+                         starting_signal, recover_on_exit_codes)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is

sky/jobs/server/core.py
CHANGED

@@ -25,6 +25,7 @@ from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
+from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
@@ -93,6 +94,51 @@ _MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
 ]


+def _warn_file_mounts_rolling_update(dag: 'sky.Dag') -> None:
+    """Warn if local file mounts or workdir may be lost during rolling update.
+
+    When rolling update is enabled with consolidation mode but no jobs bucket
+    is configured, local file mounts and workdirs are stored locally on the API
+    server pod and will be lost during a rolling update.
+    """
+    # If rolling update is not enabled, don't warn.
+    if os.environ.get(skylet_constants.SKYPILOT_ROLLING_UPDATE_ENABLED) is None:
+        return
+
+    # If consolidation mode is not enabled, don't warn.
+    if not managed_job_utils.is_consolidation_mode():
+        return
+
+    # If a jobs bucket is configured, don't warn.
+    if skypilot_config.get_nested(('jobs', 'bucket'), None) is not None:
+        return
+
+    # Check if any task has local file_mounts (not cloud store URLs) or workdir
+    has_local_file_mounts = False
+    has_local_workdir = False
+    for task_ in dag.tasks:
+        if task_.file_mounts:
+            for src in task_.file_mounts.values():
+                if not data_utils.is_cloud_store_url(src):
+                    has_local_file_mounts = True
+                    break
+        if task_.workdir and isinstance(task_.workdir, str):
+            has_local_workdir = True
+            break
+        if has_local_file_mounts:
+            break
+
+    if not has_local_file_mounts and not has_local_workdir:
+        return
+
+    logger.warning(
+        f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir detected '
+        'with rolling update enabled for API server. To persist files'
+        ' across API server restarts/update, use buckets, volumes, or git '
+        'for your file mounts; or, configure a bucket in your SkyPilot config '
+        f'under `jobs.bucket`. {colorama.Style.RESET_ALL}')
+
+
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.

@@ -103,14 +149,21 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """
     local_to_controller_file_mounts: Dict[str, str] = {}

-    #
-    #
+    # Check if user has explicitly configured a bucket for jobs.
+    # If so, we should use cloud storage even in consolidation mode to persist
+    # files across rolling updates and pod restarts.
+    has_explicit_bucket = skypilot_config.get_nested(('jobs', 'bucket'),
+                                                     None) is not None
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
     force_disable_cloud_bucket = skypilot_config.get_nested(
         ('jobs', 'force_disable_cloud_bucket'), False)
-
-
+    # Use cloud storage if:
+    # 1. Not in consolidation mode, OR
+    # 2. In consolidation mode BUT user has explicit bucket configured
+    # AND storage clouds are available AND cloud bucket is not force-disabled
+    if ((not managed_job_utils.is_consolidation_mode() or has_explicit_bucket)
+            and storage_clouds and not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
@@ -346,6 +399,9 @@ def launch(
             f'with:\n\n`sky down {cluster_name} --purge`\n\n'
             f'Reason: {common_utils.format_exception(e)}')

+    # Warn if file mounts may be lost during rolling update
+    _warn_file_mounts_rolling_update(dag)
+
     local_to_controller_file_mounts = _upload_files_to_controller(dag)
     controller = controller_utils.Controllers.JOBS_CONTROLLER
     controller_name = controller.value.cluster_name
@@ -1216,3 +1272,24 @@ def pool_sync_down_logs(
         replica_ids=worker_ids,
         tail=tail,
         pool=True)
+
+
+@usage_lib.entrypoint
+def get_job_events(
+    job_id: int,
+    task_id: Optional[int] = None,
+    limit: Optional[int] = 10,
+) -> List[Dict[str, Any]]:
+    """Get task events for a managed job.
+
+    Args:
+        job_id: The job ID to get task events for.
+        task_id: Optional task ID to filter by.
+        limit: Optional limit on number of task events to return (default 10).
+
+    Returns:
+        List of task event records.
+    """
+    return managed_job_state.get_job_events(job_id=job_id,
+                                            task_id=task_id,
+                                            limit=limit)

sky/jobs/server/server.py
CHANGED

@@ -242,3 +242,17 @@ async def pool_download_logs(
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+
+
+@router.post('/events')
+async def events(request: fastapi.Request,
+                 body: payloads.GetJobEventsBody) -> None:
+    """Gets task events for a managed job."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.JOBS_EVENTS,
+        request_body=body,
+        func=core.get_job_events,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.JOB_CONTROLLER_NAME,
+    )