skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +160 -23
- sky/backends/cloud_vm_ray_backend.py +226 -74
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +23 -18
- sky/clouds/aws.py +26 -6
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +734 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +709 -508
- sky/jobs/utils.py +90 -40
- sky/logs/agent.py +10 -2
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +55 -27
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +9 -1
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +14 -7
- sky/skylet/events.py +2 -10
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +25 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -4,9 +4,11 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
 import datetime
 import enum
+import logging
 import os
 import pathlib
 import shlex
@@ -14,11 +16,11 @@ import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Deque, Dict, List, Optional, Set, TextIO,
+from typing import (Any, Deque, Dict, List, Literal, Optional, Set, TextIO,
+                    Tuple, Union)
 
 import colorama
 import filelock
-from typing_extensions import Literal
 
 from sky import backends
 from sky import exceptions
@@ -37,6 +39,7 @@ from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
@@ -56,9 +59,9 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15
 
 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
@@ -82,7 +85,7 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS =
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 
 
 class ManagedJobQueueResultType(enum.Enum):
@@ -99,7 +102,11 @@ class UserSignal(enum.Enum):
 
 
 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -122,18 +129,18 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
-
+            _logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
-
+            _logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
-
+                _logger.error(f' Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())
 
 
@@ -183,6 +190,9 @@ def _validate_consolidation_mode_config(
 # Use LRU Cache so that the check is only done once.
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode() -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
     # We should only do this check on API server, as the controller will not
@@ -199,6 +209,7 @@ def ha_recovery_for_consolidation_mode():
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
     runner = command_runner.LocalProcessCommandRunner()
+    scheduler.maybe_start_controllers()
     with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
               'w',
               encoding='utf-8') as f:
@@ -214,7 +225,7 @@ def ha_recovery_for_consolidation_mode():
         # just keep running.
         if controller_pid is not None:
             try:
-                if
+                if controller_process_alive(controller_pid, job_id):
                     f.write(f'Controller pid {controller_pid} for '
                             f'job {job_id} is still running. '
                             'Skipping recovery.\n')
@@ -227,7 +238,7 @@ def ha_recovery_for_consolidation_mode():
 
         if job['schedule_state'] not in [
                 managed_job_state.ManagedJobScheduleState.DONE,
-                managed_job_state.ManagedJobScheduleState.WAITING
+                managed_job_state.ManagedJobScheduleState.WAITING,
         ]:
             script = managed_job_state.get_ha_recovery_script(job_id)
             if script is None:
@@ -242,56 +253,66 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')
 
 
-def get_job_status(
-
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int],
+        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
-
+        job_logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-
-            statuses = backend.get_job_status
-
-
+            job_logger.info('=== Checking the job status... ===')
+            statuses = await context_utils.to_thread(backend.get_job_status,
+                                                     handle,
+                                                     job_ids=job_ids,
+                                                     stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
-
+                job_logger.info('No job found.')
             else:
-
-
+                job_logger.info(f'Job status: {status}')
+                job_logger.info('=' * 34)
             return status
         except exceptions.CommandError as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
            if (e.detailed_reason is not None and
                    _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
-
-
-
-
+                job_logger.info('Failed to connect to the cluster. Retrying '
+                                f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                job_logger.info('=' * 34)
+                await asyncio.sleep(1)
            else:
-
-
+                job_logger.info(
+                    f'Failed to get job status: {e.detailed_reason}')
+                job_logger.info('=' * 34)
                return None
    return None
 
 
-def
+def controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
+        if pid < 0:
+            # new job controller process will always be negative
+            pid = -pid
         process = psutil.Process(pid)
         cmd_str = ' '.join(process.cmdline())
-        return process.is_running() and f'--job-id {job_id}' in cmd_str
+        return process.is_running() and ((f'--job-id {job_id}' in cmd_str) or
+                                         ('controller' in cmd_str))
     except psutil.NoSuchProcess:
         return False
 
@@ -466,7 +487,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if
+            if controller_process_alive(pid, job_id):
                 # The controller is still running, so this job is fine.
                 continue
 
@@ -565,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise
 
 
-def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""
 
     def callback_func(status: str):
@@ -604,7 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')
 
-
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
+
+    return async_callback_func
 
 
 # ======== user functions ========
@@ -651,16 +677,41 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             logger.info(f'Job {job_id} is already in terminal state '
                         f'{job_status.value}. Skipped.')
             continue
+        elif job_status == managed_job_state.ManagedJobStatus.PENDING:
+            # the if is a short circuit, this will be atomic.
+            cancelled = managed_job_state.set_pending_cancelled(job_id)
+            if cancelled:
+                cancelled_job_ids.append(job_id)
+                continue
 
         update_managed_jobs_statuses(job_id)
 
+        job_controller_pid = managed_job_state.get_job_controller_pid(job_id)
+        if job_controller_pid is not None and job_controller_pid < 0:
+            # This is a consolidated job controller, so we need to cancel the
+            # with the controller server API
+            try:
+                # we create a file as a signal to the controller server
+                signal_file = pathlib.Path(
+                    managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
+                signal_file.touch()
+                cancelled_job_ids.append(job_id)
+            except OSError as e:
+                logger.error(f'Failed to cancel job {job_id} '
+                             f'with controller server: {e}')
+                # don't add it to the to be cancelled job ids, since we don't
+                # know for sure yet.
+                continue
+            continue
+
         job_workspace = managed_job_state.get_workspace(job_id)
         if current_workspace is not None and job_workspace != current_workspace:
             wrong_workspace_job_ids.append(job_id)
             continue
 
         # Send the signal to the jobs controller.
-        signal_file = pathlib.Path(
+        signal_file = (pathlib.Path(
+            managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
         # Filelock is needed to prevent race condition between signal
         # check/removal and signal writing.
         with filelock.FileLock(str(signal_file) + '.lock'):
@@ -1159,8 +1210,7 @@ def dump_managed_job_queue(
             # It's possible for a WAITING/ALIVE_WAITING job to be ready to
             # launch, but the scheduler just hasn't run yet.
             managed_job_state.ManagedJobScheduleState.WAITING,
-            managed_job_state.ManagedJobScheduleState.ALIVE_WAITING
-        ):
+            managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
             # This job will not block others.
             continue
 
@@ -1370,12 +1420,12 @@ def load_managed_job_queue(
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
    if isinstance(result, dict):
-        jobs = result['jobs']
-        total = result['total']
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
        status_counts = result.get('status_counts', {})
-        total_no_filter = result.get('total_no_filter', total)
+        total_no_filter: int = result.get('total_no_filter', total)
    else:
        jobs = result
        total = len(jobs)
sky/logs/agent.py
CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                     cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/aws/config.py
CHANGED
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
     Returns:
         A list of route tables associated with the options VPC and region
     """
-    filters
+    filters: List['ec2_type_defs.FilterTypeDef'] = [{
+        'Name': 'association.main',
+        'Values': [str(main).lower()],
+    }]
     if vpc_id is not None:
         filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
     logger.debug(
sky/provision/gcp/config.py
CHANGED
@@ -5,6 +5,8 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple
 
+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role
 
 
+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
         }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
sky/provision/kubernetes/config.py
CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
 class KubernetesError(Exception):
-
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)
sky/provision/kubernetes/instance.py
CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
-
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
-
-
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
-
-
-
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
- (old lines 209-243 removed; their content is not shown in this diff view)
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                        ('didn\'t match Pod\'s node affinity/selector'
+                         in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')
sky/provision/kubernetes/utils.py
CHANGED
@@ -451,6 +451,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +472,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):
@@ -1012,15 +1016,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-
-
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False
 
@@ -1448,9 +1453,13 @@ def get_accelerator_label_key_values(
         if is_multi_host_tpu(node_metadata_labels):
             continue
         for label, value in label_list:
-            if
-
-
+            if label_formatter.match_label_key(label):
+                # match either canonicalized name or raw name
+                accelerator = (label_formatter.
+                               get_accelerator_from_label_value(value))
+                viable = [value.lower(), accelerator.lower()]
+                if acc_type.lower() not in viable:
+                    continue
                 if is_tpu_on_gke(acc_type):
                     assert isinstance(label_formatter,
                                       GKELabelFormatter)
sky/provision/provisioner.py
CHANGED
sky/provision/vast/instance.py
CHANGED
@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,
 
 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None
 
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py
ADDED
@@ -0,0 +1,34 @@
+"""Add skylet_ssh_tunnel_metadata to clusters.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-09-09
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add skylet_ssh_tunnel_metadata column to clusters."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'skylet_ssh_tunnel_metadata',
+                                             sa.LargeBinary(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass