skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +160 -23
  4. sky/backends/cloud_vm_ray_backend.py +226 -74
  5. sky/catalog/__init__.py +7 -0
  6. sky/catalog/aws_catalog.py +4 -0
  7. sky/catalog/common.py +18 -0
  8. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  9. sky/client/cli/command.py +2 -71
  10. sky/client/sdk.py +20 -0
  11. sky/client/sdk_async.py +23 -18
  12. sky/clouds/aws.py +26 -6
  13. sky/clouds/cloud.py +8 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/volumes.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/data/storage.py +5 -1
  33. sky/execution.py +21 -14
  34. sky/global_user_state.py +34 -0
  35. sky/jobs/client/sdk_async.py +4 -2
  36. sky/jobs/constants.py +3 -0
  37. sky/jobs/controller.py +734 -310
  38. sky/jobs/recovery_strategy.py +251 -129
  39. sky/jobs/scheduler.py +247 -174
  40. sky/jobs/server/core.py +20 -4
  41. sky/jobs/server/utils.py +2 -2
  42. sky/jobs/state.py +709 -508
  43. sky/jobs/utils.py +90 -40
  44. sky/logs/agent.py +10 -2
  45. sky/provision/aws/config.py +4 -1
  46. sky/provision/gcp/config.py +6 -1
  47. sky/provision/kubernetes/config.py +7 -2
  48. sky/provision/kubernetes/instance.py +84 -41
  49. sky/provision/kubernetes/utils.py +17 -8
  50. sky/provision/provisioner.py +1 -0
  51. sky/provision/vast/instance.py +1 -1
  52. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  53. sky/serve/replica_managers.py +0 -7
  54. sky/serve/serve_utils.py +5 -0
  55. sky/serve/server/impl.py +1 -2
  56. sky/serve/service.py +0 -2
  57. sky/server/common.py +8 -3
  58. sky/server/config.py +55 -27
  59. sky/server/constants.py +1 -0
  60. sky/server/daemons.py +7 -11
  61. sky/server/metrics.py +41 -8
  62. sky/server/requests/executor.py +41 -4
  63. sky/server/requests/serializers/encoders.py +1 -1
  64. sky/server/server.py +9 -1
  65. sky/server/uvicorn.py +11 -5
  66. sky/setup_files/dependencies.py +4 -2
  67. sky/skylet/attempt_skylet.py +1 -0
  68. sky/skylet/constants.py +14 -7
  69. sky/skylet/events.py +2 -10
  70. sky/skylet/log_lib.py +11 -0
  71. sky/skylet/log_lib.pyi +9 -0
  72. sky/task.py +62 -0
  73. sky/templates/kubernetes-ray.yml.j2 +120 -3
  74. sky/utils/accelerator_registry.py +3 -1
  75. sky/utils/command_runner.py +35 -11
  76. sky/utils/command_runner.pyi +25 -3
  77. sky/utils/common_utils.py +11 -1
  78. sky/utils/context_utils.py +15 -2
  79. sky/utils/controller_utils.py +5 -0
  80. sky/utils/db/db_utils.py +31 -2
  81. sky/utils/db/migration_utils.py +1 -1
  82. sky/utils/git.py +559 -1
  83. sky/utils/resource_checker.py +8 -7
  84. sky/utils/rich_utils.py +3 -1
  85. sky/utils/subprocess_utils.py +9 -0
  86. sky/volumes/volume.py +2 -0
  87. sky/workspaces/core.py +57 -21
  88. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
  89. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
  90. sky/client/cli/git.py +0 -549
  91. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  92. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  93. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  94. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  95. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -4,9 +4,11 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
 import datetime
 import enum
+import logging
 import os
 import pathlib
 import shlex
@@ -14,11 +16,11 @@ import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Deque, Dict, List, Optional, Set, TextIO, Tuple, Union
+from typing import (Any, Deque, Dict, List, Literal, Optional, Set, TextIO,
+                    Tuple, Union)

 import colorama
 import filelock
-from typing_extensions import Literal

 from sky import backends
 from sky import exceptions
@@ -37,6 +39,7 @@ from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
@@ -56,9 +59,9 @@ else:

 logger = sky_logging.init_logger(__name__)

-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-JOB_STATUS_CHECK_GAP_SECONDS = 20
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15

 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
@@ -82,7 +85,7 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120


 class ManagedJobQueueResultType(enum.Enum):
@@ -99,7 +102,11 @@ class UserSignal(enum.Enum):


 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -122,18 +129,18 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
+            _logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
+            _logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
+                _logger.error(f'  Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())


@@ -183,6 +190,9 @@ def _validate_consolidation_mode_config(
 # Use LRU Cache so that the check is only done once.
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode() -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
     # We should only do this check on API server, as the controller will not
@@ -199,6 +209,7 @@ def ha_recovery_for_consolidation_mode():
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
     runner = command_runner.LocalProcessCommandRunner()
+    scheduler.maybe_start_controllers()
     with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
               'w',
               encoding='utf-8') as f:
@@ -214,7 +225,7 @@ def ha_recovery_for_consolidation_mode():
             # just keep running.
             if controller_pid is not None:
                 try:
-                    if _controller_process_alive(controller_pid, job_id):
+                    if controller_process_alive(controller_pid, job_id):
                         f.write(f'Controller pid {controller_pid} for '
                                 f'job {job_id} is still running. '
                                 'Skipping recovery.\n')
@@ -227,7 +238,7 @@ def ha_recovery_for_consolidation_mode():

             if job['schedule_state'] not in [
                     managed_job_state.ManagedJobScheduleState.DONE,
-                    managed_job_state.ManagedJobScheduleState.WAITING
+                    managed_job_state.ManagedJobScheduleState.WAITING,
             ]:
                 script = managed_job_state.get_ha_recovery_script(job_id)
                 if script is None:
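
Note: the hunk that follows converts get_job_status into a coroutine, offloading blocking calls through context_utils.to_thread (a SkyPilot helper) and sleeping with asyncio.sleep so a single process can drive many job controllers. A minimal stdlib-only sketch of the same offloading pattern, with illustrative names (fetch_status_blocking, poll_status, 'my-cluster') that do not exist in SkyPilot:

    import asyncio
    import time


    def fetch_status_blocking(cluster_name: str) -> str:
        """Stand-in for a blocking backend call (e.g. an SSH round trip)."""
        time.sleep(1)
        return f'{cluster_name}: RUNNING'


    async def poll_status(cluster_name: str, interval: float = 15.0) -> None:
        """Poll a cluster without blocking the event loop running other tasks."""
        for _ in range(3):
            # Run the blocking call in a worker thread and await its result.
            status = await asyncio.to_thread(fetch_status_blocking, cluster_name)
            print(status)
            # asyncio.sleep (not time.sleep) lets other coroutines make progress.
            await asyncio.sleep(interval)


    if __name__ == '__main__':
        asyncio.run(poll_status('my-cluster', interval=0.1))
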
@@ -242,56 +253,66 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')


-def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int],
+        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
-        logger.info(f'Cluster {cluster_name} not found.')
+        job_logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-            logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle,
-                                              job_ids=job_ids,
-                                              stream_logs=False)
+            job_logger.info('=== Checking the job status... ===')
+            statuses = await context_utils.to_thread(backend.get_job_status,
+                                                     handle,
+                                                     job_ids=job_ids,
+                                                     stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
-                logger.info('No job found.')
+                job_logger.info('No job found.')
             else:
-                logger.info(f'Job status: {status}')
-                logger.info('=' * 34)
+                job_logger.info(f'Job status: {status}')
+                job_logger.info('=' * 34)
             return status
         except exceptions.CommandError as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
             if (e.detailed_reason is not None and
                     _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
-                logger.info('Failed to connect to the cluster. Retrying '
-                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                logger.info('=' * 34)
-                time.sleep(1)
+                job_logger.info('Failed to connect to the cluster. Retrying '
+                                f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                job_logger.info('=' * 34)
+                await asyncio.sleep(1)
             else:
-                logger.info(f'Failed to get job status: {e.detailed_reason}')
-                logger.info('=' * 34)
+                job_logger.info(
+                    f'Failed to get job status: {e.detailed_reason}')
+                job_logger.info('=' * 34)
                 return None
     return None


-def _controller_process_alive(pid: int, job_id: int) -> bool:
+def controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
+        if pid < 0:
+            # new job controller process will always be negative
+            pid = -pid
         process = psutil.Process(pid)
         cmd_str = ' '.join(process.cmdline())
-        return process.is_running() and f'--job-id {job_id}' in cmd_str
+        return process.is_running() and ((f'--job-id {job_id}' in cmd_str) or
+                                         ('controller' in cmd_str))
     except psutil.NoSuchProcess:
         return False

@@ -466,7 +487,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if _controller_process_alive(pid, job_id):
+            if controller_process_alive(pid, job_id):
                 # The controller is still running, so this job is fine.
                 continue

@@ -565,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
             raise


-def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""

     def callback_func(status: str):
@@ -604,7 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')

-    return callback_func
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
+
+    return async_callback_func


 # ======== user functions ========
@@ -651,16 +677,41 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             logger.info(f'Job {job_id} is already in terminal state '
                         f'{job_status.value}. Skipped.')
             continue
+        elif job_status == managed_job_state.ManagedJobStatus.PENDING:
+            # the if is a short circuit, this will be atomic.
+            cancelled = managed_job_state.set_pending_cancelled(job_id)
+            if cancelled:
+                cancelled_job_ids.append(job_id)
+                continue

         update_managed_jobs_statuses(job_id)

+        job_controller_pid = managed_job_state.get_job_controller_pid(job_id)
+        if job_controller_pid is not None and job_controller_pid < 0:
+            # This is a consolidated job controller, so we need to cancel the
+            # with the controller server API
+            try:
+                # we create a file as a signal to the controller server
+                signal_file = pathlib.Path(
+                    managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
+                signal_file.touch()
+                cancelled_job_ids.append(job_id)
+            except OSError as e:
+                logger.error(f'Failed to cancel job {job_id} '
+                             f'with controller server: {e}')
+                # don't add it to the to be cancelled job ids, since we don't
+                # know for sure yet.
+                continue
+            continue
+
         job_workspace = managed_job_state.get_workspace(job_id)
         if current_workspace is not None and job_workspace != current_workspace:
             wrong_workspace_job_ids.append(job_id)
             continue

         # Send the signal to the jobs controller.
-        signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
+        signal_file = (pathlib.Path(
+            managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
         # Filelock is needed to prevent race condition between signal
         # check/removal and signal writing.
         with filelock.FileLock(str(signal_file) + '.lock'):
@@ -1159,8 +1210,7 @@ def dump_managed_job_queue(
                 # It's possible for a WAITING/ALIVE_WAITING job to be ready to
                 # launch, but the scheduler just hasn't run yet.
                 managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING,
-        ):
+                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
             # This job will not block others.
             continue

@@ -1370,12 +1420,12 @@ def load_managed_job_queue(
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(result, dict):
-        jobs = result['jobs']
-        total = result['total']
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
         status_counts = result.get('status_counts', {})
-        total_no_filter = result.get('total_no_filter', total)
+        total_no_filter: int = result.get('total_no_filter', total)
     else:
         jobs = result
         total = len(jobs)
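
Note: for consolidation-mode jobs (recorded with a negative controller PID), cancel_jobs_by_id above requests cancellation by touching a file named after the job id under managed_job_constants.CONSOLIDATED_SIGNAL_PATH instead of signalling a dedicated controller process. A minimal sketch of that file-based handshake, assuming a hypothetical signal directory; the controller-side consumption shown here is illustrative and not part of this diff:

    import pathlib

    # Hypothetical directory; the real value comes from
    # sky.jobs.constants.CONSOLIDATED_SIGNAL_PATH.
    SIGNAL_DIR = pathlib.Path('/tmp/sky_consolidated_signals')


    def request_cancel(job_id: int) -> None:
        """Caller side: an empty file named after the job id is the signal."""
        SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
        (SIGNAL_DIR / str(job_id)).touch()


    def consume_cancel_signal(job_id: int) -> bool:
        """Controller side (sketch): check for the file and remove it if seen."""
        signal_file = SIGNAL_DIR / str(job_id)
        if signal_file.exists():
            signal_file.unlink(missing_ok=True)
            return True
        return False
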
sky/logs/agent.py CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                           cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/aws/config.py CHANGED
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
     Returns:
         A list of route tables associated with the options VPC and region
     """
-    filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
+    filters: List['ec2_type_defs.FilterTypeDef'] = [{
+        'Name': 'association.main',
+        'Values': [str(main).lower()],
+    }]
     if vpc_id is not None:
         filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
     logger.debug(
sky/provision/gcp/config.py CHANGED
@@ -5,6 +5,8 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple

+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role


+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
sky/provision/kubernetes/config.py CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],


 class KubernetesError(Exception):
-    pass
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)
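
Note: KubernetesError now carries an optional insufficent_resources list (spelling as in the source), so callers can distinguish capacity shortfalls from other scheduling failures. A sketch of how a caller might inspect it; explain_capacity_error and the provisioning call are hypothetical, and only the exception shape comes from this diff:

    from typing import List, Optional

    from sky.provision.kubernetes import config as config_lib


    def explain_capacity_error(err: config_lib.KubernetesError) -> Optional[str]:
        """Hypothetical helper: summarize which resources the cluster lacked."""
        lacking: Optional[List[str]] = getattr(err, 'insufficent_resources', None)
        if not lacking:
            return None  # Not a capacity problem, or raised without the field.
        return 'Kubernetes cluster lacks capacity for: ' + ', '.join(lacking)


    # Usage sketch (provision_pods is a placeholder for the real call site):
    # try:
    #     provision_pods(...)
    # except config_lib.KubernetesError as e:
    #     msg = explain_capacity_error(e)
    #     if msg is not None:
    #         print(msg)
    #     raise
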
sky/provision/kubernetes/instance.py CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
-                logger.info(event_message)
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('CPU', pod, details=event_message))
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('memory', pod,
-                                           details=event_message))
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
-                if pod.spec.node_selector:
-                    for label_key in pod.spec.node_selector.keys():
-                        if label_key in gpu_lf_keys:
-                            # TODO(romilb): We may have additional node
-                            # affinity selectors in the future - in that
-                            # case we will need to update this logic.
-                            # TODO(Doyoung): Update the error message raised
-                            # with the multi-host TPU support.
-                            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
-                            if 'Insufficient google.com/tpu' in event_message:
-                                extra_msg = (
-                                    f'Verify if '
-                                    f'{pod.spec.node_selector[label_key]}'
-                                    ' is available in the cluster. Note '
-                                    'that multi-host TPU podslices are '
-                                    'currently not unsupported.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('TPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
-                            elif ((f'Insufficient {gpu_resource_key}'
-                                   in event_message) or
-                                  ('didn\'t match Pod\'s node affinity/selector'
-                                   in event_message)):
-                                extra_msg = (
-                                    f'Verify if any node matching label '
-                                    f'{pod.spec.node_selector[label_key]} and '
-                                    f'sufficient resource {gpu_resource_key} '
-                                    f'is available in the cluster.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('GPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                        ('didn\'t match Pod\'s node affinity/selector'
+                         in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')
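
Note: to print the capacity summary without the usual log prefixes, the hunk above temporarily attaches a prefix-free handler, disables propagation, and restores both in a finally block. EnvAwareHandler and NO_PREFIX_FORMATTER are SkyPilot internals; a stdlib-only sketch of the same pattern, with an illustrative logger name and helper:

    import logging
    import sys

    logger = logging.getLogger('sky.provision.kubernetes')  # illustrative name


    def log_clean_error(message: str) -> None:
        """Emit one message without prefixes, then restore the logger state."""
        tmp_handler = logging.StreamHandler(sys.stdout)
        tmp_handler.setFormatter(logging.Formatter('%(message)s'))  # no prefix
        tmp_handler.setLevel(logging.ERROR)
        prev_propagate = logger.propagate
        try:
            logger.addHandler(tmp_handler)
            logger.propagate = False  # avoid double-printing via parent handlers
            logger.error(message)
        finally:
            logger.removeHandler(tmp_handler)
            logger.propagate = prev_propagate
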
sky/provision/kubernetes/utils.py CHANGED
@@ -451,6 +451,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     LABEL_KEY = 'gpu.nvidia.com/class'

+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +472,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)


 class GKELabelFormatter(GPULabelFormatter):
@@ -1012,15 +1016,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False

@@ -1448,9 +1453,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value).lower() == acc_type.lower()):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
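
Note: the label-formatter changes above canonicalize provider label values (for example 'H100_NVLINK_80GB' to 'H100') and match a requested accelerator against both the raw and the canonical name, case-insensitively. A self-contained sketch of that matching rule, assuming a plain dict in place of CoreWeaveLabelFormatter.ACC_VALUE_MAPPINGS:

    # Maps provider-specific label values to canonical accelerator names.
    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


    def canonicalize(label_value: str) -> str:
        # Fall back to the raw label value when no mapping is known.
        return ACC_VALUE_MAPPINGS.get(label_value, label_value)


    def matches_request(label_value: str, requested: str) -> bool:
        # Accept either the raw label value or its canonical name.
        viable = {label_value.lower(), canonicalize(label_value).lower()}
        return requested.lower() in viable


    assert matches_request('H100_NVLINK_80GB', 'h100')
    assert matches_request('H100_NVLINK_80GB', 'H100_NVLINK_80GB')
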
sky/provision/provisioner.py CHANGED
@@ -526,6 +526,7 @@ def _post_provision_setup(
             status.update(
                 ux_utils.spinner_message(
                     'Checking controller version compatibility'))
+
             try:
                 server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
             except exceptions.ClusterNotUpError:
sky/provision/vast/instance.py CHANGED
@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,

 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None

sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py ADDED
@@ -0,0 +1,34 @@
+"""Add skylet_ssh_tunnel_metadata to clusters.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-09-09
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add skylet_ssh_tunnel_metadata column to clusters."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'skylet_ssh_tunnel_metadata',
+                                             sa.LargeBinary(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass