skypilot-nightly 1.0.0.dev20250910-py3-none-any.whl → 1.0.0.dev20250913-py3-none-any.whl

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py CHANGED
@@ -543,7 +543,7 @@ class StrategyExecutor:
 
             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.set_backoff_pending_async(self.job_id, self.task_id)
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
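
Note on the change above: without `await`, calling an async function only creates a coroutine object; the status update never runs and Python warns that the coroutine was never awaited. A minimal sketch (hypothetical names, not SkyPilot code):

    import asyncio

    async def set_backoff_pending(job_id: int) -> None:
        # Stand-in for the real DB update.
        print(f'job {job_id}: status set to PENDING')

    async def main() -> None:
        set_backoff_pending(1)        # Bug: body never runs; RuntimeWarning at GC time.
        await set_backoff_pending(2)  # Fixed: the update actually executes.

    asyncio.run(main())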
sky/jobs/state.py CHANGED
@@ -238,6 +238,7 @@ def _init_db_async(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 await asyncio.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 time.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
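
The `assert last_exc is not None` added to both retry wrappers documents, for readers and type checkers alike, that the loop only falls through after capturing an exception, so the trailing `raise` can never see `None`. A minimal sketch of the same retry-with-backoff decorator pattern (generic, not the SkyPilot implementation):

    import functools
    import time

    def retry_on_error(max_attempts: int = 3, base_delay: float = 0.1):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                last_exc = None
                for attempt in range(max_attempts):
                    try:
                        return func(*args, **kwargs)
                    except OSError as e:  # stand-in for transient DB errors
                        last_exc = e
                        time.sleep(base_delay * 2**attempt)
                assert last_exc is not None  # loop only exits here after a failure
                raise last_exc
            return wrapper
        return decorator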
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
         # Subquery to get the spot_job_ids that match the joined condition
         subquery = session.query(spot_table.c.job_id).join(
             job_info_table,
-            spot_table.c.spot_job_id == job_info_table.c.spot_job_id).filter(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.status == ManagedJobStatus.PENDING.value,
-                sqlalchemy.or_(
-                    job_info_table.c.schedule_state ==
-                    ManagedJobScheduleState.WAITING.value,
-                    job_info_table.c.schedule_state ==
-                    ManagedJobScheduleState.INACTIVE.value,
-                ),
-            ).subquery()
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+        ).filter(
+            spot_table.c.spot_job_id == job_id,
+            spot_table.c.status == ManagedJobStatus.PENDING.value,
+            # Note: it's possible that a WAITING job actually needs to be
+            # cleaned up, if we are in the middle of an upgrade/recovery and
+            # the job is waiting to be reclaimed by a new controller. But,
+            # in this case the status will not be PENDING.
+            sqlalchemy.or_(
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.INACTIVE.value,
+            ),
+        ).subquery()
 
         count = session.query(spot_table).filter(
             spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
     """Set the job id on the pool cluster for a job."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        await session.execute(job_info_table.c.spot_job_id == job_id).update(
-            {job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster})
+        await session.execute(
+            sqlalchemy.update(job_info_table).
+            where(job_info_table.c.spot_job_id == job_id).values({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            }))
         await session.commit()
 
 
@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
         job_id: int) -> Tuple[Optional[str], Optional[int]]:
     """Get the cluster name and job id on the pool from the managed job id."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
-    async with orm.Session(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        info = await session.execute(
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
             sqlalchemy.select(job_info_table.c.current_cluster_name,
                               job_info_table.c.job_id_on_pool_cluster).where(
-                                  job_info_table.c.spot_job_id == job_id)
-        ).fetchone()
+                                  job_info_table.c.spot_job_id == job_id))
+        info = result.fetchone()
     if info is None:
         return None, None
     return info[0], info[1]
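
Both fixes above follow the SQLAlchemy 2.x async pattern: build an explicit `update()` statement rather than calling `.update()` on the result of `execute()`, and call `fetchone()` on the returned `Result` object instead of awaiting it. A minimal runnable sketch (assumes `sqlalchemy>=2.0` and `aiosqlite` are installed; the table and column names are made up for illustration):

    import asyncio
    import sqlalchemy
    from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
    from sqlalchemy.pool import StaticPool

    metadata = sqlalchemy.MetaData()
    job_info = sqlalchemy.Table(
        'job_info', metadata,
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('job_id_on_pool_cluster', sqlalchemy.Integer))

    async def main() -> None:
        # In-memory SQLite; StaticPool keeps a single shared connection.
        engine = create_async_engine('sqlite+aiosqlite://', poolclass=StaticPool)
        async with engine.begin() as conn:
            await conn.run_sync(metadata.create_all)
            await conn.execute(sqlalchemy.insert(job_info).values(spot_job_id=1))
        async with AsyncSession(engine) as session:
            # UPDATE via an explicit statement, then commit.
            await session.execute(
                sqlalchemy.update(job_info).where(
                    job_info.c.spot_job_id == 1).values(job_id_on_pool_cluster=7))
            await session.commit()
            # SELECT: fetchone() is called on the Result, not awaited.
            result = await session.execute(
                sqlalchemy.select(job_info.c.job_id_on_pool_cluster).where(
                    job_info.c.spot_job_id == 1))
            print(result.fetchone())  # (7,)
        await engine.dispose()

    asyncio.run(main())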
sky/jobs/utils.py CHANGED
@@ -29,6 +29,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -50,12 +51,16 @@ from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    grpc = adaptors_common.LazyImport('grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -286,19 +291,34 @@ async def get_job_status(
             job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except exceptions.CommandError as e:
+        except (exceptions.CommandError, grpc.RpcError,
+                grpc.FutureTimeoutError) as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
-            if (e.detailed_reason is not None and
-                    _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
-                job_logger.info('Failed to connect to the cluster. Retrying '
-                                f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                job_logger.info('=' * 34)
+            is_transient_error = False
+            detailed_reason = None
+            if isinstance(e, exceptions.CommandError):
+                detailed_reason = e.detailed_reason
+                if (detailed_reason is not None and
+                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
+                    is_transient_error = True
+            elif isinstance(e, grpc.RpcError):
+                detailed_reason = e.details()
+                if e.code() in [
+                        grpc.StatusCode.UNAVAILABLE,
+                        grpc.StatusCode.DEADLINE_EXCEEDED
+                ]:
+                    is_transient_error = True
+            elif isinstance(e, grpc.FutureTimeoutError):
+                detailed_reason = 'Timeout'
+            if is_transient_error:
+                logger.info('Failed to connect to the cluster. Retrying '
+                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                logger.info('=' * 34)
                 await asyncio.sleep(1)
             else:
-                job_logger.info(
-                    f'Failed to get job status: {e.detailed_reason}')
-                job_logger.info('=' * 34)
+                logger.info(f'Failed to get job status: {detailed_reason}')
+                logger.info('=' * 34)
                 return None
     return None
 
@@ -547,9 +567,32 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
                       job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
@@ -573,8 +616,13 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
                                     cluster_name,
                                     job_id=job_id,
                                     get_end_time=True)
-    except exceptions.CommandError as e:
-        if e.returncode == 255:
+    except (exceptions.CommandError, grpc.RpcError,
+            grpc.FutureTimeoutError) as e:
+        if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
+                (isinstance(e, grpc.RpcError) and e.code() in [
+                    grpc.StatusCode.UNAVAILABLE,
+                    grpc.StatusCode.DEADLINE_EXCEEDED,
+                ]) or isinstance(e, grpc.FutureTimeoutError):
             # Failed to connect - probably the instance was preempted since the
             # job completed. We shouldn't crash here, so just log and use the
             # current time.
@@ -586,7 +634,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise
 
 
-def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""
 
     def callback_func(status: str):
@@ -625,17 +675,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')
 
-    try:
-        asyncio.get_running_loop()
-
-        # In async context
-        async def async_callback_func(status: str):
-            return await context_utils.to_thread(callback_func, status)
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
 
-        return async_callback_func
-    except RuntimeError:
-        # Not in async context
-        return callback_func
+    return async_callback_func
 
 
 # ======== user functions ========
sky/logs/agent.py CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                     cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/__init__.py CHANGED
@@ -26,6 +26,7 @@ from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
 from sky.provision import scp
+from sky.provision import seeweb
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
sky/provision/kubernetes/config.py CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
 class KubernetesError(Exception):
-    pass
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)
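
The exception now carries structured data alongside its message (note the keyword is spelled `insufficent_resources` in the code above). A minimal sketch of the same pattern and how a caller might use it (standalone example, not imported from SkyPilot):

    from typing import List, Optional

    class InsufficientResourcesError(Exception):
        """Example exception that records which resources were lacking."""

        def __init__(self, *args,
                     insufficient_resources: Optional[List[str]] = None):
            self.insufficient_resources = insufficient_resources
            super().__init__(*args)

    try:
        raise InsufficientResourcesError(
            'Pod failed to schedule.', insufficient_resources=['CPUs', 'GPUs'])
    except InsufficientResourcesError as e:
        print(e, '- lacking:', ', '.join(e.insufficient_resources or []))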
sky/provision/kubernetes/instance.py CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     break
         if event_message is not None:
             if pod_status == 'Pending':
-                logger.info(event_message)
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('CPU', pod, details=event_message))
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('memory', pod,
-                                           details=event_message))
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
-                if pod.spec.node_selector:
-                    for label_key in pod.spec.node_selector.keys():
-                        if label_key in gpu_lf_keys:
-                            # TODO(romilb): We may have additional node
-                            # affinity selectors in the future - in that
-                            # case we will need to update this logic.
-                            # TODO(Doyoung): Update the error message raised
-                            # with the multi-host TPU support.
-                            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
-                            if 'Insufficient google.com/tpu' in event_message:
-                                extra_msg = (
-                                    f'Verify if '
-                                    f'{pod.spec.node_selector[label_key]}'
-                                    ' is available in the cluster. Note '
-                                    'that multi-host TPU podslices are '
-                                    'currently not unsupported.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('TPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
-                            elif ((f'Insufficient {gpu_resource_key}'
-                                   in event_message) or
-                                  ('didn\'t match Pod\'s node affinity/selector'
-                                   in event_message)):
-                                extra_msg = (
-                                    f'Verify if any node matching label '
-                                    f'{pod.spec.node_selector[label_key]} and '
-                                    f'sufficient resource {gpu_resource_key} '
-                                    f'is available in the cluster.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('GPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                            ('didn\'t match Pod\'s node affinity/selector'
+                             in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')
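
The block above temporarily attaches a prefix-free handler and disables propagation so the aggregated error prints once without logger prefixes, then restores the logger. A minimal sketch of the technique using only the standard library (`EnvAwareHandler` and `NO_PREFIX_FORMATTER` are SkyPilot-specific helpers; the stdlib equivalents are used here):

    import logging
    import sys

    logger = logging.getLogger('demo')
    logging.basicConfig(format='%(asctime)s %(levelname)s %(name)s: %(message)s')

    def log_without_prefix(message: str) -> None:
        tmp_handler = logging.StreamHandler(sys.stdout)
        tmp_handler.setFormatter(logging.Formatter('%(message)s'))
        tmp_handler.setLevel(logging.ERROR)
        prev_propagate = logger.propagate
        try:
            logger.addHandler(tmp_handler)
            logger.propagate = False  # skip the normal (prefixed) handlers
            logger.error(message)
        finally:
            logger.removeHandler(tmp_handler)
            logger.propagate = prev_propagate

    log_without_prefix('Insufficient resource capacity on the cluster.')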
sky/provision/kubernetes/utils.py CHANGED
@@ -3550,9 +3550,20 @@ def process_skypilot_pods(
                     f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-            if pod.spec.node_selector is not None:
-                gpu_name = label_formatter.get_accelerator_from_label_value(
-                    pod.spec.node_selector.get(gpu_label))
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
             resources = resources_lib.Resources(
                 cloud=clouds.Kubernetes(),
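
The lookup now reads the GPU label from the pod's required node affinity rather than from `spec.node_selector`. A minimal sketch of the traversal using plain dicts (the real code walks the Kubernetes client's `V1NodeAffinity` objects; the label key below is just one example of a GPU node label):

    GPU_LABEL = 'cloud.google.com/gke-accelerator'

    node_selector_terms = [{
        'matchExpressions': [{
            'key': GPU_LABEL,
            'operator': 'In',
            'values': ['nvidia-tesla-a100'],
        }]
    }]

    gpu_name = None
    expressions = []
    for term in node_selector_terms:
        expressions.extend(term.get('matchExpressions') or [])
    for expression in expressions:
        if expression['key'] == GPU_LABEL and expression['operator'] == 'In':
            gpu_name = expression['values'][0]
            break
    print(gpu_name)  # nvidia-tesla-a100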
sky/provision/seeweb/__init__.py ADDED
@@ -0,0 +1,11 @@
+"""Seeweb provisioner for SkyPilot."""
+
+from sky.provision.seeweb.config import bootstrap_instances
+from sky.provision.seeweb.instance import cleanup_ports
+from sky.provision.seeweb.instance import get_cluster_info
+from sky.provision.seeweb.instance import open_ports
+from sky.provision.seeweb.instance import query_instances
+from sky.provision.seeweb.instance import run_instances
+from sky.provision.seeweb.instance import stop_instances
+from sky.provision.seeweb.instance import terminate_instances
+from sky.provision.seeweb.instance import wait_instances
sky/provision/seeweb/config.py ADDED
@@ -0,0 +1,13 @@
+"""Configuration for Seeweb provisioning."""
+
+from typing import Any, Dict
+
+
+def bootstrap_instances(*args, **_kwargs) -> Dict[str, Any]:
+    """Bootstrap instances for Seeweb.
+
+    Seeweb doesn't require any special configuration bootstrapping,
+    so we just return the config as-is.
+    """
+    config = args[2]
+    return config