skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -543,7 +543,7 @@ class StrategyExecutor:
 
                 except exceptions.NoClusterLaunchedError:
                     # Update the status to PENDING during backoff.
-                    state.set_backoff_pending_async(self.job_id, self.task_id)
+                    await state.set_backoff_pending_async(self.job_id, self.task_id)
                     # Calculate the backoff time and sleep.
                     gap_seconds = (backoff.current_backoff()
                                    if self.pool is None else 1)

sky/jobs/state.py
CHANGED
@@ -238,6 +238,7 @@ def _init_db_async(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 await asyncio.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 time.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
         # Subquery to get the spot_job_ids that match the joined condition
         subquery = session.query(spot_table.c.job_id).join(
             job_info_table,
-            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+        ).filter(
+            spot_table.c.spot_job_id == job_id,
+            spot_table.c.status == ManagedJobStatus.PENDING.value,
+            # Note: it's possible that a WAITING job actually needs to be
+            # cleaned up, if we are in the middle of an upgrade/recovery and
+            # the job is waiting to be reclaimed by a new controller. But,
+            # in this case the status will not be PENDING.
+            sqlalchemy.or_(
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.INACTIVE.value,
+            ),
+        ).subquery()
 
         count = session.query(spot_table).filter(
             spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
     """Set the job id on the pool cluster for a job."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        await session.execute(
+        await session.execute(
+            sqlalchemy.update(job_info_table).
+            where(job_info_table.c.spot_job_id == job_id).values({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            }))
         await session.commit()
 
 
@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
         job_id: int) -> Tuple[Optional[str], Optional[int]]:
     """Get the cluster name and job id on the pool from the managed job id."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
-    async with
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
             sqlalchemy.select(job_info_table.c.current_cluster_name,
                               job_info_table.c.job_id_on_pool_cluster).where(
-                                  job_info_table.c.spot_job_id == job_id)
+                                  job_info_table.c.spot_job_id == job_id))
+        info = result.fetchone()
     if info is None:
         return None, None
     return info[0], info[1]

sky/jobs/utils.py
CHANGED
@@ -29,6 +29,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -50,12 +51,16 @@ from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    grpc = adaptors_common.LazyImport('grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -286,19 +291,34 @@ async def get_job_status(
             job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except exceptions.CommandError
+        except (exceptions.CommandError, grpc.RpcError,
+                grpc.FutureTimeoutError) as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
+            is_transient_error = False
+            detailed_reason = None
+            if isinstance(e, exceptions.CommandError):
+                detailed_reason = e.detailed_reason
+                if (detailed_reason is not None and
+                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
+                    is_transient_error = True
+            elif isinstance(e, grpc.RpcError):
+                detailed_reason = e.details()
+                if e.code() in [
+                        grpc.StatusCode.UNAVAILABLE,
+                        grpc.StatusCode.DEADLINE_EXCEEDED
+                ]:
+                    is_transient_error = True
+            elif isinstance(e, grpc.FutureTimeoutError):
+                detailed_reason = 'Timeout'
+            if is_transient_error:
+                logger.info('Failed to connect to the cluster. Retrying '
+                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                logger.info('=' * 34)
                 await asyncio.sleep(1)
             else:
-                job_logger.info('=' * 34)
+                logger.info(f'Failed to get job status: {detailed_reason}')
+                logger.info('=' * 34)
                 return None
     return None
 
@@ -547,9 +567,32 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
                       job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
@@ -573,8 +616,13 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
                                     cluster_name,
                                     job_id=job_id,
                                     get_end_time=True)
-    except exceptions.CommandError
+    except (exceptions.CommandError, grpc.RpcError,
+            grpc.FutureTimeoutError) as e:
+        if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
+            (isinstance(e, grpc.RpcError) and e.code() in [
+                grpc.StatusCode.UNAVAILABLE,
+                grpc.StatusCode.DEADLINE_EXCEEDED,
+            ]) or isinstance(e, grpc.FutureTimeoutError):
             # Failed to connect - probably the instance was preempted since the
             # job completed. We shouldn't crash here, so just log and use the
             # current time.
@@ -586,7 +634,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise
 
 
-def event_callback_func(
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""
 
     def callback_func(status: str):
@@ -625,17 +675,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')
 
-        # In async context
-        async def async_callback_func(status: str):
-            return await context_utils.to_thread(callback_func, status)
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
 
-    except RuntimeError:
-        # Not in async context
-        return callback_func
+    return async_callback_func
 
 
 # ======== user functions ========

sky/logs/agent.py
CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                      cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')

sky/provision/__init__.py
CHANGED
@@ -26,6 +26,7 @@ from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
 from sky.provision import scp
+from sky.provision import seeweb
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere

sky/provision/kubernetes/config.py
CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
 class KubernetesError(Exception):
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)

sky/provision/kubernetes/instance.py
CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                            ('didn\'t match Pod\'s node affinity/selector'
+                             in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')

sky/provision/kubernetes/utils.py
CHANGED
@@ -3550,9 +3550,20 @@ def process_skypilot_pods(
                          f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),

sky/provision/seeweb/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""Seeweb provisioner for SkyPilot."""
+
+from sky.provision.seeweb.config import bootstrap_instances
+from sky.provision.seeweb.instance import cleanup_ports
+from sky.provision.seeweb.instance import get_cluster_info
+from sky.provision.seeweb.instance import open_ports
+from sky.provision.seeweb.instance import query_instances
+from sky.provision.seeweb.instance import run_instances
+from sky.provision.seeweb.instance import stop_instances
+from sky.provision.seeweb.instance import terminate_instances
+from sky.provision.seeweb.instance import wait_instances

sky/provision/seeweb/config.py
ADDED
@@ -0,0 +1,13 @@
+"""Configuration for Seeweb provisioning."""
+
+from typing import Any, Dict
+
+
+def bootstrap_instances(*args, **_kwargs) -> Dict[str, Any]:
+    """Bootstrap instances for Seeweb.
+
+    Seeweb doesn't require any special configuration bootstrapping,
+    so we just return the config as-is.
+    """
+    config = args[2]
+    return config