skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +19 -5
- sky/check.py +398 -171
- sky/cli.py +302 -98
- sky/client/cli.py +302 -98
- sky/client/sdk.py +104 -12
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +23 -5
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +58 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +42 -19
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +29 -7
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/fluidstack/instance.py +1 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/server.py +108 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +83 -9
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +218 -1
- sky/utils/schemas.py +75 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -23,6 +23,7 @@ from sky import backends
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
@@ -463,7 +464,8 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
 
 
 def cancel_jobs_by_id(job_ids: Optional[List[int]],
-                      all_users: bool = False) -> str:
+                      all_users: bool = False,
+                      current_workspace: Optional[str] = None) -> str:
     """Cancel jobs by id.
 
     If job_ids is None, cancel all jobs.
@@ -474,9 +476,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
     job_ids = list(set(job_ids))
     if not job_ids:
         return 'No job to cancel.'
-
-
+    if current_workspace is None:
+        current_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
+
     cancelled_job_ids: List[int] = []
+    wrong_workspace_job_ids: List[int] = []
     for job_id in job_ids:
         # Check the status of the managed job status. If it is in
         # terminal state, we can safely skip it.
@@ -491,6 +495,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
 
         update_managed_jobs_statuses(job_id)
 
+        job_workspace = managed_job_state.get_workspace(job_id)
+        if current_workspace is not None and job_workspace != current_workspace:
+            wrong_workspace_job_ids.append(job_id)
+            continue
+
         # Send the signal to the jobs controller.
         signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
         # Filelock is needed to prevent race condition between signal
@@ -501,17 +510,30 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             f.flush()
         cancelled_job_ids.append(job_id)
 
+    wrong_workspace_job_str = ''
+    if wrong_workspace_job_ids:
+        plural = 's' if len(wrong_workspace_job_ids) > 1 else ''
+        plural_verb = 'are' if len(wrong_workspace_job_ids) > 1 else 'is'
+        wrong_workspace_job_str = (
+            f' Job{plural} with ID{plural}'
+            f' {", ".join(map(str, wrong_workspace_job_ids))} '
+            f'{plural_verb} skipped as they are not in the active workspace '
+            f'{current_workspace!r}. Check the workspace of the job with: '
+            f'sky jobs queue')
+
     if not cancelled_job_ids:
-        return 'No job to cancel.'
+        return f'No job to cancel.{wrong_workspace_job_str}'
     identity_str = f'Job with ID {cancelled_job_ids[0]} is'
     if len(cancelled_job_ids) > 1:
         cancelled_job_ids_str = ', '.join(map(str, cancelled_job_ids))
         identity_str = f'Jobs with IDs {cancelled_job_ids_str} are'
 
-    return f'{identity_str} scheduled to be cancelled.'
+    msg = f'{identity_str} scheduled to be cancelled.{wrong_workspace_job_str}'
+    return msg
 
 
-def cancel_job_by_name(job_name: str) -> str:
+def cancel_job_by_name(job_name: str,
+                       current_workspace: Optional[str] = None) -> str:
     """Cancel a job by name."""
     job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
     if not job_ids:
@@ -520,8 +542,8 @@ def cancel_job_by_name(job_name: str) -> str:
         return (f'{colorama.Fore.RED}Multiple running jobs found '
                 f'with name {job_name!r}.\n'
                 f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
-    cancel_jobs_by_id(job_ids)
-    return f'Job {job_name!r} is scheduled to be cancelled.'
+    msg = cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+    return f'{job_name!r} {msg}'
 
 
 def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
@@ -1020,10 +1042,15 @@ def format_job_table(
             jobs[get_hash(task)].append(task)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
+    workspaces = set()
    for job_tasks in jobs.values():
        managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
        if not managed_job_status.is_terminal():
            status_counts[managed_job_status.value] += 1
+        workspaces.add(job_tasks[0].get('workspace',
+                                        constants.SKYPILOT_DEFAULT_WORKSPACE))
+
+    show_workspace = len(workspaces) > 1 or show_all
 
    user_cols: List[str] = []
    if show_user:
@@ -1034,6 +1061,7 @@ def format_job_table(
    columns = [
        'ID',
        'TASK',
+        *(['WORKSPACE'] if show_workspace else []),
        'NAME',
        *user_cols,
        'REQUESTED',
@@ -1093,6 +1121,8 @@ def format_job_table(
    for job_hash, job_tasks in jobs.items():
        if show_all:
            schedule_state = job_tasks[0]['schedule_state']
+        workspace = job_tasks[0].get('workspace',
+                                     constants.SKYPILOT_DEFAULT_WORKSPACE)
 
        if len(job_tasks) > 1:
            # Aggregate the tasks into a new row in the table.
@@ -1134,6 +1164,7 @@ def format_job_table(
            job_values = [
                job_id,
                '',
+                *([''] if show_workspace else []),
                job_name,
                *user_values,
                '-',
@@ -1163,9 +1194,11 @@ def format_job_table(
                0, task['job_duration'], absolute=True)
            submitted = log_utils.readable_time_duration(task['submitted_at'])
            user_values = get_user_column_values(task)
+            task_workspace = '-' if len(job_tasks) > 1 else workspace
            values = [
                task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                task['task_id'] if len(job_tasks) > 1 else '-',
+                *([task_workspace] if show_workspace else []),
                task['task_name'],
                *user_values,
                task['resources'],
@@ -1263,22 +1296,36 @@ class ManagedJobCodeGen:
    def cancel_jobs_by_id(cls,
                          job_ids: Optional[List[int]],
                          all_users: bool = False) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
        code = textwrap.dedent(f"""\
        if managed_job_version < 2:
            # For backward compatibility, since all_users is not supported
-            # before #4787.
+            # before #4787.
            # TODO(cooperc): Remove compatibility before 0.12.0
            msg = utils.cancel_jobs_by_id({job_ids})
-        else:
+        elif managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
+        else:
+            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users},
+                                          current_workspace={active_workspace!r})
        print(msg, end="", flush=True)
        """)
        return cls._build(code)

    @classmethod
    def cancel_job_by_name(cls, job_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
        code = textwrap.dedent(f"""\
-        msg = utils.cancel_job_by_name({job_name!r})
+        if managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
+            msg = utils.cancel_job_by_name({job_name!r})
+        else:
+            msg = utils.cancel_job_by_name({job_name!r}, {active_workspace!r})
        print(msg, end="", flush=True)
        """)
        return cls._build(code)
@@ -1314,11 +1361,16 @@ class ManagedJobCodeGen:
        return cls._build(code)

    @classmethod
-    def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag') -> str:
+    def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
+                    workspace) -> str:
        dag_name = managed_job_dag.name
        # Add the managed job to queue table.
        code = textwrap.dedent(f"""\
-            managed_job_state.set_job_info({job_id}, {dag_name!r})
+            set_job_info_kwargs = {{'workspace': {workspace!r}}}
+            if managed_job_version < 4:
+                set_job_info_kwargs = {{}}
+            managed_job_state.set_job_info(
+                {job_id}, {dag_name!r}, **set_job_info_kwargs)
            """)
        for task_id, task in enumerate(managed_job_dag.tasks):
            resources_str = backend_utils.get_task_resources_str(
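
The change above threads a `current_workspace` argument through job cancellation so that jobs belonging to another workspace are reported as skipped instead of being cancelled. The snippet below is a minimal, self-contained sketch of that partition-and-report pattern; the `_JOB_WORKSPACES` table and the simplified message format are illustrative stand-ins, not SkyPilot's actual state layer.

    from typing import Dict, List, Optional

    # Hypothetical stand-in for the managed-job state lookup used in the diff.
    _JOB_WORKSPACES: Dict[int, str] = {1: 'default', 2: 'team-a', 3: 'default'}


    def cancel_jobs_sketch(job_ids: List[int],
                           current_workspace: Optional[str] = 'default') -> str:
        """Cancel only the jobs that belong to the active workspace."""
        cancelled: List[int] = []
        wrong_workspace: List[int] = []
        for job_id in job_ids:
            if _JOB_WORKSPACES.get(job_id) != current_workspace:
                wrong_workspace.append(job_id)  # skipped and reported, not cancelled
                continue
            cancelled.append(job_id)  # the real code signals the jobs controller here

        note = ''
        if wrong_workspace:
            note = (f' Jobs {wrong_workspace} were skipped: not in workspace '
                    f'{current_workspace!r}.')
        if not cancelled:
            return f'No job to cancel.{note}'
        return f'Jobs {cancelled} are scheduled to be cancelled.{note}'


    print(cancel_jobs_sketch([1, 2, 3]))  # job 2 is skipped, jobs 1 and 3 are cancelled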
sky/optimizer.py
CHANGED
@@ -14,6 +14,7 @@ from sky import clouds
 from sky import exceptions
 from sky import resources as resources_lib
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
@@ -21,6 +22,7 @@ from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
 from sky.utils import log_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
@@ -376,6 +378,10 @@
            if any(orig_resources.cloud is None
                   for orig_resources in node.resources):
                source_hint = 'catalog and kubernetes cluster'
+            elif all(
+                    isinstance(orig_resources.cloud, clouds.SSH)
+                    for orig_resources in node.resources):
+                source_hint = 'node pool'
            elif all(
                    isinstance(orig_resources.cloud, clouds.Kubernetes)
                    for orig_resources in node.resources):
@@ -858,11 +864,19 @@
                'accelerators': f'{resources.accelerators}',
                'use_spot': resources.use_spot
            }
+
+            # Handle special case for Kubernetes and SSH clouds
            if isinstance(resources.cloud, clouds.Kubernetes):
-                # Region for Kubernetes is the context name, i.e. different
-                # Kubernetes clusters. We add region to the key to show all
-                # the Kubernetes clusters in the optimizer table for better UX.
+                # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
+                # context name, i.e. different Kubernetes clusters. We add
+                # region to the key to show all the Kubernetes clusters in the
+                # optimizer table for better UX.
+
+                if resources.cloud.__class__.__name__ == 'SSH':
+                    resource_key_dict[
+                        'cloud'] = 'SSH'  # Force the cloud name to be SSH
                resource_key_dict['region'] = resources.region
+
            return json.dumps(resource_key_dict, sort_keys=True)

        # Print the list of resouces that the optimizer considered.
@@ -1204,9 +1218,11 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
        clouds_to_check_again = list(clouds_need_recheck -
                                     global_disabled_clouds)
        if len(clouds_to_check_again) > 0:
-            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
-                                       quiet=True,
-                                       clouds=clouds_to_check_again)
+            sky_check.check_capability(
+                sky_cloud.CloudCapability.COMPUTE,
+                quiet=True,
+                clouds=clouds_to_check_again,
+                workspace=skypilot_config.get_active_workspace())
        enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
            capability=sky_cloud.CloudCapability.COMPUTE,
            raise_if_no_cloud_access=True)
@@ -1216,7 +1232,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
    if disabled_clouds:
        is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
        task_name = f' {task.name!r}' if task.name is not None else ''
-        msg = (f'Task{task_name} requires {", ".join(disabled_clouds)} '
+        disabled_display_names = []
+        for c in disabled_clouds:
+            cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
+            if cloud_obj_one is not None:
+                disabled_display_names.append(cloud_obj_one.display_name())
+        cloud_names = ', '.join(disabled_display_names)
+        msg = (f'Task{task_name} requires {cloud_names} '
              f'which {is_or_are} not enabled. To enable access, change '
              f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
              f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
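
As the updated comment in the hunk above says, for Kubernetes-like clouds the region field carries the context name, so including it in the optimizer's grouping key makes every Kubernetes context and SSH node pool appear as its own row. A rough sketch of that keying idea over plain dictionaries follows; the candidate entries are made up, whereas SkyPilot builds them from Resources objects.

    import collections
    import json

    candidates = [
        {'cloud': 'Kubernetes', 'region': 'gke_my-cluster', 'accelerators': 'A100:1'},
        {'cloud': 'Kubernetes', 'region': 'ssh-my-pool', 'accelerators': 'A100:1'},
    ]

    groups = collections.defaultdict(list)
    for resource in candidates:
        key_dict = dict(resource)
        if key_dict['region'].startswith('ssh-'):
            key_dict['cloud'] = 'SSH'  # surface SSH node pools under their own cloud name
        # Keying on the serialized dict (region/context included) keeps each
        # context or node pool as a separate row instead of collapsing them.
        groups[json.dumps(key_dict, sort_keys=True)].append(resource)

    for key in groups:
        print(key)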
sky/provision/__init__.py
CHANGED
@@ -23,6 +23,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
sky/provision/aws/instance.py
CHANGED
@@ -836,7 +836,23 @@ def open_ports(
 
    # For the case when every new ports is already opened.
    if ip_permissions:
-        sg.authorize_ingress(IpPermissions=ip_permissions)
+        # Filter out any permissions that already exist in the security group
+        existing_permissions = set()
+        for rule in sg.ip_permissions:
+            if rule['IpProtocol'] == 'tcp':
+                for ip_range in rule.get('IpRanges', []):
+                    if ip_range.get('CidrIp') == '0.0.0.0/0':
+                        existing_permissions.add(
+                            (rule['FromPort'], rule['ToPort']))
+
+        # Remove any permissions that already exist
+        filtered_permissions = []
+        for perm in ip_permissions:
+            if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
+                filtered_permissions.append(perm)
+
+        if filtered_permissions:
+            sg.authorize_ingress(IpPermissions=filtered_permissions)
 
 
 def cleanup_ports(
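
The open_ports change above only authorizes ingress rules that are not already present on the security group, which avoids duplicate-permission failures from AWS when some of the requested ports are already open. The same filtering logic over plain dictionaries, with no boto3 objects involved (the rule contents here are illustrative):

    existing_rules = [
        {'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
         'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
    ]
    requested = [
        {'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
         'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
        {'IpProtocol': 'tcp', 'FromPort': 8080, 'ToPort': 8080,
         'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
    ]

    # Collect the (from, to) port ranges already open to the world over TCP.
    existing = {(r['FromPort'], r['ToPort'])
                for r in existing_rules
                if r['IpProtocol'] == 'tcp' and any(
                    ip.get('CidrIp') == '0.0.0.0/0' for ip in r.get('IpRanges', []))}

    # Keep only the rules that are not already present.
    to_authorize = [r for r in requested
                    if (r['FromPort'], r['ToPort']) not in existing]
    print(to_authorize)  # only the 8080 rule remains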
sky/provision/fluidstack/instance.py
CHANGED
@@ -26,6 +26,7 @@ logger = sky_logging.init_logger(__name__)
 
 def get_internal_ip(node_info: Dict[str, Any]) -> None:
     node_info['internal_ip'] = node_info['ip_address']
+
     private_key_path, _ = auth.get_or_generate_keys()
     runner = command_runner.SSHCommandRunner(
         (node_info['ip_address'], 22),
sky/provision/kubernetes/instance.py
CHANGED
@@ -1265,6 +1265,8 @@ def query_instances(
     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
 
     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
@@ -1274,15 +1276,24 @@ def query_instances(
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
-            ctx = kubernetes_utils.get_current_kube_config_context_name()
+            if is_ssh:
+                node_pool = context.lstrip('ssh-') if context else ''
+                msg = (
+                    f'Cannot connect to SSH Node Pool {node_pool}. '
+                    'Please check if the SSH Node Pool is up and accessible. '
+                    'To debug, run `sky check ssh` to check the status of '
+                    'the SSH Node Pool.')
+            else:
+                ctx = kubernetes_utils.get_current_kube_config_context_name()
+                msg = (f'Network error - check if the {identity} in '
+                       f'context {ctx} is up and accessible.')
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query cluster {cluster_name_on_cloud!r} status. '
-                'Network error - check if the Kubernetes cluster in '
-                f'context {ctx} is up and accessible.') from None
+                f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+                msg) from None
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query Kubernetes cluster {cluster_name_on_cloud!r} '
+                f'Failed to query {identity} {cluster_name_on_cloud!r} '
                 f'status: {common_utils.format_exception(e)}')
 
     # Check if the pods are running or pending
sky/provision/kubernetes/utils.py
CHANGED
@@ -1137,6 +1137,11 @@ def get_accelerator_label_key_values(
     # support pollingthe clusters for autoscaling information, such as the
     # node pools configured etc.
 
+    is_ssh_node_pool = context.startswith('ssh-') if context else False
+    cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
+    context_display_name = context.lstrip('ssh-') if (
+        context and is_ssh_node_pool) else context
+
     autoscaler_type = get_autoscaler_type()
     if autoscaler_type is not None:
         # If autoscaler is set in config.yaml, override the label key and value
@@ -1176,13 +1181,17 @@ def get_accelerator_label_key_values(
             suffix = ''
             if env_options.Options.SHOW_DEBUG_INFO.get():
                 suffix = f' Found node labels: {node_labels}'
-            raise exceptions.ResourcesUnavailableError(
-                'Could not detect GPU labels in Kubernetes cluster. '
-                'If this cluster has GPUs, please ensure GPU nodes have '
-                'node labels of either of these formats: '
-                f'{supported_formats}. Please refer to '
-                'the documentation on how to set up node labels.'
-                f'{suffix}')
+            msg = (f'Could not detect GPU labels in {cloud_name}.')
+            if not is_ssh_node_pool:
+                msg += (' Run `sky check ssh` to debug.')
+            else:
+                msg += (
+                    ' If this cluster has GPUs, please ensure GPU nodes have '
+                    'node labels of either of these formats: '
+                    f'{supported_formats}. Please refer to '
+                    'the documentation on how to set up node labels.')
+            msg += f'{suffix}'
+            raise exceptions.ResourcesUnavailableError(msg)
         else:
             # Validate the label value on all nodes labels to ensure they are
             # correctly setup and will behave as expected.
@@ -1193,7 +1202,7 @@ def get_accelerator_label_key_values(
                         value)
                     if not is_valid:
                         raise exceptions.ResourcesUnavailableError(
-                            f'Node {node_name!r} in Kubernetes cluster has '
+                            f'Node {node_name!r} in {cloud_name} has '
                             f'invalid GPU label: {label}={value}. {reason}')
                 if check_mode:
                     # If check mode is enabled and we reached so far, we can
@@ -1257,10 +1266,10 @@ def get_accelerator_label_key_values(
                     # TODO(Doyoung): Update the error message raised with the
                     # multi-host TPU support.
                     raise exceptions.ResourcesUnavailableError(
-                        'Could not find any node in the Kubernetes cluster '
+                        f'Could not find any node in the {cloud_name} '
                         f'with {acc_type}. Please ensure at least one node in the '
                         f'cluster has {acc_type} and node labels are setup '
-                        'correctly. Please refer to the documentation for more. '
+                        'correctly. Please refer to the documentation for more. '
                         f'{suffix}. Note that multi-host TPU podslices are '
                         'currently not unsupported.')
                 else:
@@ -1270,15 +1279,24 @@ def get_accelerator_label_key_values(
                 if env_options.Options.SHOW_DEBUG_INFO.get():
                     suffix = (' Available resources on the cluster: '
                               f'{cluster_resources}')
-                raise exceptions.ResourcesUnavailableError(
-                    f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
-                    f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
-                    ' contains GPUs, please ensure GPU drivers are installed on '
-                    'the node. Check if the GPUs are setup correctly by running '
-                    '`kubectl describe nodes` and looking for the '
-                    f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
-                    'Please refer to the documentation on how to set up GPUs.'
-                    f'{suffix}')
+                if is_ssh_node_pool:
+                    msg = (
+                        f'Could not detect GPUs in SSH Node Pool '
+                        f'\'{context_display_name}\'. If this cluster contains '
+                        'GPUs, please ensure GPU drivers are installed on the node '
+                        'and re-run '
+                        f'`sky ssh up --infra {context_display_name}`. {suffix}')
+                else:
+                    msg = (
+                        f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                        f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                        ' contains GPUs, please ensure GPU drivers are installed on '
+                        'the node. Check if the GPUs are setup correctly by running '
+                        '`kubectl describe nodes` and looking for the '
+                        f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                        'Please refer to the documentation on how to set up GPUs.'
+                        f'{suffix}')
+                raise exceptions.ResourcesUnavailableError(msg)
                 assert False, 'This should not be reached'
sky/provision/nebius/instance.py
CHANGED
@@ -134,7 +134,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             disk_size=config.node_config['DiskSize'],
             user_data=config.node_config['UserData'],
             associate_public_ip_address=(
-                not config.provider_config['use_internal_ips'])
+                not config.provider_config['use_internal_ips']),
+            filesystems=config.node_config.get('filesystems', []),
+        )
     except Exception as e:  # pylint: disable=broad-except
         logger.warning(f'run_instances error: {e}')
         raise
sky/provision/nebius/utils.py
CHANGED
@@ -1,6 +1,6 @@
 """Nebius library wrapper for SkyPilot."""
 import time
-from typing import Any, Dict
+from typing import Any, Dict, List
 import uuid
 
 from sky import sky_logging
@@ -158,7 +158,8 @@ def start(instance_id: str) -> None:
 
 def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
            preset: str, region: str, image_family: str, disk_size: int,
-           user_data: str, associate_public_ip_address: bool) -> str:
+           user_data: str, associate_public_ip_address: bool,
+           filesystems: List[Dict[str, Any]]) -> str:
     # Each node must have a unique name to avoid conflicts between
     # multiple worker VMs. To ensure uniqueness,a UUID is appended
     # to the node name.
@@ -217,6 +218,16 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
                 f' seconds) while waiting for disk {disk_name}'
                 f' to be ready.')
 
+    filesystems_spec = []
+    if filesystems:
+        for fs in filesystems:
+            filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
+                mount_tag=fs['filesystem_mount_tag'],
+                attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
+                    fs['filesystem_attach_mode']],
+                existing_filesystem=nebius.compute().ExistingFilesystem(
+                    id=fs['filesystem_id'])))
+
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
     sub_net = service.list(nebius.vpc().ListSubnetsRequest(
         parent_id=project_id,)).wait()
@@ -237,6 +248,7 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
             cloud_init_user_data=user_data,
             resources=nebius.compute().ResourcesSpec(platform=platform,
                                                      preset=preset),
+            filesystems=filesystems_spec if filesystems_spec else None,
             network_interfaces=[
                 nebius.compute().NetworkInterfaceSpec(
                     subnet_id=sub_net.items[0].metadata.id,
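
The Nebius changes pass an optional `filesystems` list from the node config through to the instance launch call. Judging only from the keys read in the hunk above, each entry is a dict of the following shape; the concrete values are placeholders, not real resource IDs, and the attach-mode name is an assumption for illustration:

    # Illustrative node_config snippet; the keys match those read by launch().
    node_config = {
        'filesystems': [
            {
                'filesystem_id': 'computefilesystem-xxxx',   # placeholder ID
                'filesystem_mount_tag': 'data',               # mount tag exposed to the VM
                'filesystem_attach_mode': 'READ_WRITE',       # assumed attach-mode name
            },
        ],
    }

    for fs in node_config.get('filesystems', []):
        print(fs['filesystem_mount_tag'], fs['filesystem_attach_mode'], fs['filesystem_id'])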
sky/provision/ssh/__init__.py
ADDED
@@ -0,0 +1,18 @@
+"""SSH provisioner for SkyPilot.
+
+This module implements the provisioner interface for SSH targets.
+It reuses most of the functionality from the Kubernetes provisioner,
+since the SSH implementation is based on Kubernetes under the hood.
+"""
+
+from sky.provision.kubernetes.config import bootstrap_instances
+from sky.provision.kubernetes.instance import get_cluster_info
+from sky.provision.kubernetes.instance import get_command_runners
+from sky.provision.kubernetes.instance import query_instances
+from sky.provision.kubernetes.instance import run_instances
+from sky.provision.kubernetes.instance import stop_instances
+from sky.provision.kubernetes.instance import terminate_instances
+from sky.provision.kubernetes.instance import wait_instances
+from sky.provision.kubernetes.network import cleanup_ports
+from sky.provision.kubernetes.network import open_ports
+from sky.provision.kubernetes.network import query_ports
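
Because the new `sky/provision/ssh/__init__.py` only re-exports the Kubernetes provisioner's entry points, a caller that resolves the provisioner module by cloud name gets the Kubernetes implementations when it looks up `ssh`. A quick way to observe the aliasing (assuming the package is installed in the current environment):

    import importlib

    ssh = importlib.import_module('sky.provision.ssh')
    k8s = importlib.import_module('sky.provision.kubernetes.instance')

    # The re-exported names are the very same function objects.
    print(ssh.run_instances is k8s.run_instances)      # True
    print(ssh.query_instances is k8s.query_instances)  # True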
sky/resources.py
CHANGED
@@ -452,7 +452,10 @@
     def repr_with_region_zone(self) -> str:
         region_str = ''
         if self.region is not None:
-            region_str = f', region={self.region}'
+            region_name = self.region
+            if self.region.startswith('ssh-'):
+                region_name = self.region.lstrip('ssh-')
+            region_str = f', region={region_name}'
         zone_str = ''
         if self.zone is not None:
             zone_str = f', zone={self.zone}'
sky/serve/server/core.py
CHANGED
@@ -14,6 +14,7 @@ from sky import backends
 from sky import exceptions
 from sky import execution
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.clouds.service_catalog import common as service_catalog_common
@@ -221,12 +222,14 @@ def up(
     # Since the controller may be shared among multiple users, launch the
     # controller with the API server's user hash.
     with common.with_server_user_hash():
-        controller_job_id, controller_handle = execution.launch(
-            task=controller_task,
-            cluster_name=controller_name,
-            retry_until_up=True,
-            _disable_controller_check=True,
-        )
+        with skypilot_config.local_active_workspace_ctx(
+                constants.SKYPILOT_DEFAULT_WORKSPACE):
+            controller_job_id, controller_handle = execution.launch(
+                task=controller_task,
+                cluster_name=controller_name,
+                retry_until_up=True,
+                _disable_controller_check=True,
+            )
 
     style = colorama.Style
     fore = colorama.Fore
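
In the hunk above, the controller launch is additionally wrapped in `skypilot_config.local_active_workspace_ctx(constants.SKYPILOT_DEFAULT_WORKSPACE)`, pinning the shared serve controller to the default workspace regardless of the caller's active workspace. The snippet below is not SkyPilot's implementation; it only illustrates the usual set-and-restore shape of such a context manager:

    import contextlib

    _active_workspace = 'team-a'  # hypothetical module-level state


    @contextlib.contextmanager
    def local_active_workspace_ctx(workspace: str):
        """Temporarily switch the active workspace, restoring it on exit."""
        global _active_workspace
        previous = _active_workspace
        _active_workspace = workspace
        try:
            yield
        finally:
            _active_workspace = previous


    with local_active_workspace_ctx('default'):
        print(_active_workspace)  # 'default' inside the block
    print(_active_workspace)      # 'team-a' is restored afterwards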
sky/server/html/token_page.html
CHANGED
@@ -49,6 +49,11 @@
       margin-bottom: 20px;
       color: #5f6368;
     }
+    .user-identifier {
+      font-size: 12px; /* Smaller font size */
+      color: #80868b; /* Lighter color */
+      margin-bottom: 8px; /* Adjusted margin */
+    }
     .code-block {
       background-color: #f1f3f4;
       border: 1px solid #dadce0;
@@ -110,8 +115,8 @@
     </svg>
   </div>
   <h1>Sign in to SkyPilot CLI</h1>
+  <p class="user-identifier">USER_PLACEHOLDER</p>
   <p>You are seeing this page because a SkyPilot command requires authentication.</p>
-
   <p>Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
   <div id="token-box" class="code-block">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
   <button id="copy-btn" class="copy-button">Copy Token</button>
sky/server/requests/executor.py
CHANGED
@@ -228,6 +228,7 @@ def override_request_env_and_config(
     """Override the environment and SkyPilot config for a request."""
     original_env = os.environ.copy()
     os.environ.update(request_body.env_vars)
+    # Note: may be overridden by AuthProxyMiddleware.
     user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                        name=request_body.env_vars[constants.USER_ENV_VAR])
     global_user_state.add_or_update_user(user)
sky/server/requests/payloads.py
CHANGED
@@ -88,6 +88,11 @@ class RequestBody(pydantic.BaseModel):
     using_remote_api_server: bool = False
     override_skypilot_config: Optional[Dict[str, Any]] = {}
 
+    # Allow extra fields in the request body, which is useful for backward
+    # compatibility, i.e., we can add new fields to the request body without
+    # breaking the existing old API server.
+    model_config = pydantic.ConfigDict(extra='allow')
+
     def __init__(self, **data):
         data['env_vars'] = data.get('env_vars', request_body_env_vars())
         usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
@@ -126,6 +131,7 @@ class CheckBody(RequestBody):
     """The request body for the check endpoint."""
     clouds: Optional[Tuple[str, ...]] = None
     verbose: bool = False
+    workspace: Optional[str] = None
 
 
 class DagRequestBody(RequestBody):
@@ -446,6 +452,7 @@ class RealtimeGpuAvailabilityRequestBody(RequestBody):
     context: Optional[str] = None
     name_filter: Optional[str] = None
     quantity_filter: Optional[int] = None
+    is_ssh: Optional[bool] = None
 
 
 class KubernetesNodeInfoRequestBody(RequestBody):
@@ -485,6 +492,12 @@ class LocalUpBody(RequestBody):
     password: Optional[str] = None
 
 
+class SSHUpBody(RequestBody):
+    """The request body for the SSH up/down endpoints."""
+    infra: Optional[str] = None
+    cleanup: bool = False
+
+
 class ServeTerminateReplicaBody(RequestBody):
     """The request body for the serve terminate replica endpoint."""
     service_name: str
@@ -518,3 +531,8 @@ class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
     missing_chunks: Optional[List[str]] = None
+
+
+class EnabledCloudsBody(RequestBody):
+    """The request body for the enabled clouds endpoint."""
+    workspace: Optional[str] = None
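
The `model_config = pydantic.ConfigDict(extra='allow')` added to `RequestBody` means request models tolerate fields they do not declare, so a newer client can send new fields (such as `workspace`) to an older server without triggering a validation error; this is standard pydantic v2 behavior. A small demonstration with a toy model, not one of SkyPilot's actual request bodies:

    import pydantic


    class ToyRequestBody(pydantic.BaseModel):
        # Mirrors the config added in the diff: unknown fields are kept, not rejected.
        model_config = pydantic.ConfigDict(extra='allow')
        verbose: bool = False


    body = ToyRequestBody(verbose=True, workspace='team-a')  # 'workspace' is undeclared
    print(body.verbose)      # True
    print(body.model_extra)  # {'workspace': 'team-a'}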