skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/backend_utils.py +62 -45
  4. sky/backends/cloud_vm_ray_backend.py +19 -5
  5. sky/check.py +398 -171
  6. sky/cli.py +302 -98
  7. sky/client/cli.py +302 -98
  8. sky/client/sdk.py +104 -12
  9. sky/clouds/__init__.py +3 -0
  10. sky/clouds/aws.py +4 -2
  11. sky/clouds/azure.py +4 -2
  12. sky/clouds/cloud.py +24 -6
  13. sky/clouds/cudo.py +2 -1
  14. sky/clouds/do.py +2 -1
  15. sky/clouds/fluidstack.py +2 -1
  16. sky/clouds/gcp.py +23 -5
  17. sky/clouds/ibm.py +4 -2
  18. sky/clouds/kubernetes.py +66 -22
  19. sky/clouds/lambda_cloud.py +2 -1
  20. sky/clouds/nebius.py +18 -2
  21. sky/clouds/oci.py +4 -2
  22. sky/clouds/paperspace.py +2 -1
  23. sky/clouds/runpod.py +2 -1
  24. sky/clouds/scp.py +2 -1
  25. sky/clouds/service_catalog/constants.py +1 -1
  26. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  27. sky/clouds/ssh.py +203 -0
  28. sky/clouds/vast.py +2 -1
  29. sky/clouds/vsphere.py +2 -1
  30. sky/core.py +58 -11
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  35. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  37. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  52. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
  53. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  54. sky/dashboard/out/clusters/[cluster].html +1 -1
  55. sky/dashboard/out/clusters.html +1 -1
  56. sky/dashboard/out/index.html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -0
  61. sky/dashboard/out/workspaces.html +1 -0
  62. sky/data/storage.py +1 -1
  63. sky/global_user_state.py +42 -19
  64. sky/jobs/constants.py +1 -1
  65. sky/jobs/server/core.py +72 -56
  66. sky/jobs/state.py +26 -5
  67. sky/jobs/utils.py +65 -13
  68. sky/optimizer.py +29 -7
  69. sky/provision/__init__.py +1 -0
  70. sky/provision/aws/instance.py +17 -1
  71. sky/provision/fluidstack/instance.py +1 -0
  72. sky/provision/kubernetes/instance.py +16 -5
  73. sky/provision/kubernetes/utils.py +37 -19
  74. sky/provision/nebius/instance.py +3 -1
  75. sky/provision/nebius/utils.py +14 -2
  76. sky/provision/ssh/__init__.py +18 -0
  77. sky/resources.py +4 -1
  78. sky/serve/server/core.py +9 -6
  79. sky/server/html/token_page.html +6 -1
  80. sky/server/requests/executor.py +1 -0
  81. sky/server/requests/payloads.py +18 -0
  82. sky/server/server.py +108 -5
  83. sky/setup_files/dependencies.py +1 -0
  84. sky/skylet/constants.py +4 -1
  85. sky/skypilot_config.py +83 -9
  86. sky/templates/nebius-ray.yml.j2 +12 -0
  87. sky/utils/cli_utils/status_utils.py +18 -8
  88. sky/utils/infra_utils.py +21 -1
  89. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  90. sky/utils/kubernetes/create_cluster.sh +1 -0
  91. sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
  92. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  93. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  94. sky/utils/log_utils.py +218 -1
  95. sky/utils/schemas.py +75 -0
  96. sky/utils/ux_utils.py +2 -1
  97. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
  98. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
  99. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  102. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  106. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  114. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  115. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  116. /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -23,6 +23,7 @@ from sky import backends
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
@@ -463,7 +464,8 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
 
 
 def cancel_jobs_by_id(job_ids: Optional[List[int]],
-                      all_users: bool = False) -> str:
+                      all_users: bool = False,
+                      current_workspace: Optional[str] = None) -> str:
     """Cancel jobs by id.
 
     If job_ids is None, cancel all jobs.
@@ -474,9 +476,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
     job_ids = list(set(job_ids))
     if not job_ids:
         return 'No job to cancel.'
-    job_id_str = ', '.join(map(str, job_ids))
-    logger.info(f'Cancelling jobs {job_id_str}.')
+    if current_workspace is None:
+        current_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
+
     cancelled_job_ids: List[int] = []
+    wrong_workspace_job_ids: List[int] = []
     for job_id in job_ids:
         # Check the status of the managed job status. If it is in
         # terminal state, we can safely skip it.
@@ -491,6 +495,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
 
         update_managed_jobs_statuses(job_id)
 
+        job_workspace = managed_job_state.get_workspace(job_id)
+        if current_workspace is not None and job_workspace != current_workspace:
+            wrong_workspace_job_ids.append(job_id)
+            continue
+
         # Send the signal to the jobs controller.
         signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
         # Filelock is needed to prevent race condition between signal
@@ -501,17 +510,30 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             f.flush()
         cancelled_job_ids.append(job_id)
 
+    wrong_workspace_job_str = ''
+    if wrong_workspace_job_ids:
+        plural = 's' if len(wrong_workspace_job_ids) > 1 else ''
+        plural_verb = 'are' if len(wrong_workspace_job_ids) > 1 else 'is'
+        wrong_workspace_job_str = (
+            f' Job{plural} with ID{plural}'
+            f' {", ".join(map(str, wrong_workspace_job_ids))} '
+            f'{plural_verb} skipped as they are not in the active workspace '
+            f'{current_workspace!r}. Check the workspace of the job with: '
+            f'sky jobs queue')
+
     if not cancelled_job_ids:
-        return 'No job to cancel.'
+        return f'No job to cancel.{wrong_workspace_job_str}'
     identity_str = f'Job with ID {cancelled_job_ids[0]} is'
     if len(cancelled_job_ids) > 1:
         cancelled_job_ids_str = ', '.join(map(str, cancelled_job_ids))
         identity_str = f'Jobs with IDs {cancelled_job_ids_str} are'
 
-    return f'{identity_str} scheduled to be cancelled.'
+    msg = f'{identity_str} scheduled to be cancelled.{wrong_workspace_job_str}'
+    return msg
 
 
-def cancel_job_by_name(job_name: str) -> str:
+def cancel_job_by_name(job_name: str,
+                       current_workspace: Optional[str] = None) -> str:
     """Cancel a job by name."""
     job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
     if not job_ids:
@@ -520,8 +542,8 @@ def cancel_job_by_name(job_name: str) -> str:
         return (f'{colorama.Fore.RED}Multiple running jobs found '
                 f'with name {job_name!r}.\n'
                 f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
-    cancel_jobs_by_id(job_ids)
-    return f'Job {job_name!r} is scheduled to be cancelled.'
+    msg = cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+    return f'{job_name!r} {msg}'
 
 
 def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
@@ -1020,10 +1042,15 @@ def format_job_table(
         jobs[get_hash(task)].append(task)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
+    workspaces = set()
     for job_tasks in jobs.values():
         managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
         if not managed_job_status.is_terminal():
             status_counts[managed_job_status.value] += 1
+        workspaces.add(job_tasks[0].get('workspace',
+                                        constants.SKYPILOT_DEFAULT_WORKSPACE))
+
+    show_workspace = len(workspaces) > 1 or show_all
 
     user_cols: List[str] = []
     if show_user:
@@ -1034,6 +1061,7 @@ def format_job_table(
     columns = [
         'ID',
         'TASK',
+        *(['WORKSPACE'] if show_workspace else []),
         'NAME',
         *user_cols,
         'REQUESTED',
@@ -1093,6 +1121,8 @@ def format_job_table(
     for job_hash, job_tasks in jobs.items():
         if show_all:
             schedule_state = job_tasks[0]['schedule_state']
+        workspace = job_tasks[0].get('workspace',
+                                     constants.SKYPILOT_DEFAULT_WORKSPACE)
 
         if len(job_tasks) > 1:
             # Aggregate the tasks into a new row in the table.
@@ -1134,6 +1164,7 @@ def format_job_table(
             job_values = [
                 job_id,
                 '',
+                *([''] if show_workspace else []),
                 job_name,
                 *user_values,
                 '-',
@@ -1163,9 +1194,11 @@ def format_job_table(
                 0, task['job_duration'], absolute=True)
             submitted = log_utils.readable_time_duration(task['submitted_at'])
             user_values = get_user_column_values(task)
+            task_workspace = '-' if len(job_tasks) > 1 else workspace
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
+                *([task_workspace] if show_workspace else []),
                 task['task_name'],
                 *user_values,
                 task['resources'],
@@ -1263,22 +1296,36 @@ class ManagedJobCodeGen:
     def cancel_jobs_by_id(cls,
                           job_ids: Optional[List[int]],
                           all_users: bool = False) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
         code = textwrap.dedent(f"""\
         if managed_job_version < 2:
             # For backward compatibility, since all_users is not supported
-            # before #4787. Assume th
+            # before #4787.
            # TODO(cooperc): Remove compatibility before 0.12.0
             msg = utils.cancel_jobs_by_id({job_ids})
-        else:
+        elif managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
             msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
+        else:
+            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users},
+                                          current_workspace={active_workspace!r})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)
 
     @classmethod
     def cancel_job_by_name(cls, job_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
         code = textwrap.dedent(f"""\
-        msg = utils.cancel_job_by_name({job_name!r})
+        if managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
+            msg = utils.cancel_job_by_name({job_name!r})
+        else:
+            msg = utils.cancel_job_by_name({job_name!r}, {active_workspace!r})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)
@@ -1314,11 +1361,16 @@ class ManagedJobCodeGen:
         return cls._build(code)
 
     @classmethod
-    def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag') -> str:
+    def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
+                    workspace) -> str:
         dag_name = managed_job_dag.name
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
-            managed_job_state.set_job_info({job_id}, {dag_name!r})
+            set_job_info_kwargs = {{'workspace': {workspace!r}}}
+            if managed_job_version < 4:
+                set_job_info_kwargs = {{}}
+            managed_job_state.set_job_info(
+                {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
         for task_id, task in enumerate(managed_job_dag.tasks):
             resources_str = backend_utils.get_task_resources_str(
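The cancellation changes above scope managed-job cancellation to the active workspace: jobs recorded under a different workspace are skipped and reported rather than cancelled. A minimal self-contained sketch of that filtering step (the helper name, DEFAULT_WORKSPACE constant, and sample data are illustrative, not SkyPilot APIs):

from typing import Iterable, List, Optional, Tuple

DEFAULT_WORKSPACE = 'default'  # stand-in for constants.SKYPILOT_DEFAULT_WORKSPACE

def split_jobs_by_workspace(
        jobs: Iterable[Tuple[int, str]],
        current_workspace: Optional[str] = None) -> Tuple[List[int], List[int]]:
    """Split (job_id, workspace) pairs into cancellable vs. skipped job ids."""
    if current_workspace is None:
        current_workspace = DEFAULT_WORKSPACE
    cancellable: List[int] = []
    wrong_workspace: List[int] = []
    for job_id, workspace in jobs:
        # Jobs submitted from another workspace are skipped, not cancelled.
        target = cancellable if workspace == current_workspace else wrong_workspace
        target.append(job_id)
    return cancellable, wrong_workspace

# Example: only job 1 belongs to the active workspace 'dev'.
print(split_jobs_by_workspace([(1, 'dev'), (2, 'prod')], 'dev'))  # ([1], [2])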
sky/optimizer.py CHANGED
@@ -14,6 +14,7 @@ from sky import clouds
 from sky import exceptions
 from sky import resources as resources_lib
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
@@ -21,6 +22,7 @@ from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
 from sky.utils import log_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
@@ -376,6 +378,10 @@ class Optimizer:
             if any(orig_resources.cloud is None
                    for orig_resources in node.resources):
                 source_hint = 'catalog and kubernetes cluster'
+            elif all(
+                    isinstance(orig_resources.cloud, clouds.SSH)
+                    for orig_resources in node.resources):
+                source_hint = 'node pool'
             elif all(
                     isinstance(orig_resources.cloud, clouds.Kubernetes)
                     for orig_resources in node.resources):
@@ -858,11 +864,19 @@ class Optimizer:
                 'accelerators': f'{resources.accelerators}',
                 'use_spot': resources.use_spot
             }
+
+            # Handle special case for Kubernetes and SSH clouds
             if isinstance(resources.cloud, clouds.Kubernetes):
-                # Region for Kubernetes is the context name, i.e. different
-                # Kubernetes clusters. We add region to the key to show all the
-                # Kubernetes clusters in the optimizer table for better UX.
+                # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
+                # context name, i.e. different Kubernetes clusters. We add
+                # region to the key to show all the Kubernetes clusters in the
+                # optimizer table for better UX.
+
+                if resources.cloud.__class__.__name__ == 'SSH':
+                    resource_key_dict[
+                        'cloud'] = 'SSH'  # Force the cloud name to be SSH
                 resource_key_dict['region'] = resources.region
+
             return json.dumps(resource_key_dict, sort_keys=True)
 
         # Print the list of resouces that the optimizer considered.
@@ -1204,9 +1218,11 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         clouds_to_check_again = list(clouds_need_recheck -
                                      global_disabled_clouds)
         if len(clouds_to_check_again) > 0:
-            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
-                                       quiet=True,
-                                       clouds=clouds_to_check_again)
+            sky_check.check_capability(
+                sky_cloud.CloudCapability.COMPUTE,
+                quiet=True,
+                clouds=clouds_to_check_again,
+                workspace=skypilot_config.get_active_workspace())
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
             capability=sky_cloud.CloudCapability.COMPUTE,
             raise_if_no_cloud_access=True)
@@ -1216,7 +1232,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         if disabled_clouds:
             is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
             task_name = f' {task.name!r}' if task.name is not None else ''
-            msg = (f'Task{task_name} requires {", ".join(disabled_clouds)} '
+            disabled_display_names = []
+            for c in disabled_clouds:
+                cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
+                if cloud_obj_one is not None:
+                    disabled_display_names.append(cloud_obj_one.display_name())
+            cloud_names = ', '.join(disabled_display_names)
+            msg = (f'Task{task_name} requires {cloud_names} '
                    f'which {is_or_are} not enabled. To enable access, change '
                    f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
                    f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
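The optimizer change above groups candidate resources by a JSON key and, for Kubernetes-like clouds, includes the region (context name) in that key so every Kubernetes cluster or SSH Node Pool gets its own row; SSH resources are also forced to display the cloud name 'SSH'. A simplified sketch of that grouping key, assuming plain string inputs rather than SkyPilot resource objects:

import json
from typing import Optional

def resource_group_key(cloud: str, accelerators: str, use_spot: bool,
                       region: Optional[str] = None) -> str:
    """Build the grouping key used to deduplicate optimizer table rows."""
    key = {'cloud': cloud, 'accelerators': accelerators, 'use_spot': use_spot}
    if cloud in ('Kubernetes', 'SSH'):
        # Region is the context name, so each cluster/node pool stays distinct.
        key['region'] = region
    return json.dumps(key, sort_keys=True)

# Two SSH node pools with the same GPU request remain separate rows.
print(resource_group_key('SSH', 'A100:1', False, region='pool-a'))
print(resource_group_key('SSH', 'A100:1', False, region='pool-b'))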
sky/provision/__init__.py CHANGED
@@ -23,6 +23,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
sky/provision/aws/instance.py CHANGED
@@ -836,7 +836,23 @@
 
     # For the case when every new ports is already opened.
     if ip_permissions:
-        sg.authorize_ingress(IpPermissions=ip_permissions)
+        # Filter out any permissions that already exist in the security group
+        existing_permissions = set()
+        for rule in sg.ip_permissions:
+            if rule['IpProtocol'] == 'tcp':
+                for ip_range in rule.get('IpRanges', []):
+                    if ip_range.get('CidrIp') == '0.0.0.0/0':
+                        existing_permissions.add(
+                            (rule['FromPort'], rule['ToPort']))
+
+        # Remove any permissions that already exist
+        filtered_permissions = []
+        for perm in ip_permissions:
+            if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
+                filtered_permissions.append(perm)
+
+        if filtered_permissions:
+            sg.authorize_ingress(IpPermissions=filtered_permissions)
 
 
 def cleanup_ports(
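The AWS change above skips ingress rules that are already present on the security group before calling authorize_ingress, so re-opening ports does not fail on duplicate permissions. A standalone sketch of that filtering step, using plain dicts in place of boto3 objects (the helper name and sample data are illustrative):

from typing import Dict, List, Set, Tuple

def drop_already_open_tcp_ports(existing_rules: List[Dict],
                                new_permissions: List[Dict]) -> List[Dict]:
    """Remove TCP permissions whose port range is already open to 0.0.0.0/0."""
    already_open: Set[Tuple[int, int]] = set()
    for rule in existing_rules:
        if rule.get('IpProtocol') == 'tcp':
            for ip_range in rule.get('IpRanges', []):
                if ip_range.get('CidrIp') == '0.0.0.0/0':
                    already_open.add((rule['FromPort'], rule['ToPort']))
    return [perm for perm in new_permissions
            if (perm['FromPort'], perm['ToPort']) not in already_open]

existing = [{'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
             'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}]
wanted = [{'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
           'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
          {'IpProtocol': 'tcp', 'FromPort': 8080, 'ToPort': 8080,
           'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}]
print(drop_already_open_tcp_ports(existing, wanted))  # only the 8080 rule remains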
sky/provision/fluidstack/instance.py CHANGED
@@ -26,6 +26,7 @@ logger = sky_logging.init_logger(__name__)
 
 def get_internal_ip(node_info: Dict[str, Any]) -> None:
     node_info['internal_ip'] = node_info['ip_address']
+
     private_key_path, _ = auth.get_or_generate_keys()
     runner = command_runner.SSHCommandRunner(
         (node_info['ip_address'], 22),
sky/provision/kubernetes/instance.py CHANGED
@@ -1265,6 +1265,8 @@
     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
 
     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
@@ -1274,15 +1276,24 @@
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
-            ctx = kubernetes_utils.get_current_kube_config_context_name()
+            if is_ssh:
+                node_pool = context.lstrip('ssh-') if context else ''
+                msg = (
+                    f'Cannot connect to SSH Node Pool {node_pool}. '
+                    'Please check if the SSH Node Pool is up and accessible. '
+                    'To debug, run `sky check ssh` to check the status of '
+                    'the SSH Node Pool.')
+            else:
+                ctx = kubernetes_utils.get_current_kube_config_context_name()
+                msg = (f'Network error - check if the {identity} in '
+                       f'context {ctx} is up and accessible.')
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query cluster {cluster_name_on_cloud!r} status. '
-                'Network error - check if the Kubernetes cluster in '
-                f'context {ctx} is up and accessible.') from None
+                f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+                msg) from None
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query Kubernetes cluster {cluster_name_on_cloud!r} '
+                f'Failed to query {identity} {cluster_name_on_cloud!r} '
                 f'status: {common_utils.format_exception(e)}')
 
     # Check if the pods are running or pending
sky/provision/kubernetes/utils.py CHANGED
@@ -1137,6 +1137,11 @@
     # support pollingthe clusters for autoscaling information, such as the
     # node pools configured etc.
 
+    is_ssh_node_pool = context.startswith('ssh-') if context else False
+    cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
+    context_display_name = context.lstrip('ssh-') if (
+        context and is_ssh_node_pool) else context
+
     autoscaler_type = get_autoscaler_type()
     if autoscaler_type is not None:
         # If autoscaler is set in config.yaml, override the label key and value
@@ -1176,13 +1181,17 @@
                 suffix = ''
                 if env_options.Options.SHOW_DEBUG_INFO.get():
                     suffix = f' Found node labels: {node_labels}'
-                raise exceptions.ResourcesUnavailableError(
-                    'Could not detect GPU labels in Kubernetes cluster. '
-                    'If this cluster has GPUs, please ensure GPU nodes have '
-                    'node labels of either of these formats: '
-                    f'{supported_formats}. Please refer to '
-                    'the documentation on how to set up node labels.'
-                    f'{suffix}')
+                msg = (f'Could not detect GPU labels in {cloud_name}.')
+                if not is_ssh_node_pool:
+                    msg += (' Run `sky check ssh` to debug.')
+                else:
+                    msg += (
+                        ' If this cluster has GPUs, please ensure GPU nodes have '
+                        'node labels of either of these formats: '
+                        f'{supported_formats}. Please refer to '
+                        'the documentation on how to set up node labels.')
+                msg += f'{suffix}'
+                raise exceptions.ResourcesUnavailableError(msg)
             else:
                 # Validate the label value on all nodes labels to ensure they are
                 # correctly setup and will behave as expected.
@@ -1193,7 +1202,7 @@
                         value)
                     if not is_valid:
                         raise exceptions.ResourcesUnavailableError(
-                            f'Node {node_name!r} in Kubernetes cluster has '
+                            f'Node {node_name!r} in {cloud_name} has '
                             f'invalid GPU label: {label}={value}. {reason}')
                 if check_mode:
                     # If check mode is enabled and we reached so far, we can
@@ -1257,10 +1266,10 @@
                     # TODO(Doyoung): Update the error message raised with the
                     # multi-host TPU support.
                     raise exceptions.ResourcesUnavailableError(
-                        'Could not find any node in the Kubernetes cluster '
+                        f'Could not find any node in the {cloud_name} '
                         f'with {acc_type}. Please ensure at least one node in the '
                         f'cluster has {acc_type} and node labels are setup '
-                        'correctly. Please refer to the documentration for more. '
+                        'correctly. Please refer to the documentation for more. '
                        f'{suffix}. Note that multi-host TPU podslices are '
                        'currently not unsupported.')
                 else:
@@ -1270,15 +1279,24 @@
             if env_options.Options.SHOW_DEBUG_INFO.get():
                 suffix = (' Available resources on the cluster: '
                           f'{cluster_resources}')
-            raise exceptions.ResourcesUnavailableError(
-                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
-                f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
-                ' contains GPUs, please ensure GPU drivers are installed on '
-                'the node. Check if the GPUs are setup correctly by running '
-                '`kubectl describe nodes` and looking for the '
-                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
-                'Please refer to the documentation on how to set up GPUs.'
-                f'{suffix}')
+            if is_ssh_node_pool:
+                msg = (
+                    f'Could not detect GPUs in SSH Node Pool '
+                    f'\'{context_display_name}\'. If this cluster contains '
+                    'GPUs, please ensure GPU drivers are installed on the node '
+                    'and re-run '
+                    f'`sky ssh up --infra {context_display_name}`. {suffix}')
+            else:
+                msg = (
+                    f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                    f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                    ' contains GPUs, please ensure GPU drivers are installed on '
+                    'the node. Check if the GPUs are setup correctly by running '
+                    '`kubectl describe nodes` and looking for the '
+                    f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                    'Please refer to the documentation on how to set up GPUs.'
+                    f'{suffix}')
+            raise exceptions.ResourcesUnavailableError(msg)
         assert False, 'This should not be reached'
 
 
sky/provision/nebius/instance.py CHANGED
@@ -134,7 +134,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             disk_size=config.node_config['DiskSize'],
             user_data=config.node_config['UserData'],
             associate_public_ip_address=(
-                not config.provider_config['use_internal_ips']))
+                not config.provider_config['use_internal_ips']),
+            filesystems=config.node_config.get('filesystems', []),
+        )
     except Exception as e:  # pylint: disable=broad-except
         logger.warning(f'run_instances error: {e}')
         raise
sky/provision/nebius/utils.py CHANGED
@@ -1,6 +1,6 @@
 """Nebius library wrapper for SkyPilot."""
 import time
-from typing import Any, Dict
+from typing import Any, Dict, List
 import uuid
 
 from sky import sky_logging
@@ -158,7 +158,8 @@ def start(instance_id: str) -> None:
 
 def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
            preset: str, region: str, image_family: str, disk_size: int,
-           user_data: str, associate_public_ip_address: bool) -> str:
+           user_data: str, associate_public_ip_address: bool,
+           filesystems: List[Dict[str, Any]]) -> str:
     # Each node must have a unique name to avoid conflicts between
     # multiple worker VMs. To ensure uniqueness,a UUID is appended
     # to the node name.
@@ -217,6 +218,16 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
                              f' seconds) while waiting for disk {disk_name}'
                              f' to be ready.')
 
+    filesystems_spec = []
+    if filesystems:
+        for fs in filesystems:
+            filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
+                mount_tag=fs['filesystem_mount_tag'],
+                attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
+                    fs['filesystem_attach_mode']],
+                existing_filesystem=nebius.compute().ExistingFilesystem(
+                    id=fs['filesystem_id'])))
+
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
     sub_net = service.list(nebius.vpc().ListSubnetsRequest(
         parent_id=project_id,)).wait()
@@ -237,6 +248,7 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
             cloud_init_user_data=user_data,
             resources=nebius.compute().ResourcesSpec(platform=platform,
                                                      preset=preset),
+            filesystems=filesystems_spec if filesystems_spec else None,
             network_interfaces=[
                 nebius.compute().NetworkInterfaceSpec(
                     subnet_id=sub_net.items[0].metadata.id,
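The Nebius changes above thread an optional filesystems list from the node config down to the launch call, where each entry becomes an AttachedFilesystemSpec. A hypothetical node config snippet showing the expected entry shape; only the three filesystem_* key names come from the diff, while the values (including the 'READ_WRITE' attach mode) are illustrative assumptions:

node_config = {
    'DiskSize': 256,
    'UserData': '#cloud-config\n',
    'filesystems': [{
        # Id of an existing Nebius filesystem to attach to the VM (made up).
        'filesystem_id': 'computefilesystem-xxxx',
        # Mount tag exposed to the guest OS (made up).
        'filesystem_mount_tag': 'data',
        # Name of an AttachedFilesystemSpec.AttachMode enum member (assumed).
        'filesystem_attach_mode': 'READ_WRITE',
    }],
}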
sky/provision/ssh/__init__.py ADDED
@@ -0,0 +1,18 @@
+"""SSH provisioner for SkyPilot.
+
+This module implements the provisioner interface for SSH targets.
+It reuses most of the functionality from the Kubernetes provisioner,
+since the SSH implementation is based on Kubernetes under the hood.
+"""
+
+from sky.provision.kubernetes.config import bootstrap_instances
+from sky.provision.kubernetes.instance import get_cluster_info
+from sky.provision.kubernetes.instance import get_command_runners
+from sky.provision.kubernetes.instance import query_instances
+from sky.provision.kubernetes.instance import run_instances
+from sky.provision.kubernetes.instance import stop_instances
+from sky.provision.kubernetes.instance import terminate_instances
+from sky.provision.kubernetes.instance import wait_instances
+from sky.provision.kubernetes.network import cleanup_ports
+from sky.provision.kubernetes.network import open_ports
+from sky.provision.kubernetes.network import query_ports
sky/resources.py CHANGED
@@ -452,7 +452,10 @@ class Resources:
     def repr_with_region_zone(self) -> str:
         region_str = ''
         if self.region is not None:
-            region_str = f', region={self.region}'
+            region_name = self.region
+            if self.region.startswith('ssh-'):
+                region_name = self.region.lstrip('ssh-')
+            region_str = f', region={region_name}'
         zone_str = ''
         if self.zone is not None:
             zone_str = f', zone={self.zone}'
sky/serve/server/core.py CHANGED
@@ -14,6 +14,7 @@ from sky import backends
 from sky import exceptions
 from sky import execution
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.clouds.service_catalog import common as service_catalog_common
@@ -221,12 +222,14 @@ def up(
     # Since the controller may be shared among multiple users, launch the
     # controller with the API server's user hash.
     with common.with_server_user_hash():
-        controller_job_id, controller_handle = execution.launch(
-            task=controller_task,
-            cluster_name=controller_name,
-            retry_until_up=True,
-            _disable_controller_check=True,
-        )
+        with skypilot_config.local_active_workspace_ctx(
+                constants.SKYPILOT_DEFAULT_WORKSPACE):
+            controller_job_id, controller_handle = execution.launch(
+                task=controller_task,
+                cluster_name=controller_name,
+                retry_until_up=True,
+                _disable_controller_check=True,
+            )
 
     style = colorama.Style
     fore = colorama.Fore
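The change above wraps the controller launch in skypilot_config.local_active_workspace_ctx(...), so the shared serve controller is always created under the default workspace regardless of which workspace issued the request. That helper's implementation is not shown in this diff; a rough sketch of how such a scoped override could be built with contextvars (all names here are illustrative):

import contextlib
import contextvars
from typing import Iterator

_active_workspace = contextvars.ContextVar('active_workspace', default='default')

@contextlib.contextmanager
def local_active_workspace_ctx(workspace: str) -> Iterator[None]:
    """Temporarily pin the active workspace for the enclosed block."""
    token = _active_workspace.set(workspace)
    try:
        yield
    finally:
        _active_workspace.reset(token)

with local_active_workspace_ctx('default'):
    # The controller launch would run here, seeing 'default' as the
    # active workspace even if the caller selected another one.
    assert _active_workspace.get() == 'default'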
sky/server/html/token_page.html CHANGED
@@ -49,6 +49,11 @@
       margin-bottom: 20px;
       color: #5f6368;
     }
+    .user-identifier {
+      font-size: 12px; /* Smaller font size */
+      color: #80868b; /* Lighter color */
+      margin-bottom: 8px; /* Adjusted margin */
+    }
     .code-block {
       background-color: #f1f3f4;
       border: 1px solid #dadce0;
@@ -110,8 +115,8 @@
       </svg>
     </div>
     <h1>Sign in to SkyPilot CLI</h1>
+    <p class="user-identifier">USER_PLACEHOLDER</p>
     <p>You are seeing this page because a SkyPilot command requires authentication.</p>
-
     <p>Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
     <div id="token-box" class="code-block">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
     <button id="copy-btn" class="copy-button">Copy Token</button>
sky/server/requests/executor.py CHANGED
@@ -228,6 +228,7 @@ def override_request_env_and_config(
     """Override the environment and SkyPilot config for a request."""
     original_env = os.environ.copy()
     os.environ.update(request_body.env_vars)
+    # Note: may be overridden by AuthProxyMiddleware.
     user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                        name=request_body.env_vars[constants.USER_ENV_VAR])
     global_user_state.add_or_update_user(user)
sky/server/requests/payloads.py CHANGED
@@ -88,6 +88,11 @@ class RequestBody(pydantic.BaseModel):
     using_remote_api_server: bool = False
     override_skypilot_config: Optional[Dict[str, Any]] = {}
 
+    # Allow extra fields in the request body, which is useful for backward
+    # compatibility, i.e., we can add new fields to the request body without
+    # breaking the existing old API server.
+    model_config = pydantic.ConfigDict(extra='allow')
+
     def __init__(self, **data):
         data['env_vars'] = data.get('env_vars', request_body_env_vars())
         usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
@@ -126,6 +131,7 @@ class CheckBody(RequestBody):
     """The request body for the check endpoint."""
     clouds: Optional[Tuple[str, ...]] = None
     verbose: bool = False
+    workspace: Optional[str] = None
 
 
 class DagRequestBody(RequestBody):
@@ -446,6 +452,7 @@ class RealtimeGpuAvailabilityRequestBody(RequestBody):
     context: Optional[str] = None
     name_filter: Optional[str] = None
     quantity_filter: Optional[int] = None
+    is_ssh: Optional[bool] = None
 
 
 class KubernetesNodeInfoRequestBody(RequestBody):
@@ -485,6 +492,12 @@ class LocalUpBody(RequestBody):
     password: Optional[str] = None
 
 
+class SSHUpBody(RequestBody):
+    """The request body for the SSH up/down endpoints."""
+    infra: Optional[str] = None
+    cleanup: bool = False
+
+
 class ServeTerminateReplicaBody(RequestBody):
     """The request body for the serve terminate replica endpoint."""
     service_name: str
@@ -518,3 +531,8 @@ class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
     missing_chunks: Optional[List[str]] = None
+
+
+class EnabledCloudsBody(RequestBody):
+    """The request body for the enabled clouds endpoint."""
+    workspace: Optional[str] = None
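The RequestBody change above relies on pydantic's extra='allow' so that newer clients can send fields an older API server does not know about without the request being rejected. A minimal illustration of that behavior (the model and field names are made up for the example):

import pydantic

class ExampleBody(pydantic.BaseModel):
    """Server-side model that tolerates unknown fields from newer clients."""
    model_config = pydantic.ConfigDict(extra='allow')
    cluster_name: str

# A newer client adds a 'workspace' field; validation still succeeds and the
# extra field is retained instead of raising a ValidationError.
body = ExampleBody(cluster_name='my-cluster', workspace='team-a')
print(body.cluster_name)   # 'my-cluster'
print(body.model_extra)    # {'workspace': 'team-a'}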