skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +99 -16
  3. sky/authentication.py +54 -7
  4. sky/backends/backend_utils.py +35 -22
  5. sky/backends/cloud_vm_ray_backend.py +30 -15
  6. sky/check.py +1 -1
  7. sky/cli.py +20 -8
  8. sky/client/cli.py +20 -8
  9. sky/client/oauth.py +82 -0
  10. sky/client/sdk.py +60 -10
  11. sky/clouds/nebius.py +55 -14
  12. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
  18. sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
  21. sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
  26. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
  29. sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
  35. sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
  36. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  37. sky/dashboard/out/clusters/[cluster].html +1 -1
  38. sky/dashboard/out/clusters.html +1 -1
  39. sky/dashboard/out/config.html +1 -1
  40. sky/dashboard/out/index.html +1 -1
  41. sky/dashboard/out/infra/[context].html +1 -0
  42. sky/dashboard/out/infra.html +1 -1
  43. sky/dashboard/out/jobs/[job].html +1 -1
  44. sky/dashboard/out/jobs.html +1 -1
  45. sky/dashboard/out/users.html +1 -1
  46. sky/dashboard/out/workspace/new.html +1 -1
  47. sky/dashboard/out/workspaces/[name].html +1 -1
  48. sky/dashboard/out/workspaces.html +1 -1
  49. sky/exceptions.py +11 -1
  50. sky/global_user_state.py +149 -1
  51. sky/jobs/client/sdk.py +1 -0
  52. sky/jobs/constants.py +3 -1
  53. sky/jobs/controller.py +3 -5
  54. sky/jobs/recovery_strategy.py +148 -102
  55. sky/jobs/scheduler.py +23 -8
  56. sky/jobs/server/core.py +16 -0
  57. sky/jobs/state.py +153 -39
  58. sky/jobs/utils.py +33 -5
  59. sky/provision/kubernetes/utils.py +2 -1
  60. sky/provision/provisioner.py +15 -10
  61. sky/resources.py +16 -1
  62. sky/serve/controller.py +10 -7
  63. sky/serve/replica_managers.py +22 -18
  64. sky/serve/service.py +5 -4
  65. sky/server/common.py +11 -4
  66. sky/server/html/token_page.html +32 -6
  67. sky/server/server.py +3 -1
  68. sky/server/stream_utils.py +21 -0
  69. sky/setup_files/dependencies.py +7 -1
  70. sky/skylet/constants.py +1 -1
  71. sky/task.py +26 -0
  72. sky/templates/jobs-controller.yaml.j2 +2 -1
  73. sky/templates/kubernetes-ray.yml.j2 +19 -1
  74. sky/utils/common_utils.py +66 -0
  75. sky/utils/rich_utils.py +5 -0
  76. sky/utils/schemas.py +32 -1
  77. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
  78. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
  79. sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
  83. sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
  84. sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
  90. sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
  92. sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
  93. /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
  94. /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
  95. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
  96. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
  97. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
  98. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/serve/controller.py CHANGED
@@ -42,12 +42,13 @@ class SkyServeController:
     """

     def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
-                 task_yaml: str, host: str, port: int) -> None:
+                 service_task_yaml: str, host: str, port: int) -> None:
         self._service_name = service_name
         self._replica_manager: replica_managers.ReplicaManager = (
-            replica_managers.SkyPilotReplicaManager(service_name=service_name,
-                                                    spec=service_spec,
-                                                    task_yaml_path=task_yaml))
+            replica_managers.SkyPilotReplicaManager(
+                service_name=service_name,
+                spec=service_spec,
+                service_task_yaml_path=service_task_yaml))
         self._autoscaler: autoscalers.Autoscaler = (
             autoscalers.Autoscaler.from_spec(service_name, service_spec))
         self._host = host
@@ -240,7 +241,9 @@ class SkyServeController:
 # TODO(tian): Probably we should support service that will stop the VM in
 # specific time period.
 def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
-                   task_yaml: str, controller_host: str, controller_port: int):
-    controller = SkyServeController(service_name, service_spec, task_yaml,
-                                    controller_host, controller_port)
+                   service_task_yaml: str, controller_host: str,
+                   controller_port: int):
+    controller = SkyServeController(service_name, service_spec,
+                                    service_task_yaml, controller_host,
+                                    controller_port)
     controller.run()
sky/serve/replica_managers.py CHANGED
@@ -58,7 +58,7 @@ _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
 def launch_cluster(replica_id: int,
-                   task_yaml_path: str,
+                   service_task_yaml_path: str,
                    cluster_name: str,
                    resources_override: Optional[Dict[str, Any]] = None,
                    retry_until_up: bool = True,
@@ -78,7 +78,8 @@ def launch_cluster(replica_id: int,
                 f'{cluster_name} with resources override: '
                 f'{resources_override}')
    try:
-        config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
+        config = common_utils.read_yaml(
+            os.path.expanduser(service_task_yaml_path))
        task = sky.Task.from_yaml_config(config)
        if resources_override is not None:
            resources = task.resources
@@ -173,9 +174,9 @@ def terminate_cluster(cluster_name: str,
        time.sleep(gap_seconds)


-def _get_resources_ports(task_yaml: str) -> str:
+def _get_resources_ports(service_task_yaml_path: str) -> str:
    """Get the resources ports used by the task."""
-    task = sky.Task.from_yaml(task_yaml)
+    task = sky.Task.from_yaml(service_task_yaml_path)
    # Already checked all ports are valid in sky.serve.core.up
    assert task.resources, task
    assert task.service is not None, task
@@ -183,7 +184,7 @@ _get_resources_ports(task_yaml: str) -> str:
    return task.service.ports


-def _should_use_spot(task_yaml: str,
+def _should_use_spot(service_task_yaml_path: str,
                     resource_override: Optional[Dict[str, Any]]) -> bool:
    """Get whether the task should use spot."""
    if resource_override is not None:
@@ -191,7 +192,7 @@ _should_use_spot(task_yaml: str,
        if use_spot_override is not None:
            assert isinstance(use_spot_override, bool)
            return use_spot_override
-    task = sky.Task.from_yaml(task_yaml)
+    task = sky.Task.from_yaml(service_task_yaml_path)
    spot_use_resources = [
        resources for resources in task.resources if resources.use_spot
    ]
@@ -634,10 +635,10 @@ class SkyPilotReplicaManager(ReplicaManager):
    """

    def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
-                 task_yaml_path: str) -> None:
+                 service_task_yaml_path: str) -> None:
        super().__init__(service_name, spec)
-        self._task_yaml_path = task_yaml_path
-        task = sky.Task.from_yaml(task_yaml_path)
+        self.service_task_yaml_path = service_task_yaml_path
+        task = sky.Task.from_yaml(service_task_yaml_path)
        self._spot_placer: Optional[spot_placer.SpotPlacer] = (
            spot_placer.SpotPlacer.from_task(spec, task))
        # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -714,7 +715,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            self._service_name, replica_id)
        log_file_name = serve_utils.generate_replica_launch_log_file_name(
            self._service_name, replica_id)
-        use_spot = _should_use_spot(self._task_yaml_path, resources_override)
+        use_spot = _should_use_spot(self.service_task_yaml_path,
+                                    resources_override)
        retry_until_up = True
        location = None
        if use_spot and self._spot_placer is not None:
@@ -742,10 +744,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                launch_cluster,
                log_file_name,
            ).run,
-            args=(replica_id, self._task_yaml_path, cluster_name,
+            args=(replica_id, self.service_task_yaml_path, cluster_name,
                  resources_override, retry_until_up),
        )
-        replica_port = _get_resources_ports(self._task_yaml_path)
+        replica_port = _get_resources_ports(self.service_task_yaml_path)

        info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
                           location, self.latest_version, resources_override)
@@ -1290,11 +1292,11 @@ class SkyPilotReplicaManager(ReplicaManager):
            logger.error(f'Invalid version: {version}, '
                         f'latest version: {self.latest_version}')
            return
-        task_yaml_path = serve_utils.generate_task_yaml_file_name(
+        service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
            self._service_name, version)
        serve_state.add_or_update_version(self._service_name, version, spec)
        self.latest_version = version
-        self._task_yaml_path = task_yaml_path
+        self.service_task_yaml_path = service_task_yaml_path
        self._update_mode = update_mode

        # Reuse all replicas that have the same config as the new version
@@ -1302,7 +1304,8 @@ class SkyPilotReplicaManager(ReplicaManager):
        # the latest version. This can significantly improve the speed
        # for updating an existing service with only config changes to the
        # service specs, e.g. scale down the service.
-        new_config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
+        new_config = common_utils.read_yaml(
+            os.path.expanduser(service_task_yaml_path))
        # Always create new replicas and scale down old ones when file_mounts
        # are not empty.
        if new_config.get('file_mounts', None) != {}:
@@ -1313,10 +1316,11 @@ class SkyPilotReplicaManager(ReplicaManager):
        for info in replica_infos:
            if info.version < version and not info.is_terminal:
                # Assume user does not change the yaml file on the controller.
-                old_task_yaml_path = serve_utils.generate_task_yaml_file_name(
-                    self._service_name, info.version)
+                old_service_task_yaml_path = (
+                    serve_utils.generate_task_yaml_file_name(
+                        self._service_name, info.version))
                old_config = common_utils.read_yaml(
-                    os.path.expanduser(old_task_yaml_path))
+                    os.path.expanduser(old_service_task_yaml_path))
                for key in ['service']:
                    old_config.pop(key)
                # Bump replica version if all fields except for service are
sky/serve/service.py CHANGED
@@ -186,7 +186,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):

    service_dir = os.path.expanduser(
        serve_utils.generate_remote_service_dir_name(service_name))
-    task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
+    service_task_yaml = serve_utils.generate_task_yaml_file_name(
+        service_name, version)

    if not is_recovery:
        if (len(serve_state.get_services()) >=
@@ -218,7 +219,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
        # don't want the new file mounts to overwrite the old one, so we
        # sync to a tmp file first and then copy it to the final name
        # if there is no name conflict.
-        shutil.copy(tmp_task_yaml, task_yaml)
+        shutil.copy(tmp_task_yaml, service_task_yaml)

    controller_process = None
    load_balancer_process = None
@@ -249,8 +250,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
        controller_host = _get_controller_host()
        controller_process = multiprocessing.Process(
            target=controller.run_controller,
-            args=(service_name, service_spec, task_yaml, controller_host,
-                  controller_port))
+            args=(service_name, service_spec, service_task_yaml,
+                  controller_host, controller_port))
        controller_process.start()

        if not is_recovery:
sky/server/common.py CHANGED
@@ -159,7 +159,8 @@ def get_server_url(host: Optional[str] = None) -> str:


 @annotations.lru_cache(scope='global')
-def get_dashboard_url(server_url: str) -> str:
+def get_dashboard_url(server_url: str,
+                      starting_page: Optional[str] = None) -> str:
    # The server_url may include username or password with the
    # format of https://username:password@example.com:8080/path
    # We need to remove the username and password and only
@@ -172,7 +173,10 @@ def get_dashboard_url(server_url: str) -> str:
    if parsed.path:
        dashboard_url = f'{dashboard_url}{parsed.path}'
    dashboard_url = dashboard_url.rstrip('/')
-    return f'{dashboard_url}/dashboard'
+    dashboard_url = f'{dashboard_url}/dashboard'
+    if starting_page:
+        dashboard_url = f'{dashboard_url}/{starting_page}'
+    return dashboard_url


 @annotations.lru_cache(scope='global')
@@ -529,10 +533,13 @@ check_server_healthy_or_start_fn(deploy: bool = False,
    api_server_status = None
    try:
        api_server_status = check_server_healthy()
+        if api_server_status == ApiServerStatus.NEEDS_AUTH:
+            endpoint = get_server_url()
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ApiServerAuthenticationError(endpoint)
    except exceptions.ApiServerConnectionError as exc:
        endpoint = get_server_url()
-        if (not is_api_server_local() or
-                api_server_status == ApiServerStatus.NEEDS_AUTH):
+        if not is_api_server_local():
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ApiServerConnectionError(endpoint) from exc
    # Lock to prevent multiple processes from starting the server at the
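
Note: the new starting_page argument simply appends a sub-path after the /dashboard suffix, so callers can deep-link into a specific dashboard page. A minimal usage sketch based on the change above (the example URL and page name are illustrative, not taken from the codebase):

    from sky.server import common

    # Per the diff, credentials are stripped from the server URL and
    # '/dashboard' plus the optional starting page are appended.
    url = common.get_dashboard_url('https://user:pass@example.com:46580',
                                   starting_page='jobs')
    # -> something like 'https://example.com:46580/dashboard/jobs'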
sky/server/html/token_page.html CHANGED
@@ -100,6 +100,9 @@
            color: #5f6368;
            margin-top: 30px;
        }
+        .local-port-info {
+            display: none;
+        }
    </style>
 </head>
 <body>
@@ -114,14 +117,18 @@
            <path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
        </svg>
    </div>
-    <h1>Sign in to SkyPilot CLI</h1>
+    <h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
+    <h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
    <p class="user-identifier">USER_PLACEHOLDER</p>
-    <p>You are seeing this page because a SkyPilot command requires authentication.</p>
-    <p>Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
-    <div id="token-box" class="code-block">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
-    <button id="copy-btn" class="copy-button">Copy Token</button>
+    <!-- display token info by default -->
+    <p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
+    <p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
+    <div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
+    <button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
+    <p class="footer-text no-local-port">You can close this tab after copying the token.</p>

-    <p class="footer-text">You can close this tab after copying the token.</p>
+    <!-- don't display local port info unless successful -->
+    <p class="local-port-info">You can now close this tab.</p>
 </div>

 <script>
@@ -154,6 +161,25 @@
            copyBtn.textContent = 'Copy Token';
        }, 2000);
    });
+
+    function hideTokenInfo() {
+        const noLocalPortElems = document.querySelectorAll('.no-local-port');
+        noLocalPortElems.forEach(elem => {
+            elem.style.display = 'none';
+        });
+        const localPortInfoElems = document.querySelectorAll('.local-port-info');
+        localPortInfoElems.forEach(elem => {
+            elem.classList.remove('local-port-info');
+        });
+    }
+
+    if (window.location.search.includes('local_port=')) {
+        const uri = `http://localhost:${window.location.search.split('local_port=')[1]}`;
+        fetch(uri, {
+            method: 'POST',
+            body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
+        }).then(hideTokenInfo)
+    }
 </script>
 </body>
 </html>
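
The new script changes the token hand-off: when the page is opened with a ?local_port=<port> query parameter, it POSTs the token to a listener on http://localhost:<port> and swaps the copy-token UI for a success message. The listener itself ships in the new sky/client/oauth.py (not part of the hunks shown here); the snippet below is only a hedged sketch of what such a receiving end could look like, not SkyPilot's implementation:

    import http.server

    class TokenHandler(http.server.BaseHTTPRequestHandler):
        def do_POST(self):
            # Read the token body sent by the page's fetch() call above.
            length = int(self.headers.get('Content-Length', 0))
            token = self.rfile.read(length).decode('utf-8')
            print(f'Received token of length {len(token)}')
            self.send_response(200)
            # The page is served from the API server's origin, so the browser
            # needs a permissive CORS header to treat the POST as successful.
            self.send_header('Access-Control-Allow-Origin', '*')
            self.end_headers()

    # Example: handle a single request on the port passed as local_port.
    # http.server.HTTPServer(('127.0.0.1', 8000), TokenHandler).handle_request()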
sky/server/server.py CHANGED
@@ -272,7 +272,9 @@ app.include_router(workspaces_rest.router,


 @app.get('/token')
-async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
+async def token(request: fastapi.Request,
+                local_port: Optional[int] = None) -> fastapi.responses.Response:
+    del local_port  # local_port is used by the served js, but ignored by server
    user = _get_auth_user_header(request)

    token_data = {
sky/server/stream_utils.py CHANGED
@@ -15,6 +15,8 @@ from sky.utils import rich_utils

 logger = sky_logging.init_logger(__name__)

+_HEARTBEAT_INTERVAL = 30
+

 async def _yield_log_file_with_payloads_skipped(
        log_file) -> AsyncGenerator[str, None]:
@@ -90,6 +92,8 @@ async def log_streamer(request_id: Optional[str],
        for line_str in lines:
            yield line_str

+    last_heartbeat_time = asyncio.get_event_loop().time()
+
    while True:
        # Sleep 0 to yield control to allow other coroutines to run,
        # while keeps the loop tight to make log stream responsive.
@@ -106,15 +110,32 @@
                    break
            if not follow:
                break
+
+            current_time = asyncio.get_event_loop().time()
+            if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
+                # Currently just used to keep the connection busy, refer to
+                # https://github.com/skypilot-org/skypilot/issues/5750 for
+                # more details.
+                yield message_utils.encode_payload(
+                    rich_utils.Control.HEARTBEAT.encode(''))
+                last_heartbeat_time = current_time
+
            # Sleep shortly to avoid storming the DB and CPU, this has
            # little impact on the responsivness here since we are waiting
            # for a new line to come in.
            await asyncio.sleep(0.1)
            continue
+
+        # Refresh the heartbeat time, this is a trivial optimization for
+        # performance but it helps avoid unnecessary heartbeat strings
+        # being printed when the client runs in an old version.
+        last_heartbeat_time = asyncio.get_event_loop().time()
        line_str = line.decode('utf-8')
        if plain_logs:
            is_payload, line_str = message_utils.decode_payload(
                line_str, raise_for_mismatch=False)
+            # TODO(aylei): implement heartbeat mechanism for plain logs,
+            # sending invisible characters might be okay.
            if is_payload:
                continue
        yield line_str
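
The heartbeats are ordinary control payloads, sent only while a followed log stream is idle, so clients that understand rich control messages keep their output unchanged. A small sketch of the payload produced every _HEARTBEAT_INTERVAL (30) seconds, using only helpers that appear in this diff:

    from sky.utils import message_utils
    from sky.utils import rich_utils

    # The server yields this string on an otherwise idle stream; the decoder
    # change in sky/utils/rich_utils.py below recognizes Control.HEARTBEAT
    # and silently drops it, so nothing is shown to the user.
    heartbeat = message_utils.encode_payload(
        rich_utils.Control.HEARTBEAT.encode(''))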
sky/setup_files/dependencies.py CHANGED
@@ -118,7 +118,13 @@ extras_require: Dict[str, List[str]] = {
    # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
    # parameter for stopping instances. Reference:
    # https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
-    'gcp': ['google-api-python-client>=2.69.0', 'google-cloud-storage'],
+    'gcp': [
+        'google-api-python-client>=2.69.0',
+        'google-cloud-storage',
+        # see https://github.com/conda/conda/issues/13619
+        # see https://github.com/googleapis/google-api-python-client/issues/2554
+        'pyopenssl >= 23.2.0, <24.3.0',
+    ],
    'ibm': [
        'ibm-cloud-sdk-core',
        'ibm-vpc',
sky/skylet/constants.py CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '12'
+SKYLET_VERSION = '13'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/task.py CHANGED
@@ -292,6 +292,8 @@ class Task:
        self.resources: Union[List[sky.Resources],
                              Set[sky.Resources]] = {sky.Resources()}
        self._service: Optional[service_spec.SkyServiceSpec] = None
+        # The priority of the managed job running this task.
+        self._job_priority: Optional[int] = None
        # Resources that this task cannot run on.
        self.blocked_resources = blocked_resources

@@ -629,6 +631,10 @@ class Task:
            service = service_spec.SkyServiceSpec.from_yaml_config(service)
            task.set_service(service)

+        job = config.pop('job', None)
+        if job is not None and 'priority' in job:
+            task.set_job_priority(job['priority'])
+
        assert not config, f'Invalid task args: {config.keys()}'
        return task

@@ -831,6 +837,23 @@ class Task:
        self._service = service
        return self

+    @property
+    def job_priority(self) -> Optional[int]:
+        """The priority of the managed job running this task."""
+        return self._job_priority
+
+    def set_job_priority(self, priority: int) -> 'Task':
+        """Sets the job priority for this task.
+
+        Args:
+            priority: an integer between 0 and 1000.
+
+        Returns:
+            self: The current task, with job priority set.
+        """
+        self._job_priority = priority
+        return self
+
    def set_time_estimator(self, func: Callable[['sky.Resources'],
                                                int]) -> 'Task':
        """Sets a func mapping resources to estimated time (secs).
@@ -1274,6 +1297,9 @@ class Task:
        if self.service is not None:
            add_if_not_none('service', self.service.to_yaml_config())

+        if self.job_priority is not None:
+            add_if_not_none('job', {'priority': self.job_priority})
+
        add_if_not_none('num_nodes', self.num_nodes)

        if self.inputs is not None:
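
Together with the jobs-controller template and schema changes below, this adds an optional job.priority field (an integer from 0 to 1000) that round-trips through task YAML. A short, hedged example of the new surface (the value 500 and the minimal config are illustrative only):

    import sky

    task = sky.Task.from_yaml_config({
        'run': 'echo hello',
        'job': {'priority': 500},  # must be within [0, 1000] per the schema
    })
    assert task.job_priority == 500

    # Equivalent programmatic form added in this diff:
    task = sky.Task(run='echo hello').set_job_priority(500)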
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -66,7 +66,8 @@ run: |
   # managed_job_codegen.set_pending() before we get here.
   python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
     --job-id $SKYPILOT_INTERNAL_JOB_ID \
-    --env-file {{remote_env_file_path}}
+    --env-file {{remote_env_file_path}} \
+    --priority {{priority}}


 envs:
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -395,6 +395,13 @@ available_node_types:
            # STEP 1: Run apt update, install missing packages, and set up ssh.
            (
              (
+                # For backwards compatibility, we put a marker file in the pod
+                # to indicate that the apt ssh setup step will write a completion
+                # marker file (/tmp/apt_ssh_setup_complete) to the pod.
+                # TODO: Remove this marker file and its usage in setup_commands
+                # after v0.11.0 release.
+                touch /tmp/apt_ssh_setup_started
+
                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
                  echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
                # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
@@ -402,7 +409,7 @@ available_node_types:
                PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";

                # Separate packages into two groups: packages that are installed first
-                # so that curl, rsync and wget are available sooner to unblock the following
+                # so that curl, rsync, ssh and wget are available sooner to unblock the following
                # conda installation and rsync.
                # Also, we install fuse first to avoid confliction with fuse3.
                set -e
@@ -494,6 +501,8 @@ available_node_types:
                $(prefix_cmd) service ssh restart;
                $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;

+                touch /tmp/apt_ssh_setup_complete
+                echo "=== SSH setup completed ==="
              ) > /tmp/${STEPS[0]}.log 2>&1 || {
                echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
                cat /tmp/${STEPS[0]}.log
@@ -791,6 +800,15 @@ setup_commands:
  {%- endfor %}
  STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
  start_epoch=$(date +%s);
+
+  # Wait for SSH setup to complete before proceeding
+  if [ -f /tmp/apt_ssh_setup_started ]; then
+    echo "=== Logs for asynchronous SSH setup ===";
+    [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
+      { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+    [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
+  fi
+
  echo "=== Logs for asynchronous ray and skypilot installation ===";
  if [ -f /tmp/skypilot_is_nimbus ]; then
    echo "=== Logs for asynchronous ray and skypilot installation ===";
sky/utils/common_utils.py CHANGED
@@ -324,9 +324,75 @@ def get_pretty_entrypoint_cmd() -> str:
    # Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
    # things like 'examples/app.py'.
    argv[0] = basename
+
+    # Redact sensitive environment variable values
+    argv = _redact_env_values(argv)
+
    return ' '.join(argv)


+def _redact_env_values(argv: List[str]) -> List[str]:
+    """Redact sensitive values from --env arguments.
+
+    Args:
+        argv: Command line arguments
+
+    Returns:
+        Modified argv with redacted --env values, or original argv if any error
+
+    Examples:
+        ['sky', 'launch', '--env', 'HF_TOKEN=secret'] ->
+        ['sky', 'launch', '--env', 'HF_TOKEN=<redacted>']
+
+        ['sky', 'launch', '--env=HF_TOKEN=secret'] ->
+        ['sky', 'launch', '--env=HF_TOKEN=<redacted>']
+
+        ['sky', 'launch', '--env', 'HF_TOKEN'] ->
+        ['sky', 'launch', '--env', 'HF_TOKEN'] (no change)
+    """
+    try:
+        if not argv:
+            return argv or []
+
+        result = []
+        i = 0
+
+        while i < len(argv):
+            arg = argv[i]
+
+            # Ensure arg is a string
+            if not isinstance(arg, str):
+                result.append(arg)
+                i += 1
+                continue
+
+            if arg == '--env' and i + 1 < len(argv):
+                result.append(arg)
+                next_arg = argv[i + 1]
+                # Ensure next_arg is a string and handle redaction safely
+                if isinstance(next_arg, str):
+                    redacted = re.sub(r'^([^=]+)=.*', r'\1=<redacted>',
+                                      next_arg)
+                    result.append(redacted)
+                else:
+                    result.append(next_arg)
+                i += 2
+            elif arg.startswith('--env='):
+                # Redact only if there's a value after the key
+                redacted = re.sub(r'^(--env=[^=]+)=.*', r'\1=<redacted>', arg)
+                result.append(redacted)
+                i += 1
+            else:
+                result.append(arg)
+                i += 1
+
+        return result
+    except Exception:  # pylint: disable=broad-except
+        # If anything goes wrong with redaction, return original argv
+        # This ensures the command can still execute
+        return argv or []
+
+
 def user_and_hostname_hash() -> str:
    """Returns a string containing <user>-<hostname hash last 4 chars>.

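
In practice this means secrets passed on the command line via --env no longer appear verbatim in the pretty-printed entrypoint string. Illustration using the module-private helper added above (shown only to demonstrate the behavior documented in its docstring):

    from sky.utils import common_utils

    argv = ['sky', 'launch', '--env', 'HF_TOKEN=secret',
            '--env=WANDB_API_KEY=abc123', 'task.yaml']
    print(common_utils._redact_env_values(argv))
    # ['sky', 'launch', '--env', 'HF_TOKEN=<redacted>',
    #  '--env=WANDB_API_KEY=<redacted>', 'task.yaml']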
sky/utils/rich_utils.py CHANGED
@@ -57,6 +57,7 @@ class Control(enum.Enum):
    STOP = 'rich_stop'
    EXIT = 'rich_exit'
    UPDATE = 'rich_update'
+    HEARTBEAT = 'heartbeat'

    def encode(self, msg: str) -> str:
        return f'<{self.value}>{msg}</{self.value}>'
@@ -385,6 +386,10 @@ def decode_rich_status(
                    decoding_status.__exit__(None, None, None)
                elif control == Control.START:
                    decoding_status.start()
+                elif control == Control.HEARTBEAT:
+                    # Heartbeat is not displayed to the user, so we do not
+                    # need to update the status.
+                    pass
    finally:
        if decoding_status is not None:
            decoding_status.__exit__(None, None, None)
sky/utils/schemas.py CHANGED
@@ -646,6 +646,18 @@ def get_task_schema():
            'service': {
                'type': 'object',
            },
+            'job': {
+                'type': 'object',
+                'required': [],
+                'additionalProperties': False,
+                'properties': {
+                    'priority': {
+                        'type': 'integer',
+                        'minimum': 0,
+                        'maximum': 1000,
+                    },
+                },
+            },
            'setup': {
                'type': 'string',
            },
@@ -1096,6 +1108,9 @@ def get_config_schema():
                'required': [],
                'properties': {
                    **_NETWORK_CONFIG_SCHEMA,
+                    'tenant_id': {
+                        'type': 'string',
+                    },
                },
                'additionalProperties': {
                    'type': 'object',
@@ -1200,7 +1215,7 @@ def get_config_schema():
    # all clouds except gcp, kubernetes, ssh
    not_supported_clouds = [
        cloud for cloud in allowed_workspace_cloud_names
-        if cloud.lower() not in ['gcp', 'kubernetes', 'ssh']
+        if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
    ]
    not_supported_cloud_regex = '|'.join(not_supported_clouds)
    workspaces_schema = {
@@ -1269,6 +1284,22 @@ def get_config_schema():
                },
                'additionalProperties': False,
            },
+            'nebius': {
+                'type': 'object',
+                'required': [],
+                'properties': {
+                    'credentials_file_path': {
+                        'type': 'string',
+                    },
+                    'tenant_id': {
+                        'type': 'string',
+                    },
+                    'disabled': {
+                        'type': 'boolean'
+                    },
+                },
+                'additionalProperties': False,
+            },
        },
    },
 }