skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +99 -16
- sky/authentication.py +54 -7
- sky/backends/backend_utils.py +35 -22
- sky/backends/cloud_vm_ray_backend.py +30 -15
- sky/check.py +1 -1
- sky/cli.py +20 -8
- sky/client/cli.py +20 -8
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/nebius.py +55 -14
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
- sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
- sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
- sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -1
- sky/global_user_state.py +149 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +153 -39
- sky/jobs/utils.py +33 -5
- sky/provision/kubernetes/utils.py +2 -1
- sky/provision/provisioner.py +15 -10
- sky/resources.py +16 -1
- sky/serve/controller.py +10 -7
- sky/serve/replica_managers.py +22 -18
- sky/serve/service.py +5 -4
- sky/server/common.py +11 -4
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/server/stream_utils.py +21 -0
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/templates/kubernetes-ray.yml.j2 +19 -1
- sky/utils/common_utils.py +66 -0
- sky/utils/rich_utils.py +5 -0
- sky/utils/schemas.py +32 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/serve/controller.py
CHANGED
@@ -42,12 +42,13 @@ class SkyServeController:
|
|
42
42
|
"""
|
43
43
|
|
44
44
|
def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
|
45
|
-
|
45
|
+
service_task_yaml: str, host: str, port: int) -> None:
|
46
46
|
self._service_name = service_name
|
47
47
|
self._replica_manager: replica_managers.ReplicaManager = (
|
48
|
-
replica_managers.SkyPilotReplicaManager(
|
49
|
-
|
50
|
-
|
48
|
+
replica_managers.SkyPilotReplicaManager(
|
49
|
+
service_name=service_name,
|
50
|
+
spec=service_spec,
|
51
|
+
service_task_yaml_path=service_task_yaml))
|
51
52
|
self._autoscaler: autoscalers.Autoscaler = (
|
52
53
|
autoscalers.Autoscaler.from_spec(service_name, service_spec))
|
53
54
|
self._host = host
|
@@ -240,7 +241,9 @@ class SkyServeController:
|
|
240
241
|
# TODO(tian): Probably we should support service that will stop the VM in
|
241
242
|
# specific time period.
|
242
243
|
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
|
243
|
-
|
244
|
-
|
245
|
-
|
244
|
+
service_task_yaml: str, controller_host: str,
|
245
|
+
controller_port: int):
|
246
|
+
controller = SkyServeController(service_name, service_spec,
|
247
|
+
service_task_yaml, controller_host,
|
248
|
+
controller_port)
|
246
249
|
controller.run()
|
sky/serve/replica_managers.py
CHANGED
@@ -58,7 +58,7 @@ _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
|
|
58
58
|
# TODO(tian): Combine this with
|
59
59
|
# sky/spot/recovery_strategy.py::StrategyExecutor::launch
|
60
60
|
def launch_cluster(replica_id: int,
|
61
|
-
|
61
|
+
service_task_yaml_path: str,
|
62
62
|
cluster_name: str,
|
63
63
|
resources_override: Optional[Dict[str, Any]] = None,
|
64
64
|
retry_until_up: bool = True,
|
@@ -78,7 +78,8 @@ def launch_cluster(replica_id: int,
|
|
78
78
|
f'{cluster_name} with resources override: '
|
79
79
|
f'{resources_override}')
|
80
80
|
try:
|
81
|
-
config = common_utils.read_yaml(
|
81
|
+
config = common_utils.read_yaml(
|
82
|
+
os.path.expanduser(service_task_yaml_path))
|
82
83
|
task = sky.Task.from_yaml_config(config)
|
83
84
|
if resources_override is not None:
|
84
85
|
resources = task.resources
|
@@ -173,9 +174,9 @@ def terminate_cluster(cluster_name: str,
|
|
173
174
|
time.sleep(gap_seconds)
|
174
175
|
|
175
176
|
|
176
|
-
def _get_resources_ports(
|
177
|
+
def _get_resources_ports(service_task_yaml_path: str) -> str:
|
177
178
|
"""Get the resources ports used by the task."""
|
178
|
-
task = sky.Task.from_yaml(
|
179
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
179
180
|
# Already checked all ports are valid in sky.serve.core.up
|
180
181
|
assert task.resources, task
|
181
182
|
assert task.service is not None, task
|
@@ -183,7 +184,7 @@ def _get_resources_ports(task_yaml: str) -> str:
|
|
183
184
|
return task.service.ports
|
184
185
|
|
185
186
|
|
186
|
-
def _should_use_spot(
|
187
|
+
def _should_use_spot(service_task_yaml_path: str,
|
187
188
|
resource_override: Optional[Dict[str, Any]]) -> bool:
|
188
189
|
"""Get whether the task should use spot."""
|
189
190
|
if resource_override is not None:
|
@@ -191,7 +192,7 @@ def _should_use_spot(task_yaml: str,
|
|
191
192
|
if use_spot_override is not None:
|
192
193
|
assert isinstance(use_spot_override, bool)
|
193
194
|
return use_spot_override
|
194
|
-
task = sky.Task.from_yaml(
|
195
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
195
196
|
spot_use_resources = [
|
196
197
|
resources for resources in task.resources if resources.use_spot
|
197
198
|
]
|
@@ -634,10 +635,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
634
635
|
"""
|
635
636
|
|
636
637
|
def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
|
637
|
-
|
638
|
+
service_task_yaml_path: str) -> None:
|
638
639
|
super().__init__(service_name, spec)
|
639
|
-
self.
|
640
|
-
task = sky.Task.from_yaml(
|
640
|
+
self.service_task_yaml_path = service_task_yaml_path
|
641
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
641
642
|
self._spot_placer: Optional[spot_placer.SpotPlacer] = (
|
642
643
|
spot_placer.SpotPlacer.from_task(spec, task))
|
643
644
|
# TODO(tian): Store launch/down pid in the replica table, to make the
|
@@ -714,7 +715,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
714
715
|
self._service_name, replica_id)
|
715
716
|
log_file_name = serve_utils.generate_replica_launch_log_file_name(
|
716
717
|
self._service_name, replica_id)
|
717
|
-
use_spot = _should_use_spot(self.
|
718
|
+
use_spot = _should_use_spot(self.service_task_yaml_path,
|
719
|
+
resources_override)
|
718
720
|
retry_until_up = True
|
719
721
|
location = None
|
720
722
|
if use_spot and self._spot_placer is not None:
|
@@ -742,10 +744,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
742
744
|
launch_cluster,
|
743
745
|
log_file_name,
|
744
746
|
).run,
|
745
|
-
args=(replica_id, self.
|
747
|
+
args=(replica_id, self.service_task_yaml_path, cluster_name,
|
746
748
|
resources_override, retry_until_up),
|
747
749
|
)
|
748
|
-
replica_port = _get_resources_ports(self.
|
750
|
+
replica_port = _get_resources_ports(self.service_task_yaml_path)
|
749
751
|
|
750
752
|
info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
|
751
753
|
location, self.latest_version, resources_override)
|
@@ -1290,11 +1292,11 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1290
1292
|
logger.error(f'Invalid version: {version}, '
|
1291
1293
|
f'latest version: {self.latest_version}')
|
1292
1294
|
return
|
1293
|
-
|
1295
|
+
service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
|
1294
1296
|
self._service_name, version)
|
1295
1297
|
serve_state.add_or_update_version(self._service_name, version, spec)
|
1296
1298
|
self.latest_version = version
|
1297
|
-
self.
|
1299
|
+
self.service_task_yaml_path = service_task_yaml_path
|
1298
1300
|
self._update_mode = update_mode
|
1299
1301
|
|
1300
1302
|
# Reuse all replicas that have the same config as the new version
|
@@ -1302,7 +1304,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1302
1304
|
# the latest version. This can significantly improve the speed
|
1303
1305
|
# for updating an existing service with only config changes to the
|
1304
1306
|
# service specs, e.g. scale down the service.
|
1305
|
-
new_config = common_utils.read_yaml(
|
1307
|
+
new_config = common_utils.read_yaml(
|
1308
|
+
os.path.expanduser(service_task_yaml_path))
|
1306
1309
|
# Always create new replicas and scale down old ones when file_mounts
|
1307
1310
|
# are not empty.
|
1308
1311
|
if new_config.get('file_mounts', None) != {}:
|
@@ -1313,10 +1316,11 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1313
1316
|
for info in replica_infos:
|
1314
1317
|
if info.version < version and not info.is_terminal:
|
1315
1318
|
# Assume user does not change the yaml file on the controller.
|
1316
|
-
|
1317
|
-
|
1319
|
+
old_service_task_yaml_path = (
|
1320
|
+
serve_utils.generate_task_yaml_file_name(
|
1321
|
+
self._service_name, info.version))
|
1318
1322
|
old_config = common_utils.read_yaml(
|
1319
|
-
os.path.expanduser(
|
1323
|
+
os.path.expanduser(old_service_task_yaml_path))
|
1320
1324
|
for key in ['service']:
|
1321
1325
|
old_config.pop(key)
|
1322
1326
|
# Bump replica version if all fields except for service are
|
sky/serve/service.py
CHANGED
@@ -186,7 +186,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
186
186
|
|
187
187
|
service_dir = os.path.expanduser(
|
188
188
|
serve_utils.generate_remote_service_dir_name(service_name))
|
189
|
-
|
189
|
+
service_task_yaml = serve_utils.generate_task_yaml_file_name(
|
190
|
+
service_name, version)
|
190
191
|
|
191
192
|
if not is_recovery:
|
192
193
|
if (len(serve_state.get_services()) >=
|
@@ -218,7 +219,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
218
219
|
# don't want the new file mounts to overwrite the old one, so we
|
219
220
|
# sync to a tmp file first and then copy it to the final name
|
220
221
|
# if there is no name conflict.
|
221
|
-
shutil.copy(tmp_task_yaml,
|
222
|
+
shutil.copy(tmp_task_yaml, service_task_yaml)
|
222
223
|
|
223
224
|
controller_process = None
|
224
225
|
load_balancer_process = None
|
@@ -249,8 +250,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
249
250
|
controller_host = _get_controller_host()
|
250
251
|
controller_process = multiprocessing.Process(
|
251
252
|
target=controller.run_controller,
|
252
|
-
args=(service_name, service_spec,
|
253
|
-
controller_port))
|
253
|
+
args=(service_name, service_spec, service_task_yaml,
|
254
|
+
controller_host, controller_port))
|
254
255
|
controller_process.start()
|
255
256
|
|
256
257
|
if not is_recovery:
|
sky/server/common.py
CHANGED
@@ -159,7 +159,8 @@ def get_server_url(host: Optional[str] = None) -> str:
|
|
159
159
|
|
160
160
|
|
161
161
|
@annotations.lru_cache(scope='global')
|
162
|
-
def get_dashboard_url(server_url: str
|
162
|
+
def get_dashboard_url(server_url: str,
|
163
|
+
starting_page: Optional[str] = None) -> str:
|
163
164
|
# The server_url may include username or password with the
|
164
165
|
# format of https://username:password@example.com:8080/path
|
165
166
|
# We need to remove the username and password and only
|
@@ -172,7 +173,10 @@ def get_dashboard_url(server_url: str) -> str:
|
|
172
173
|
if parsed.path:
|
173
174
|
dashboard_url = f'{dashboard_url}{parsed.path}'
|
174
175
|
dashboard_url = dashboard_url.rstrip('/')
|
175
|
-
|
176
|
+
dashboard_url = f'{dashboard_url}/dashboard'
|
177
|
+
if starting_page:
|
178
|
+
dashboard_url = f'{dashboard_url}/{starting_page}'
|
179
|
+
return dashboard_url
|
176
180
|
|
177
181
|
|
178
182
|
@annotations.lru_cache(scope='global')
|
@@ -529,10 +533,13 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
529
533
|
api_server_status = None
|
530
534
|
try:
|
531
535
|
api_server_status = check_server_healthy()
|
536
|
+
if api_server_status == ApiServerStatus.NEEDS_AUTH:
|
537
|
+
endpoint = get_server_url()
|
538
|
+
with ux_utils.print_exception_no_traceback():
|
539
|
+
raise exceptions.ApiServerAuthenticationError(endpoint)
|
532
540
|
except exceptions.ApiServerConnectionError as exc:
|
533
541
|
endpoint = get_server_url()
|
534
|
-
if
|
535
|
-
api_server_status == ApiServerStatus.NEEDS_AUTH):
|
542
|
+
if not is_api_server_local():
|
536
543
|
with ux_utils.print_exception_no_traceback():
|
537
544
|
raise exceptions.ApiServerConnectionError(endpoint) from exc
|
538
545
|
# Lock to prevent multiple processes from starting the server at the
|
sky/server/html/token_page.html
CHANGED
@@ -100,6 +100,9 @@
|
|
100
100
|
color: #5f6368;
|
101
101
|
margin-top: 30px;
|
102
102
|
}
|
103
|
+
.local-port-info {
|
104
|
+
display: none;
|
105
|
+
}
|
103
106
|
</style>
|
104
107
|
</head>
|
105
108
|
<body>
|
@@ -114,14 +117,18 @@
|
|
114
117
|
<path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
|
115
118
|
</svg>
|
116
119
|
</div>
|
117
|
-
<h1>Sign in to SkyPilot CLI</h1>
|
120
|
+
<h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
|
121
|
+
<h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
|
118
122
|
<p class="user-identifier">USER_PLACEHOLDER</p>
|
119
|
-
|
120
|
-
<p>
|
121
|
-
<
|
122
|
-
<
|
123
|
+
<!-- display token info by default -->
|
124
|
+
<p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
|
125
|
+
<p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
|
126
|
+
<div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
|
127
|
+
<button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
|
128
|
+
<p class="footer-text no-local-port">You can close this tab after copying the token.</p>
|
123
129
|
|
124
|
-
|
130
|
+
<!-- don't display local port info unless successful -->
|
131
|
+
<p class="local-port-info">You can now close this tab.</p>
|
125
132
|
</div>
|
126
133
|
|
127
134
|
<script>
|
@@ -154,6 +161,25 @@
|
|
154
161
|
copyBtn.textContent = 'Copy Token';
|
155
162
|
}, 2000);
|
156
163
|
});
|
164
|
+
|
165
|
+
function hideTokenInfo() {
|
166
|
+
const noLocalPortElems = document.querySelectorAll('.no-local-port');
|
167
|
+
noLocalPortElems.forEach(elem => {
|
168
|
+
elem.style.display = 'none';
|
169
|
+
});
|
170
|
+
const localPortInfoElems = document.querySelectorAll('.local-port-info');
|
171
|
+
localPortInfoElems.forEach(elem => {
|
172
|
+
elem.classList.remove('local-port-info');
|
173
|
+
});
|
174
|
+
}
|
175
|
+
|
176
|
+
if (window.location.search.includes('local_port=')) {
|
177
|
+
const uri = `http://localhost:${window.location.search.split('local_port=')[1]}`;
|
178
|
+
fetch(uri, {
|
179
|
+
method: 'POST',
|
180
|
+
body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
|
181
|
+
}).then(hideTokenInfo)
|
182
|
+
}
|
157
183
|
</script>
|
158
184
|
</body>
|
159
185
|
</html>
|
sky/server/server.py
CHANGED
@@ -272,7 +272,9 @@ app.include_router(workspaces_rest.router,
|
|
272
272
|
|
273
273
|
|
274
274
|
@app.get('/token')
|
275
|
-
async def token(request: fastapi.Request
|
275
|
+
async def token(request: fastapi.Request,
|
276
|
+
local_port: Optional[int] = None) -> fastapi.responses.Response:
|
277
|
+
del local_port # local_port is used by the served js, but ignored by server
|
276
278
|
user = _get_auth_user_header(request)
|
277
279
|
|
278
280
|
token_data = {
|
sky/server/stream_utils.py
CHANGED
@@ -15,6 +15,8 @@ from sky.utils import rich_utils
|
|
15
15
|
|
16
16
|
logger = sky_logging.init_logger(__name__)
|
17
17
|
|
18
|
+
_HEARTBEAT_INTERVAL = 30
|
19
|
+
|
18
20
|
|
19
21
|
async def _yield_log_file_with_payloads_skipped(
|
20
22
|
log_file) -> AsyncGenerator[str, None]:
|
@@ -90,6 +92,8 @@ async def log_streamer(request_id: Optional[str],
|
|
90
92
|
for line_str in lines:
|
91
93
|
yield line_str
|
92
94
|
|
95
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
96
|
+
|
93
97
|
while True:
|
94
98
|
# Sleep 0 to yield control to allow other coroutines to run,
|
95
99
|
# while keeps the loop tight to make log stream responsive.
|
@@ -106,15 +110,32 @@ async def log_streamer(request_id: Optional[str],
|
|
106
110
|
break
|
107
111
|
if not follow:
|
108
112
|
break
|
113
|
+
|
114
|
+
current_time = asyncio.get_event_loop().time()
|
115
|
+
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
116
|
+
# Currently just used to keep the connection busy, refer to
|
117
|
+
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
118
|
+
# more details.
|
119
|
+
yield message_utils.encode_payload(
|
120
|
+
rich_utils.Control.HEARTBEAT.encode(''))
|
121
|
+
last_heartbeat_time = current_time
|
122
|
+
|
109
123
|
# Sleep shortly to avoid storming the DB and CPU, this has
|
110
124
|
# little impact on the responsivness here since we are waiting
|
111
125
|
# for a new line to come in.
|
112
126
|
await asyncio.sleep(0.1)
|
113
127
|
continue
|
128
|
+
|
129
|
+
# Refresh the heartbeat time, this is a trivial optimization for
|
130
|
+
# performance but it helps avoid unnecessary heartbeat strings
|
131
|
+
# being printed when the client runs in an old version.
|
132
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
114
133
|
line_str = line.decode('utf-8')
|
115
134
|
if plain_logs:
|
116
135
|
is_payload, line_str = message_utils.decode_payload(
|
117
136
|
line_str, raise_for_mismatch=False)
|
137
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
138
|
+
# sending invisible characters might be okay.
|
118
139
|
if is_payload:
|
119
140
|
continue
|
120
141
|
yield line_str
|
sky/setup_files/dependencies.py
CHANGED
@@ -118,7 +118,13 @@ extras_require: Dict[str, List[str]] = {
|
|
118
118
|
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
|
119
119
|
# parameter for stopping instances. Reference:
|
120
120
|
# https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
|
121
|
-
'gcp': [
|
121
|
+
'gcp': [
|
122
|
+
'google-api-python-client>=2.69.0',
|
123
|
+
'google-cloud-storage',
|
124
|
+
# see https://github.com/conda/conda/issues/13619
|
125
|
+
# see https://github.com/googleapis/google-api-python-client/issues/2554
|
126
|
+
'pyopenssl >= 23.2.0, <24.3.0',
|
127
|
+
],
|
122
128
|
'ibm': [
|
123
129
|
'ibm-cloud-sdk-core',
|
124
130
|
'ibm-vpc',
|
sky/skylet/constants.py
CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
89
89
|
# cluster yaml is updated.
|
90
90
|
#
|
91
91
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
92
|
-
SKYLET_VERSION = '
|
92
|
+
SKYLET_VERSION = '13'
|
93
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
94
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
95
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
sky/task.py
CHANGED
@@ -292,6 +292,8 @@ class Task:
|
|
292
292
|
self.resources: Union[List[sky.Resources],
|
293
293
|
Set[sky.Resources]] = {sky.Resources()}
|
294
294
|
self._service: Optional[service_spec.SkyServiceSpec] = None
|
295
|
+
# The priority of the managed job running this task.
|
296
|
+
self._job_priority: Optional[int] = None
|
295
297
|
# Resources that this task cannot run on.
|
296
298
|
self.blocked_resources = blocked_resources
|
297
299
|
|
@@ -629,6 +631,10 @@ class Task:
|
|
629
631
|
service = service_spec.SkyServiceSpec.from_yaml_config(service)
|
630
632
|
task.set_service(service)
|
631
633
|
|
634
|
+
job = config.pop('job', None)
|
635
|
+
if job is not None and 'priority' in job:
|
636
|
+
task.set_job_priority(job['priority'])
|
637
|
+
|
632
638
|
assert not config, f'Invalid task args: {config.keys()}'
|
633
639
|
return task
|
634
640
|
|
@@ -831,6 +837,23 @@ class Task:
|
|
831
837
|
self._service = service
|
832
838
|
return self
|
833
839
|
|
840
|
+
@property
|
841
|
+
def job_priority(self) -> Optional[int]:
|
842
|
+
"""The priority of the managed job running this task."""
|
843
|
+
return self._job_priority
|
844
|
+
|
845
|
+
def set_job_priority(self, priority: int) -> 'Task':
|
846
|
+
"""Sets the job priority for this task.
|
847
|
+
|
848
|
+
Args:
|
849
|
+
priority: an integer between 0 and 1000.
|
850
|
+
|
851
|
+
Returns:
|
852
|
+
self: The current task, with job priority set.
|
853
|
+
"""
|
854
|
+
self._job_priority = priority
|
855
|
+
return self
|
856
|
+
|
834
857
|
def set_time_estimator(self, func: Callable[['sky.Resources'],
|
835
858
|
int]) -> 'Task':
|
836
859
|
"""Sets a func mapping resources to estimated time (secs).
|
@@ -1274,6 +1297,9 @@ class Task:
|
|
1274
1297
|
if self.service is not None:
|
1275
1298
|
add_if_not_none('service', self.service.to_yaml_config())
|
1276
1299
|
|
1300
|
+
if self.job_priority is not None:
|
1301
|
+
add_if_not_none('job', {'priority': self.job_priority})
|
1302
|
+
|
1277
1303
|
add_if_not_none('num_nodes', self.num_nodes)
|
1278
1304
|
|
1279
1305
|
if self.inputs is not None:
|
@@ -66,7 +66,8 @@ run: |
|
|
66
66
|
# managed_job_codegen.set_pending() before we get here.
|
67
67
|
python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
|
68
68
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
69
|
-
--env-file {{remote_env_file_path}}
|
69
|
+
--env-file {{remote_env_file_path}} \
|
70
|
+
--priority {{priority}}
|
70
71
|
|
71
72
|
|
72
73
|
envs:
|
@@ -395,6 +395,13 @@ available_node_types:
|
|
395
395
|
# STEP 1: Run apt update, install missing packages, and set up ssh.
|
396
396
|
(
|
397
397
|
(
|
398
|
+
# For backwards compatibility, we put a marker file in the pod
|
399
|
+
# to indicate that the apt ssh setup step will write a completion
|
400
|
+
# marker file (/tmp/apt_ssh_setup_complete) to the pod.
|
401
|
+
# TODO: Remove this marker file and its usage in setup_commands
|
402
|
+
# after v0.11.0 release.
|
403
|
+
touch /tmp/apt_ssh_setup_started
|
404
|
+
|
398
405
|
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
|
399
406
|
echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
|
400
407
|
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
@@ -402,7 +409,7 @@ available_node_types:
|
|
402
409
|
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
403
410
|
|
404
411
|
# Separate packages into two groups: packages that are installed first
|
405
|
-
# so that curl, rsync and wget are available sooner to unblock the following
|
412
|
+
# so that curl, rsync, ssh and wget are available sooner to unblock the following
|
406
413
|
# conda installation and rsync.
|
407
414
|
# Also, we install fuse first to avoid confliction with fuse3.
|
408
415
|
set -e
|
@@ -494,6 +501,8 @@ available_node_types:
|
|
494
501
|
$(prefix_cmd) service ssh restart;
|
495
502
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
496
503
|
|
504
|
+
touch /tmp/apt_ssh_setup_complete
|
505
|
+
echo "=== SSH setup completed ==="
|
497
506
|
) > /tmp/${STEPS[0]}.log 2>&1 || {
|
498
507
|
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
|
499
508
|
cat /tmp/${STEPS[0]}.log
|
@@ -791,6 +800,15 @@ setup_commands:
|
|
791
800
|
{%- endfor %}
|
792
801
|
STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
|
793
802
|
start_epoch=$(date +%s);
|
803
|
+
|
804
|
+
# Wait for SSH setup to complete before proceeding
|
805
|
+
if [ -f /tmp/apt_ssh_setup_started ]; then
|
806
|
+
echo "=== Logs for asynchronous SSH setup ===";
|
807
|
+
[ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
|
808
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
809
|
+
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
810
|
+
fi
|
811
|
+
|
794
812
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
795
813
|
if [ -f /tmp/skypilot_is_nimbus ]; then
|
796
814
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
sky/utils/common_utils.py
CHANGED
@@ -324,9 +324,75 @@ def get_pretty_entrypoint_cmd() -> str:
|
|
324
324
|
# Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
|
325
325
|
# things like 'examples/app.py'.
|
326
326
|
argv[0] = basename
|
327
|
+
|
328
|
+
# Redact sensitive environment variable values
|
329
|
+
argv = _redact_env_values(argv)
|
330
|
+
|
327
331
|
return ' '.join(argv)
|
328
332
|
|
329
333
|
|
334
|
+
def _redact_env_values(argv: List[str]) -> List[str]:
|
335
|
+
"""Redact sensitive values from --env arguments.
|
336
|
+
|
337
|
+
Args:
|
338
|
+
argv: Command line arguments
|
339
|
+
|
340
|
+
Returns:
|
341
|
+
Modified argv with redacted --env values, or original argv if any error
|
342
|
+
|
343
|
+
Examples:
|
344
|
+
['sky', 'launch', '--env', 'HF_TOKEN=secret'] ->
|
345
|
+
['sky', 'launch', '--env', 'HF_TOKEN=<redacted>']
|
346
|
+
|
347
|
+
['sky', 'launch', '--env=HF_TOKEN=secret'] ->
|
348
|
+
['sky', 'launch', '--env=HF_TOKEN=<redacted>']
|
349
|
+
|
350
|
+
['sky', 'launch', '--env', 'HF_TOKEN'] ->
|
351
|
+
['sky', 'launch', '--env', 'HF_TOKEN'] (no change)
|
352
|
+
"""
|
353
|
+
try:
|
354
|
+
if not argv:
|
355
|
+
return argv or []
|
356
|
+
|
357
|
+
result = []
|
358
|
+
i = 0
|
359
|
+
|
360
|
+
while i < len(argv):
|
361
|
+
arg = argv[i]
|
362
|
+
|
363
|
+
# Ensure arg is a string
|
364
|
+
if not isinstance(arg, str):
|
365
|
+
result.append(arg)
|
366
|
+
i += 1
|
367
|
+
continue
|
368
|
+
|
369
|
+
if arg == '--env' and i + 1 < len(argv):
|
370
|
+
result.append(arg)
|
371
|
+
next_arg = argv[i + 1]
|
372
|
+
# Ensure next_arg is a string and handle redaction safely
|
373
|
+
if isinstance(next_arg, str):
|
374
|
+
redacted = re.sub(r'^([^=]+)=.*', r'\1=<redacted>',
|
375
|
+
next_arg)
|
376
|
+
result.append(redacted)
|
377
|
+
else:
|
378
|
+
result.append(next_arg)
|
379
|
+
i += 2
|
380
|
+
elif arg.startswith('--env='):
|
381
|
+
# Redact only if there's a value after the key
|
382
|
+
redacted = re.sub(r'^(--env=[^=]+)=.*', r'\1=<redacted>', arg)
|
383
|
+
result.append(redacted)
|
384
|
+
i += 1
|
385
|
+
else:
|
386
|
+
result.append(arg)
|
387
|
+
i += 1
|
388
|
+
|
389
|
+
return result
|
390
|
+
except Exception: # pylint: disable=broad-except
|
391
|
+
# If anything goes wrong with redaction, return original argv
|
392
|
+
# This ensures the command can still execute
|
393
|
+
return argv or []
|
394
|
+
|
395
|
+
|
330
396
|
def user_and_hostname_hash() -> str:
|
331
397
|
"""Returns a string containing <user>-<hostname hash last 4 chars>.
|
332
398
|
|
sky/utils/rich_utils.py
CHANGED
@@ -57,6 +57,7 @@ class Control(enum.Enum):
|
|
57
57
|
STOP = 'rich_stop'
|
58
58
|
EXIT = 'rich_exit'
|
59
59
|
UPDATE = 'rich_update'
|
60
|
+
HEARTBEAT = 'heartbeat'
|
60
61
|
|
61
62
|
def encode(self, msg: str) -> str:
|
62
63
|
return f'<{self.value}>{msg}</{self.value}>'
|
@@ -385,6 +386,10 @@ def decode_rich_status(
|
|
385
386
|
decoding_status.__exit__(None, None, None)
|
386
387
|
elif control == Control.START:
|
387
388
|
decoding_status.start()
|
389
|
+
elif control == Control.HEARTBEAT:
|
390
|
+
# Heartbeat is not displayed to the user, so we do not
|
391
|
+
# need to update the status.
|
392
|
+
pass
|
388
393
|
finally:
|
389
394
|
if decoding_status is not None:
|
390
395
|
decoding_status.__exit__(None, None, None)
|
sky/utils/schemas.py
CHANGED
@@ -646,6 +646,18 @@ def get_task_schema():
|
|
646
646
|
'service': {
|
647
647
|
'type': 'object',
|
648
648
|
},
|
649
|
+
'job': {
|
650
|
+
'type': 'object',
|
651
|
+
'required': [],
|
652
|
+
'additionalProperties': False,
|
653
|
+
'properties': {
|
654
|
+
'priority': {
|
655
|
+
'type': 'integer',
|
656
|
+
'minimum': 0,
|
657
|
+
'maximum': 1000,
|
658
|
+
},
|
659
|
+
},
|
660
|
+
},
|
649
661
|
'setup': {
|
650
662
|
'type': 'string',
|
651
663
|
},
|
@@ -1096,6 +1108,9 @@ def get_config_schema():
|
|
1096
1108
|
'required': [],
|
1097
1109
|
'properties': {
|
1098
1110
|
**_NETWORK_CONFIG_SCHEMA,
|
1111
|
+
'tenant_id': {
|
1112
|
+
'type': 'string',
|
1113
|
+
},
|
1099
1114
|
},
|
1100
1115
|
'additionalProperties': {
|
1101
1116
|
'type': 'object',
|
@@ -1200,7 +1215,7 @@ def get_config_schema():
|
|
1200
1215
|
# all clouds except gcp, kubernetes, ssh
|
1201
1216
|
not_supported_clouds = [
|
1202
1217
|
cloud for cloud in allowed_workspace_cloud_names
|
1203
|
-
if cloud.lower() not in ['gcp', 'kubernetes', 'ssh']
|
1218
|
+
if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
|
1204
1219
|
]
|
1205
1220
|
not_supported_cloud_regex = '|'.join(not_supported_clouds)
|
1206
1221
|
workspaces_schema = {
|
@@ -1269,6 +1284,22 @@ def get_config_schema():
|
|
1269
1284
|
},
|
1270
1285
|
'additionalProperties': False,
|
1271
1286
|
},
|
1287
|
+
'nebius': {
|
1288
|
+
'type': 'object',
|
1289
|
+
'required': [],
|
1290
|
+
'properties': {
|
1291
|
+
'credentials_file_path': {
|
1292
|
+
'type': 'string',
|
1293
|
+
},
|
1294
|
+
'tenant_id': {
|
1295
|
+
'type': 'string',
|
1296
|
+
},
|
1297
|
+
'disabled': {
|
1298
|
+
'type': 'boolean'
|
1299
|
+
},
|
1300
|
+
},
|
1301
|
+
'additionalProperties': False,
|
1302
|
+
},
|
1272
1303
|
},
|
1273
1304
|
},
|
1274
1305
|
}
|