skypilot-nightly 1.0.0.dev20250607__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +3 -0
- sky/authentication.py +1 -7
- sky/backends/backend_utils.py +18 -2
- sky/backends/cloud_vm_ray_backend.py +9 -20
- sky/check.py +4 -3
- sky/cli.py +6 -9
- sky/client/cli.py +6 -9
- sky/client/sdk.py +49 -4
- sky/clouds/kubernetes.py +15 -24
- sky/core.py +3 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
- sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
- sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
- sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
- sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
- sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
- sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
- sky/dashboard/out/_next/static/css/8b1c8321d4c02372.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +23 -0
- sky/global_user_state.py +192 -80
- sky/jobs/client/sdk.py +29 -21
- sky/jobs/server/core.py +9 -1
- sky/jobs/server/server.py +0 -95
- sky/jobs/utils.py +2 -1
- sky/models.py +18 -0
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/utils.py +106 -7
- sky/serve/client/sdk.py +56 -45
- sky/serve/server/core.py +1 -1
- sky/server/common.py +5 -7
- sky/server/constants.py +0 -2
- sky/server/requests/executor.py +60 -22
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/process.py +69 -29
- sky/server/requests/requests.py +4 -3
- sky/server/server.py +23 -5
- sky/server/stream_utils.py +111 -55
- sky/skylet/constants.py +4 -2
- sky/skylet/job_lib.py +2 -1
- sky/skypilot_config.py +108 -25
- sky/users/model.conf +1 -1
- sky/users/permission.py +149 -32
- sky/users/rbac.py +26 -0
- sky/users/server.py +14 -13
- sky/utils/admin_policy_utils.py +9 -3
- sky/utils/common.py +6 -1
- sky/utils/common_utils.py +21 -3
- sky/utils/context.py +21 -1
- sky/utils/controller_utils.py +16 -1
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
- sky/utils/schemas.py +9 -0
- sky/workspaces/core.py +100 -8
- sky/workspaces/server.py +15 -2
- sky/workspaces/utils.py +56 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +106 -94
- sky/dashboard/out/_next/static/1qG0HTmVilJPxQdBk0fX5/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
- sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-ad1e0db3afcbd9c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/614-635a84e87800f99e.js +0 -66
- sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-c296541442d4af88.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-3a32da4b84176f6d.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-6d78a0814682d771.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-cb81dc4d27f4d009.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-18aed9b56247d074.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b919a73aecdfa78f.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-4f6b9dd9abcb33ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-fe375a56342cf609.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-3a18d0eeb5119fe4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-a1a6abeeb58c1051.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1354e28c81eeb686.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-23bfc8bf373423db.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-5800045bd04e69c2.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-e1f9c0c3ff7ac4bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-686590e0ee4b2412.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-76b07aa5da91b0df.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
- sky/dashboard/out/_next/static/css/667d941a2888ce6e.css +0 -3
- /sky/dashboard/out/_next/static/{1qG0HTmVilJPxQdBk0fX5 → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py
CHANGED
@@ -19,6 +19,7 @@ The number of the workers is determined by the system resources.
|
|
19
19
|
See the [README.md](../README.md) for detailed architecture of the executor.
|
20
20
|
"""
|
21
21
|
import asyncio
|
22
|
+
import concurrent.futures
|
22
23
|
import contextlib
|
23
24
|
import multiprocessing
|
24
25
|
import os
|
@@ -53,6 +54,7 @@ from sky.utils import context
|
|
53
54
|
from sky.utils import context_utils
|
54
55
|
from sky.utils import subprocess_utils
|
55
56
|
from sky.utils import timeline
|
57
|
+
from sky.workspaces import core as workspaces_core
|
56
58
|
|
57
59
|
if typing.TYPE_CHECKING:
|
58
60
|
import types
|
@@ -92,21 +94,21 @@ class RequestQueue:
|
|
92
94
|
else:
|
93
95
|
raise RuntimeError(f'Invalid queue backend: {backend}')
|
94
96
|
|
95
|
-
def put(self, request: Tuple[str, bool]) -> None:
|
97
|
+
def put(self, request: Tuple[str, bool, bool]) -> None:
|
96
98
|
"""Put and request to the queue.
|
97
99
|
|
98
100
|
Args:
|
99
|
-
request: A tuple of request_id and
|
101
|
+
request: A tuple of request_id, ignore_return_value, and retryable.
|
100
102
|
"""
|
101
103
|
self.queue.put(request) # type: ignore
|
102
104
|
|
103
|
-
def get(self) -> Optional[Tuple[str, bool]]:
|
105
|
+
def get(self) -> Optional[Tuple[str, bool, bool]]:
|
104
106
|
"""Get a request from the queue.
|
105
107
|
|
106
108
|
It is non-blocking if the queue is empty, and returns None.
|
107
109
|
|
108
110
|
Returns:
|
109
|
-
A tuple of request_id and
|
111
|
+
A tuple of request_id, ignore_return_value, and retryable.
|
110
112
|
"""
|
111
113
|
try:
|
112
114
|
return self.queue.get(block=False)
|
@@ -158,7 +160,7 @@ class RequestWorker:
|
|
158
160
|
if request_element is None:
|
159
161
|
time.sleep(0.1)
|
160
162
|
return
|
161
|
-
request_id, ignore_return_value = request_element
|
163
|
+
request_id, ignore_return_value, retryable = request_element
|
162
164
|
request = api_requests.get_request(request_id)
|
163
165
|
assert request is not None, f'Request with ID {request_id} is None'
|
164
166
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
@@ -170,8 +172,14 @@ class RequestWorker:
|
|
170
172
|
# multiple requests can share the same process pid, which may cause
|
171
173
|
# issues with SkyPilot core functions if they rely on the exit of
|
172
174
|
# the process, such as subprocess_daemon.py.
|
173
|
-
executor.submit_until_success(_request_execution_wrapper,
|
174
|
-
|
175
|
+
fut = executor.submit_until_success(_request_execution_wrapper,
|
176
|
+
request_id, ignore_return_value)
|
177
|
+
if retryable:
|
178
|
+
# If the task might fail and be retried, start a thread to
|
179
|
+
# monitor the future and process retry.
|
180
|
+
threading.Thread(target=self.handle_task_result,
|
181
|
+
args=(fut, request_element),
|
182
|
+
daemon=True).start()
|
175
183
|
|
176
184
|
logger.info(f'[{self}] Submitted request: {request_id}')
|
177
185
|
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
@@ -181,6 +189,16 @@ class RequestWorker:
|
|
181
189
|
f'{request_id if "request_id" in locals() else ""} '
|
182
190
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
183
191
|
|
192
|
+
def handle_task_result(self, fut: concurrent.futures.Future,
|
193
|
+
request_element: Tuple[str, bool, bool]) -> None:
|
194
|
+
try:
|
195
|
+
fut.result()
|
196
|
+
except exceptions.ExecutionRetryableError as e:
|
197
|
+
time.sleep(e.retry_wait_seconds)
|
198
|
+
# Reschedule the request.
|
199
|
+
queue = _get_queue(self.schedule_type)
|
200
|
+
queue.put(request_element)
|
201
|
+
|
184
202
|
def run(self) -> None:
|
185
203
|
# Handle the SIGTERM signal to abort the executor process gracefully.
|
186
204
|
proc_group = f'{self.schedule_type.value}'
|
@@ -229,6 +247,9 @@ def override_request_env_and_config(
|
|
229
247
|
original_env = os.environ.copy()
|
230
248
|
os.environ.update(request_body.env_vars)
|
231
249
|
# Note: may be overridden by AuthProxyMiddleware.
|
250
|
+
# TODO(zhwu): we need to make the entire request a context available to the
|
251
|
+
# entire request execution, so that we can access info like user through
|
252
|
+
# the execution.
|
232
253
|
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
233
254
|
name=request_body.env_vars[constants.USER_ENV_VAR])
|
234
255
|
global_user_state.add_or_update_user(user)
|
@@ -237,13 +258,17 @@ def override_request_env_and_config(
|
|
237
258
|
server_common.reload_for_new_request(
|
238
259
|
client_entrypoint=request_body.entrypoint,
|
239
260
|
client_command=request_body.entrypoint_command,
|
240
|
-
using_remote_api_server=request_body.using_remote_api_server
|
261
|
+
using_remote_api_server=request_body.using_remote_api_server,
|
262
|
+
user=user)
|
241
263
|
try:
|
242
264
|
logger.debug(
|
243
265
|
f'override path: {request_body.override_skypilot_config_path}')
|
244
266
|
with skypilot_config.override_skypilot_config(
|
245
267
|
request_body.override_skypilot_config,
|
246
268
|
request_body.override_skypilot_config_path):
|
269
|
+
# Rejecting requests to workspaces that the user does not have
|
270
|
+
# permission to access.
|
271
|
+
workspaces_core.reject_request_for_unauthorized_workspace(user)
|
247
272
|
yield
|
248
273
|
finally:
|
249
274
|
# We need to call the save_timeline() since atexit will not be
|
@@ -308,7 +333,9 @@ def _request_execution_wrapper(request_id: str,
|
|
308
333
|
func = request_task.entrypoint
|
309
334
|
request_body = request_task.request_body
|
310
335
|
|
311
|
-
|
336
|
+
# Append to the log file instead of overwriting it since there might be
|
337
|
+
# logs from previous retries.
|
338
|
+
with log_path.open('a', encoding='utf-8') as f:
|
312
339
|
# Store copies of the original stdout and stderr file descriptors
|
313
340
|
original_stdout, original_stderr = _redirect_output(f)
|
314
341
|
# Redirect the stdout/stderr before overriding the environment and
|
@@ -332,6 +359,17 @@ def _request_execution_wrapper(request_id: str,
|
|
332
359
|
subprocess_utils.kill_children_processes()
|
333
360
|
_restore_output(original_stdout, original_stderr)
|
334
361
|
return
|
362
|
+
except exceptions.ExecutionRetryableError as e:
|
363
|
+
logger.error(e)
|
364
|
+
logger.info(e.hint)
|
365
|
+
with api_requests.update_request(request_id) as request_task:
|
366
|
+
assert request_task is not None, request_id
|
367
|
+
# Retried request will undergo rescheduling and a new execution,
|
368
|
+
# clear the pid of the request.
|
369
|
+
request_task.pid = None
|
370
|
+
# Yield control to the scheduler for uniform handling of retries.
|
371
|
+
_restore_output(original_stdout, original_stderr)
|
372
|
+
raise
|
335
373
|
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
336
374
|
api_requests.set_request_failed(request_id, e)
|
337
375
|
_restore_output(original_stdout, original_stderr)
|
@@ -433,7 +471,7 @@ def prepare_request(
|
|
433
471
|
"""Prepare a request for execution."""
|
434
472
|
user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
|
435
473
|
if is_skypilot_system:
|
436
|
-
user_id =
|
474
|
+
user_id = constants.SKYPILOT_SYSTEM_USER_ID
|
437
475
|
global_user_state.add_or_update_user(
|
438
476
|
models.User(id=user_id, name=user_id))
|
439
477
|
request = api_requests.Request(request_id=request_id,
|
@@ -455,17 +493,17 @@ def prepare_request(
|
|
455
493
|
return request
|
456
494
|
|
457
495
|
|
458
|
-
def schedule_request(
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
496
|
+
def schedule_request(request_id: str,
|
497
|
+
request_name: str,
|
498
|
+
request_body: payloads.RequestBody,
|
499
|
+
func: Callable[P, Any],
|
500
|
+
request_cluster_name: Optional[str] = None,
|
501
|
+
ignore_return_value: bool = False,
|
502
|
+
schedule_type: api_requests.ScheduleType = (
|
503
|
+
api_requests.ScheduleType.LONG),
|
504
|
+
is_skypilot_system: bool = False,
|
505
|
+
precondition: Optional[preconditions.Precondition] = None,
|
506
|
+
retryable: bool = False) -> None:
|
469
507
|
"""Enqueue a request to the request queue.
|
470
508
|
|
471
509
|
Args:
|
@@ -490,7 +528,7 @@ def schedule_request(
|
|
490
528
|
request_cluster_name, schedule_type, is_skypilot_system)
|
491
529
|
|
492
530
|
def enqueue():
|
493
|
-
input_tuple = (request_id, ignore_return_value)
|
531
|
+
input_tuple = (request_id, ignore_return_value, retryable)
|
494
532
|
logger.info(f'Queuing request: {request_id}')
|
495
533
|
_get_queue(schedule_type).put(input_tuple)
|
496
534
|
|
sky/server/requests/payloads.py
CHANGED
@@ -79,6 +79,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
|
|
79
79
|
# server endpoint on the server side. This avoids the warning at
|
80
80
|
# server-side.
|
81
81
|
config.pop_nested(('api_server',), default_value=None)
|
82
|
+
# Remove the admin policy, as the policy has been applied on the client
|
83
|
+
# side.
|
84
|
+
config.pop_nested(('admin_policy',), default_value=None)
|
82
85
|
return config
|
83
86
|
|
84
87
|
|
sky/server/requests/process.py
CHANGED
@@ -6,6 +6,7 @@ import threading
|
|
6
6
|
import time
|
7
7
|
from typing import Callable, Dict, Optional, Tuple
|
8
8
|
|
9
|
+
from sky import exceptions
|
9
10
|
from sky.utils import atomic
|
10
11
|
from sky.utils import subprocess_utils
|
11
12
|
|
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
|
|
67
68
|
|
68
69
|
|
69
70
|
# Define the worker function outside of the class to avoid pickling self
|
70
|
-
def _disposable_worker(fn, initializer
|
71
|
-
|
71
|
+
def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
|
72
|
+
"""The worker function that is used to run the task.
|
73
|
+
|
74
|
+
Args:
|
75
|
+
fn: The function to run.
|
76
|
+
initializer: The initializer function to run before running the task.
|
77
|
+
initargs: The arguments to pass to the initializer function.
|
78
|
+
result_queue: The queue to put the result and exception into.
|
79
|
+
args: The arguments to pass to the function.
|
80
|
+
kwargs: The keyword arguments to pass to the function.
|
81
|
+
"""
|
72
82
|
try:
|
73
83
|
if initializer is not None:
|
74
84
|
initializer(*initargs)
|
75
|
-
fn(*args, **kwargs)
|
85
|
+
result = fn(*args, **kwargs)
|
86
|
+
result_queue.put(result)
|
76
87
|
except BaseException as e: # pylint: disable=broad-except
|
77
|
-
|
88
|
+
result_queue.put(e)
|
78
89
|
|
79
90
|
|
80
91
|
class DisposableExecutor:
|
@@ -98,28 +109,52 @@ class DisposableExecutor:
|
|
98
109
|
self._initializer: Optional[Callable] = initializer
|
99
110
|
self._initargs: Tuple = initargs
|
100
111
|
|
101
|
-
def _monitor_worker(self, process: multiprocessing.Process
|
112
|
+
def _monitor_worker(self, process: multiprocessing.Process,
|
113
|
+
future: concurrent.futures.Future,
|
114
|
+
result_queue: multiprocessing.Queue) -> None:
|
102
115
|
"""Monitor the worker process and cleanup when it's done."""
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
116
|
+
try:
|
117
|
+
process.join()
|
118
|
+
if not future.cancelled():
|
119
|
+
try:
|
120
|
+
# Get result from the queue if process completed
|
121
|
+
if not result_queue.empty():
|
122
|
+
result = result_queue.get(block=False)
|
123
|
+
if isinstance(result, BaseException):
|
124
|
+
future.set_exception(result)
|
125
|
+
else:
|
126
|
+
future.set_result(result)
|
127
|
+
else:
|
128
|
+
# Process ended but no result
|
129
|
+
future.set_result(None)
|
130
|
+
except (multiprocessing.TimeoutError, BrokenPipeError,
|
131
|
+
EOFError) as e:
|
132
|
+
future.set_exception(e)
|
133
|
+
finally:
|
134
|
+
if process.pid:
|
135
|
+
with self._lock:
|
136
|
+
if process.pid in self.workers:
|
137
|
+
del self.workers[process.pid]
|
138
|
+
|
139
|
+
def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
|
140
|
+
"""Submit a task for execution and return a Future."""
|
141
|
+
future: concurrent.futures.Future = concurrent.futures.Future()
|
142
|
+
|
113
143
|
if self._shutdown:
|
114
|
-
|
144
|
+
raise RuntimeError('Cannot submit task after executor is shutdown')
|
145
|
+
|
115
146
|
with self._lock:
|
116
147
|
if (self.max_workers is not None and
|
117
148
|
len(self.workers) >= self.max_workers):
|
118
|
-
|
149
|
+
raise exceptions.ExecutionPoolFullError(
|
150
|
+
'Maximum workers reached')
|
119
151
|
|
152
|
+
result_queue: multiprocessing.Queue = multiprocessing.Queue()
|
120
153
|
process = multiprocessing.Process(target=_disposable_worker,
|
121
154
|
args=(fn, self._initializer,
|
122
|
-
self._initargs,
|
155
|
+
self._initargs, result_queue,
|
156
|
+
args, kwargs))
|
157
|
+
process.daemon = True
|
123
158
|
process.start()
|
124
159
|
|
125
160
|
with self._lock:
|
@@ -128,13 +163,13 @@ class DisposableExecutor:
|
|
128
163
|
raise RuntimeError('Failed to start process')
|
129
164
|
self.workers[pid] = process
|
130
165
|
|
131
|
-
# Start monitor thread to cleanup the worker process when it's done
|
166
|
+
# Start monitor thread to cleanup the worker process when it's done
|
132
167
|
monitor_thread = threading.Thread(target=self._monitor_worker,
|
133
|
-
args=(process,),
|
168
|
+
args=(process, future, result_queue),
|
134
169
|
daemon=True)
|
135
170
|
monitor_thread.start()
|
136
171
|
|
137
|
-
return
|
172
|
+
return future
|
138
173
|
|
139
174
|
def has_idle_workers(self) -> bool:
|
140
175
|
"""Check if there are any idle workers."""
|
@@ -173,12 +208,14 @@ class BurstableExecutor:
|
|
173
208
|
self._burst_executor = DisposableExecutor(max_workers=burst_workers,
|
174
209
|
**kwargs)
|
175
210
|
|
176
|
-
def submit_until_success(self, fn, *args,
|
211
|
+
def submit_until_success(self, fn, *args,
|
212
|
+
**kwargs) -> concurrent.futures.Future:
|
177
213
|
"""Submit a task for execution until success.
|
178
214
|
|
179
215
|
Prioritizes submitting to the guaranteed pool. If no idle workers
|
180
216
|
are available in the guaranteed pool, it will submit to the burst
|
181
|
-
pool.
|
217
|
+
pool. If the burst pool is full, it will retry the whole process until
|
218
|
+
the task is submitted successfully.
|
182
219
|
TODO(aylei): this is coupled with executor.RequestWorker since we
|
183
220
|
know the worker is dedicated to request scheduling and it either
|
184
221
|
blocks on request polling or request submitting. So it is no harm
|
@@ -188,17 +225,20 @@ class BurstableExecutor:
|
|
188
225
|
|
189
226
|
while True:
|
190
227
|
if self._executor is not None and self._executor.has_idle_workers():
|
191
|
-
|
192
|
-
|
228
|
+
logger.info('Submitting to the guaranteed pool')
|
229
|
+
return self._executor.submit(fn, *args, **kwargs)
|
193
230
|
if (self._burst_executor is not None and
|
194
231
|
self._burst_executor.has_idle_workers()):
|
195
|
-
|
196
|
-
|
232
|
+
try:
|
233
|
+
fut = self._burst_executor.submit(fn, *args, **kwargs)
|
234
|
+
return fut
|
235
|
+
except exceptions.ExecutionPoolFullError:
|
236
|
+
# The burst pool is full, try the next candidate.
|
237
|
+
pass
|
197
238
|
if self._executor is not None:
|
198
239
|
# No idle workers in either pool, still queue the request
|
199
240
|
# to the guaranteed pool to keep behavior consistent.
|
200
|
-
self._executor.submit(fn, *args, **kwargs)
|
201
|
-
break
|
241
|
+
return self._executor.submit(fn, *args, **kwargs)
|
202
242
|
logger.debug('No guaranteed pool set and the burst pool is full, '
|
203
243
|
'retry later.')
|
204
244
|
time.sleep(0.1)
|
sky/server/requests/requests.py
CHANGED
@@ -11,7 +11,7 @@ import signal
|
|
11
11
|
import sqlite3
|
12
12
|
import time
|
13
13
|
import traceback
|
14
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple
|
14
|
+
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
|
15
15
|
|
16
16
|
import colorama
|
17
17
|
import filelock
|
@@ -204,7 +204,8 @@ class Request:
|
|
204
204
|
"""
|
205
205
|
assert isinstance(self.request_body,
|
206
206
|
payloads.RequestBody), (self.name, self.request_body)
|
207
|
-
|
207
|
+
user = global_user_state.get_user(self.user_id)
|
208
|
+
user_name = user.name if user is not None else None
|
208
209
|
return RequestPayload(
|
209
210
|
request_id=self.request_id,
|
210
211
|
name=self.name,
|
@@ -464,7 +465,7 @@ def request_lock_path(request_id: str) -> str:
|
|
464
465
|
|
465
466
|
@contextlib.contextmanager
|
466
467
|
@init_db
|
467
|
-
def update_request(request_id: str):
|
468
|
+
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
468
469
|
"""Get a SkyPilot API request."""
|
469
470
|
request = _get_request_no_lock(request_id)
|
470
471
|
yield request
|
sky/server/server.py
CHANGED
@@ -49,6 +49,7 @@ from sky.server.requests import preconditions
|
|
49
49
|
from sky.server.requests import requests as requests_lib
|
50
50
|
from sky.skylet import constants
|
51
51
|
from sky.usage import usage_lib
|
52
|
+
from sky.users import permission
|
52
53
|
from sky.users import server as users_rest
|
53
54
|
from sky.utils import admin_policy_utils
|
54
55
|
from sky.utils import common as common_lib
|
@@ -105,17 +106,21 @@ class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
105
106
|
"""Middleware to handle RBAC."""
|
106
107
|
|
107
108
|
async def dispatch(self, request: fastapi.Request, call_next):
|
108
|
-
|
109
|
+
# TODO(hailong): should have a list of paths
|
110
|
+
# that are not checked for RBAC
|
111
|
+
if (request.url.path.startswith('/dashboard/') or
|
112
|
+
request.url.path.startswith('/api/')):
|
109
113
|
return await call_next(request)
|
110
114
|
|
111
115
|
auth_user = _get_auth_user_header(request)
|
112
116
|
if auth_user is None:
|
113
117
|
return await call_next(request)
|
114
118
|
|
115
|
-
permission_service =
|
119
|
+
permission_service = permission.permission_service
|
116
120
|
# Check the role permission
|
117
|
-
if permission_service.
|
118
|
-
|
121
|
+
if permission_service.check_endpoint_permission(auth_user.id,
|
122
|
+
request.url.path,
|
123
|
+
request.method):
|
119
124
|
return fastapi.responses.JSONResponse(
|
120
125
|
status_code=403, content={'detail': 'Forbidden'})
|
121
126
|
|
@@ -154,9 +159,15 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
154
159
|
if auth_user is not None:
|
155
160
|
newly_added = global_user_state.add_or_update_user(auth_user)
|
156
161
|
if newly_added:
|
157
|
-
|
162
|
+
permission.permission_service.add_user_if_not_exists(
|
158
163
|
auth_user.id)
|
159
164
|
|
165
|
+
# Store user info in request.state for access by GET endpoints
|
166
|
+
if auth_user is not None:
|
167
|
+
request.state.auth_user = auth_user
|
168
|
+
else:
|
169
|
+
request.state.auth_user = None
|
170
|
+
|
160
171
|
body = await request.body()
|
161
172
|
if auth_user and body:
|
162
173
|
try:
|
@@ -177,6 +188,12 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
177
188
|
f'"env_vars" in request body is not a dictionary '
|
178
189
|
f'for request {request.state.request_id}. '
|
179
190
|
'Skipping user info injection into body.')
|
191
|
+
else:
|
192
|
+
original_json['env_vars'] = {}
|
193
|
+
original_json['env_vars'][
|
194
|
+
constants.USER_ID_ENV_VAR] = auth_user.id
|
195
|
+
original_json['env_vars'][
|
196
|
+
constants.USER_ENV_VAR] = auth_user.name
|
180
197
|
request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
|
181
198
|
return await call_next(request)
|
182
199
|
|
@@ -676,6 +693,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
676
693
|
func=execution.launch,
|
677
694
|
schedule_type=requests_lib.ScheduleType.LONG,
|
678
695
|
request_cluster_name=launch_body.cluster_name,
|
696
|
+
retryable=launch_body.retry_until_up,
|
679
697
|
)
|
680
698
|
|
681
699
|
|
sky/server/stream_utils.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
import asyncio
|
4
4
|
import collections
|
5
5
|
import pathlib
|
6
|
-
from typing import AsyncGenerator, Deque, Optional
|
6
|
+
from typing import AsyncGenerator, Deque, List, Optional
|
7
7
|
|
8
8
|
import aiofiles
|
9
9
|
import fastapi
|
@@ -15,6 +15,12 @@ from sky.utils import rich_utils
|
|
15
15
|
|
16
16
|
logger = sky_logging.init_logger(__name__)
|
17
17
|
|
18
|
+
# When streaming log lines, buffer the lines in memory and flush them in chunks
|
19
|
+
# to improve log tailing throughput. Buffer size is the max size in bytes of each
|
20
|
+
# chunk; buffer timeout is the time threshold for flushing the buffer to ensure
|
21
|
+
# responsiveness.
|
22
|
+
_BUFFER_SIZE = 8 * 1024 # 8KB
|
23
|
+
_BUFFER_TIMEOUT = 0.02 # 20ms
|
18
24
|
_HEARTBEAT_INTERVAL = 30
|
19
25
|
|
20
26
|
|
@@ -36,7 +42,16 @@ async def log_streamer(request_id: Optional[str],
|
|
36
42
|
plain_logs: bool = False,
|
37
43
|
tail: Optional[int] = None,
|
38
44
|
follow: bool = True) -> AsyncGenerator[str, None]:
|
39
|
-
"""Streams the logs of a request.
|
45
|
+
"""Streams the logs of a request.
|
46
|
+
|
47
|
+
Args:
|
48
|
+
request_id: The request ID to check whether the log tailing process
|
49
|
+
should be stopped.
|
50
|
+
log_path: The path to the log file.
|
51
|
+
plain_logs: Whether to show plain logs.
|
52
|
+
tail: The number of lines to tail. If None, tail the whole file.
|
53
|
+
follow: Whether to follow the log file.
|
54
|
+
"""
|
40
55
|
|
41
56
|
if request_id is not None:
|
42
57
|
status_msg = rich_utils.EncodedStatusMessage(
|
@@ -80,65 +95,106 @@ async def log_streamer(request_id: Optional[str],
|
|
80
95
|
if show_request_waiting_spinner:
|
81
96
|
yield status_msg.stop()
|
82
97
|
|
83
|
-
# Find last n lines of the log file. Do not read the whole file into memory.
|
84
98
|
async with aiofiles.open(log_path, 'rb') as f:
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
99
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
100
|
+
follow):
|
101
|
+
yield chunk
|
102
|
+
|
103
|
+
|
104
|
+
async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
105
|
+
request_id: Optional[str] = None,
|
106
|
+
plain_logs: bool = False,
|
107
|
+
tail: Optional[int] = None,
|
108
|
+
follow: bool = True) -> AsyncGenerator[str, None]:
|
109
|
+
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
110
|
+
|
111
|
+
if tail is not None:
|
112
|
+
# Find last n lines of the log file. Do not read the whole file into
|
113
|
+
# memory.
|
114
|
+
# TODO(zhwu): this will include the control lines for rich status,
|
115
|
+
# which may not lead to exact tail lines when showing on the client
|
116
|
+
# side.
|
117
|
+
lines: Deque[str] = collections.deque(maxlen=tail)
|
118
|
+
async for line_str in _yield_log_file_with_payloads_skipped(f):
|
119
|
+
lines.append(line_str)
|
120
|
+
for line_str in lines:
|
121
|
+
yield line_str
|
96
122
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
123
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
124
|
+
|
125
|
+
# Buffer the lines in memory and flush them in chunks to improve log
|
126
|
+
# tailing throughput.
|
127
|
+
buffer: List[str] = []
|
128
|
+
buffer_bytes = 0
|
129
|
+
last_flush_time = asyncio.get_event_loop().time()
|
130
|
+
|
131
|
+
async def flush_buffer() -> AsyncGenerator[str, None]:
|
132
|
+
nonlocal buffer, buffer_bytes, last_flush_time
|
133
|
+
if buffer:
|
134
|
+
yield ''.join(buffer)
|
135
|
+
buffer.clear()
|
136
|
+
buffer_bytes = 0
|
137
|
+
last_flush_time = asyncio.get_event_loop().time()
|
138
|
+
|
139
|
+
while True:
|
140
|
+
# Sleep 0 to yield control to allow other coroutines to run,
|
141
|
+
# while keeping the loop tight to make the log stream responsive.
|
142
|
+
await asyncio.sleep(0)
|
143
|
+
current_time = asyncio.get_event_loop().time()
|
144
|
+
# Flush the buffer when it is not empty and the buffer is full or the
|
145
|
+
# flush timeout is reached.
|
146
|
+
if buffer and (buffer_bytes >= _BUFFER_SIZE or
|
147
|
+
(current_time - last_flush_time) >= _BUFFER_TIMEOUT):
|
148
|
+
async for chunk in flush_buffer():
|
149
|
+
yield chunk
|
150
|
+
|
151
|
+
line: Optional[bytes] = await f.readline()
|
152
|
+
if not line:
|
153
|
+
if request_id is not None:
|
154
|
+
request_task = requests_lib.get_request(request_id)
|
155
|
+
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
156
|
+
if (request_task.status ==
|
157
|
+
requests_lib.RequestStatus.CANCELLED):
|
158
|
+
buffer.append(
|
159
|
+
f'{request_task.name!r} request {request_id}'
|
160
|
+
' cancelled\n')
|
112
161
|
break
|
162
|
+
if not follow:
|
163
|
+
break
|
164
|
+
|
165
|
+
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
166
|
+
# Currently just used to keep the connection busy, refer to
|
167
|
+
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
168
|
+
# more details.
|
169
|
+
buffer.append(
|
170
|
+
message_utils.encode_payload(
|
171
|
+
rich_utils.Control.HEARTBEAT.encode('')))
|
172
|
+
last_heartbeat_time = current_time
|
173
|
+
|
174
|
+
# Sleep shortly to avoid storming the DB and CPU, this has
|
175
|
+
# little impact on the responsiveness here since we are waiting
|
176
|
+
# for a new line to come in.
|
177
|
+
await asyncio.sleep(0.1)
|
178
|
+
continue
|
113
179
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
# for a new line to come in.
|
126
|
-
await asyncio.sleep(0.1)
|
180
|
+
# Refresh the heartbeat time, this is a trivial optimization for
|
181
|
+
# performance but it helps avoid unnecessary heartbeat strings
|
182
|
+
# being printed when the client runs in an old version.
|
183
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
184
|
+
line_str = line.decode('utf-8')
|
185
|
+
if plain_logs:
|
186
|
+
is_payload, line_str = message_utils.decode_payload(
|
187
|
+
line_str, raise_for_mismatch=False)
|
188
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
189
|
+
# sending invisible characters might be okay.
|
190
|
+
if is_payload:
|
127
191
|
continue
|
192
|
+
buffer.append(line_str)
|
193
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
128
194
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
last_heartbeat_time = asyncio.get_event_loop().time()
|
133
|
-
line_str = line.decode('utf-8')
|
134
|
-
if plain_logs:
|
135
|
-
is_payload, line_str = message_utils.decode_payload(
|
136
|
-
line_str, raise_for_mismatch=False)
|
137
|
-
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
138
|
-
# sending invisible characters might be okay.
|
139
|
-
if is_payload:
|
140
|
-
continue
|
141
|
-
yield line_str
|
195
|
+
# Flush remaining lines in the buffer.
|
196
|
+
async for chunk in flush_buffer():
|
197
|
+
yield chunk
|
142
198
|
|
143
199
|
|
144
200
|
def stream_response(
|