skypilot-nightly 1.0.0.dev20250607__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +3 -0
  3. sky/authentication.py +1 -7
  4. sky/backends/backend_utils.py +18 -2
  5. sky/backends/cloud_vm_ray_backend.py +9 -20
  6. sky/check.py +4 -3
  7. sky/cli.py +6 -9
  8. sky/client/cli.py +6 -9
  9. sky/client/sdk.py +49 -4
  10. sky/clouds/kubernetes.py +15 -24
  11. sky/core.py +3 -2
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
  21. sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
  23. sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
  24. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
  28. sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
  29. sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
  34. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
  37. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
  44. sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
  50. sky/dashboard/out/_next/static/css/8b1c8321d4c02372.css +3 -0
  51. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  52. sky/dashboard/out/clusters/[cluster].html +1 -1
  53. sky/dashboard/out/clusters.html +1 -1
  54. sky/dashboard/out/config.html +1 -1
  55. sky/dashboard/out/index.html +1 -1
  56. sky/dashboard/out/infra/[context].html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/workspace/new.html +1 -1
  62. sky/dashboard/out/workspaces/[name].html +1 -1
  63. sky/dashboard/out/workspaces.html +1 -1
  64. sky/exceptions.py +23 -0
  65. sky/global_user_state.py +192 -80
  66. sky/jobs/client/sdk.py +29 -21
  67. sky/jobs/server/core.py +9 -1
  68. sky/jobs/server/server.py +0 -95
  69. sky/jobs/utils.py +2 -1
  70. sky/models.py +18 -0
  71. sky/provision/kubernetes/constants.py +9 -0
  72. sky/provision/kubernetes/utils.py +106 -7
  73. sky/serve/client/sdk.py +56 -45
  74. sky/serve/server/core.py +1 -1
  75. sky/server/common.py +5 -7
  76. sky/server/constants.py +0 -2
  77. sky/server/requests/executor.py +60 -22
  78. sky/server/requests/payloads.py +3 -0
  79. sky/server/requests/process.py +69 -29
  80. sky/server/requests/requests.py +4 -3
  81. sky/server/server.py +23 -5
  82. sky/server/stream_utils.py +111 -55
  83. sky/skylet/constants.py +4 -2
  84. sky/skylet/job_lib.py +2 -1
  85. sky/skypilot_config.py +108 -25
  86. sky/users/model.conf +1 -1
  87. sky/users/permission.py +149 -32
  88. sky/users/rbac.py +26 -0
  89. sky/users/server.py +14 -13
  90. sky/utils/admin_policy_utils.py +9 -3
  91. sky/utils/common.py +6 -1
  92. sky/utils/common_utils.py +21 -3
  93. sky/utils/context.py +21 -1
  94. sky/utils/controller_utils.py +16 -1
  95. sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
  96. sky/utils/schemas.py +9 -0
  97. sky/workspaces/core.py +100 -8
  98. sky/workspaces/server.py +15 -2
  99. sky/workspaces/utils.py +56 -0
  100. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +106 -94
  102. sky/dashboard/out/_next/static/1qG0HTmVilJPxQdBk0fX5/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
  104. sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
  106. sky/dashboard/out/_next/static/chunks/470-ad1e0db3afcbd9c9.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/614-635a84e87800f99e.js +0 -66
  108. sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
  109. sky/dashboard/out/_next/static/chunks/843-c296541442d4af88.js +0 -11
  110. sky/dashboard/out/_next/static/chunks/856-3a32da4b84176f6d.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/973-6d78a0814682d771.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/pages/_app-cb81dc4d27f4d009.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-18aed9b56247d074.js +0 -6
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b919a73aecdfa78f.js +0 -6
  116. sky/dashboard/out/_next/static/chunks/pages/clusters-4f6b9dd9abcb33ad.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/pages/config-fe375a56342cf609.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-3a18d0eeb5119fe4.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/pages/infra-a1a6abeeb58c1051.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1354e28c81eeb686.js +0 -16
  121. sky/dashboard/out/_next/static/chunks/pages/jobs-23bfc8bf373423db.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/users-5800045bd04e69c2.js +0 -16
  123. sky/dashboard/out/_next/static/chunks/pages/workspace/new-e1f9c0c3ff7ac4bd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-686590e0ee4b2412.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/workspaces-76b07aa5da91b0df.js +0 -1
  126. sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
  127. sky/dashboard/out/_next/static/css/667d941a2888ce6e.css +0 -3
  128. /sky/dashboard/out/_next/static/{1qG0HTmVilJPxQdBk0fX5 → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
  129. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
  130. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
  131. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
  132. {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ The number of the workers is determined by the system resources.
19
19
  See the [README.md](../README.md) for detailed architecture of the executor.
20
20
  """
21
21
  import asyncio
22
+ import concurrent.futures
22
23
  import contextlib
23
24
  import multiprocessing
24
25
  import os
@@ -53,6 +54,7 @@ from sky.utils import context
53
54
  from sky.utils import context_utils
54
55
  from sky.utils import subprocess_utils
55
56
  from sky.utils import timeline
57
+ from sky.workspaces import core as workspaces_core
56
58
 
57
59
  if typing.TYPE_CHECKING:
58
60
  import types
@@ -92,21 +94,21 @@ class RequestQueue:
92
94
  else:
93
95
  raise RuntimeError(f'Invalid queue backend: {backend}')
94
96
 
95
- def put(self, request: Tuple[str, bool]) -> None:
97
+ def put(self, request: Tuple[str, bool, bool]) -> None:
96
98
  """Put and request to the queue.
97
99
 
98
100
  Args:
99
- request: A tuple of request_id and ignore_return_value.
101
+ request: A tuple of request_id, ignore_return_value, and retryable.
100
102
  """
101
103
  self.queue.put(request) # type: ignore
102
104
 
103
- def get(self) -> Optional[Tuple[str, bool]]:
105
+ def get(self) -> Optional[Tuple[str, bool, bool]]:
104
106
  """Get a request from the queue.
105
107
 
106
108
  It is non-blocking if the queue is empty, and returns None.
107
109
 
108
110
  Returns:
109
- A tuple of request_id and ignore_return_value.
111
+ A tuple of request_id, ignore_return_value, and retryable.
110
112
  """
111
113
  try:
112
114
  return self.queue.get(block=False)
@@ -158,7 +160,7 @@ class RequestWorker:
158
160
  if request_element is None:
159
161
  time.sleep(0.1)
160
162
  return
161
- request_id, ignore_return_value = request_element
163
+ request_id, ignore_return_value, retryable = request_element
162
164
  request = api_requests.get_request(request_id)
163
165
  assert request is not None, f'Request with ID {request_id} is None'
164
166
  if request.status == api_requests.RequestStatus.CANCELLED:
@@ -170,8 +172,14 @@ class RequestWorker:
170
172
  # multiple requests can share the same process pid, which may cause
171
173
  # issues with SkyPilot core functions if they rely on the exit of
172
174
  # the process, such as subprocess_daemon.py.
173
- executor.submit_until_success(_request_execution_wrapper,
174
- request_id, ignore_return_value)
175
+ fut = executor.submit_until_success(_request_execution_wrapper,
176
+ request_id, ignore_return_value)
177
+ if retryable:
178
+ # If the task might fail and be retried, start a thread to
179
+ # monitor the future and process retry.
180
+ threading.Thread(target=self.handle_task_result,
181
+ args=(fut, request_element),
182
+ daemon=True).start()
175
183
 
176
184
  logger.info(f'[{self}] Submitted request: {request_id}')
177
185
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
@@ -181,6 +189,16 @@ class RequestWorker:
181
189
  f'{request_id if "request_id" in locals() else ""} '
182
190
  f'{common_utils.format_exception(e, use_bracket=True)}')
183
191
 
192
+ def handle_task_result(self, fut: concurrent.futures.Future,
193
+ request_element: Tuple[str, bool, bool]) -> None:
194
+ try:
195
+ fut.result()
196
+ except exceptions.ExecutionRetryableError as e:
197
+ time.sleep(e.retry_wait_seconds)
198
+ # Reschedule the request.
199
+ queue = _get_queue(self.schedule_type)
200
+ queue.put(request_element)
201
+
184
202
  def run(self) -> None:
185
203
  # Handle the SIGTERM signal to abort the executor process gracefully.
186
204
  proc_group = f'{self.schedule_type.value}'
@@ -229,6 +247,9 @@ def override_request_env_and_config(
229
247
  original_env = os.environ.copy()
230
248
  os.environ.update(request_body.env_vars)
231
249
  # Note: may be overridden by AuthProxyMiddleware.
250
+ # TODO(zhwu): we need to make the entire request a context available to the
251
+ # entire request execution, so that we can access info like user through
252
+ # the execution.
232
253
  user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
233
254
  name=request_body.env_vars[constants.USER_ENV_VAR])
234
255
  global_user_state.add_or_update_user(user)
@@ -237,13 +258,17 @@ def override_request_env_and_config(
237
258
  server_common.reload_for_new_request(
238
259
  client_entrypoint=request_body.entrypoint,
239
260
  client_command=request_body.entrypoint_command,
240
- using_remote_api_server=request_body.using_remote_api_server)
261
+ using_remote_api_server=request_body.using_remote_api_server,
262
+ user=user)
241
263
  try:
242
264
  logger.debug(
243
265
  f'override path: {request_body.override_skypilot_config_path}')
244
266
  with skypilot_config.override_skypilot_config(
245
267
  request_body.override_skypilot_config,
246
268
  request_body.override_skypilot_config_path):
269
+ # Rejecting requests to workspaces that the user does not have
270
+ # permission to access.
271
+ workspaces_core.reject_request_for_unauthorized_workspace(user)
247
272
  yield
248
273
  finally:
249
274
  # We need to call the save_timeline() since atexit will not be
@@ -308,7 +333,9 @@ def _request_execution_wrapper(request_id: str,
308
333
  func = request_task.entrypoint
309
334
  request_body = request_task.request_body
310
335
 
311
- with log_path.open('w', encoding='utf-8') as f:
336
+ # Append to the log file instead of overwriting it since there might be
337
+ # logs from previous retries.
338
+ with log_path.open('a', encoding='utf-8') as f:
312
339
  # Store copies of the original stdout and stderr file descriptors
313
340
  original_stdout, original_stderr = _redirect_output(f)
314
341
  # Redirect the stdout/stderr before overriding the environment and
@@ -332,6 +359,17 @@ def _request_execution_wrapper(request_id: str,
332
359
  subprocess_utils.kill_children_processes()
333
360
  _restore_output(original_stdout, original_stderr)
334
361
  return
362
+ except exceptions.ExecutionRetryableError as e:
363
+ logger.error(e)
364
+ logger.info(e.hint)
365
+ with api_requests.update_request(request_id) as request_task:
366
+ assert request_task is not None, request_id
367
+ # Retried request will undergo rescheduling and a new execution,
368
+ # clear the pid of the request.
369
+ request_task.pid = None
370
+ # Yield control to the scheduler for uniform handling of retries.
371
+ _restore_output(original_stdout, original_stderr)
372
+ raise
335
373
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
336
374
  api_requests.set_request_failed(request_id, e)
337
375
  _restore_output(original_stdout, original_stderr)
@@ -433,7 +471,7 @@ def prepare_request(
433
471
  """Prepare a request for execution."""
434
472
  user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
435
473
  if is_skypilot_system:
436
- user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
474
+ user_id = constants.SKYPILOT_SYSTEM_USER_ID
437
475
  global_user_state.add_or_update_user(
438
476
  models.User(id=user_id, name=user_id))
439
477
  request = api_requests.Request(request_id=request_id,
@@ -455,17 +493,17 @@ def prepare_request(
455
493
  return request
456
494
 
457
495
 
458
- def schedule_request(
459
- request_id: str,
460
- request_name: str,
461
- request_body: payloads.RequestBody,
462
- func: Callable[P, Any],
463
- request_cluster_name: Optional[str] = None,
464
- ignore_return_value: bool = False,
465
- schedule_type: api_requests.ScheduleType = (
466
- api_requests.ScheduleType.LONG),
467
- is_skypilot_system: bool = False,
468
- precondition: Optional[preconditions.Precondition] = None) -> None:
496
+ def schedule_request(request_id: str,
497
+ request_name: str,
498
+ request_body: payloads.RequestBody,
499
+ func: Callable[P, Any],
500
+ request_cluster_name: Optional[str] = None,
501
+ ignore_return_value: bool = False,
502
+ schedule_type: api_requests.ScheduleType = (
503
+ api_requests.ScheduleType.LONG),
504
+ is_skypilot_system: bool = False,
505
+ precondition: Optional[preconditions.Precondition] = None,
506
+ retryable: bool = False) -> None:
469
507
  """Enqueue a request to the request queue.
470
508
 
471
509
  Args:
@@ -490,7 +528,7 @@ def schedule_request(
490
528
  request_cluster_name, schedule_type, is_skypilot_system)
491
529
 
492
530
  def enqueue():
493
- input_tuple = (request_id, ignore_return_value)
531
+ input_tuple = (request_id, ignore_return_value, retryable)
494
532
  logger.info(f'Queuing request: {request_id}')
495
533
  _get_queue(schedule_type).put(input_tuple)
496
534
 
@@ -79,6 +79,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
79
79
  # server endpoint on the server side. This avoids the warning at
80
80
  # server-side.
81
81
  config.pop_nested(('api_server',), default_value=None)
82
+ # Remove the admin policy, as the policy has been applied on the client
83
+ # side.
84
+ config.pop_nested(('admin_policy',), default_value=None)
82
85
  return config
83
86
 
84
87
 
@@ -6,6 +6,7 @@ import threading
6
6
  import time
7
7
  from typing import Callable, Dict, Optional, Tuple
8
8
 
9
+ from sky import exceptions
9
10
  from sky.utils import atomic
10
11
  from sky.utils import subprocess_utils
11
12
 
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
67
68
 
68
69
 
69
70
  # Define the worker function outside of the class to avoid pickling self
70
- def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
71
- args, kwargs):
71
+ def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
72
+ """The worker function that is used to run the task.
73
+
74
+ Args:
75
+ fn: The function to run.
76
+ initializer: The initializer function to run before running the task.
77
+ initargs: The arguments to pass to the initializer function.
78
+ result_queue: The queue to put the result and exception into.
79
+ args: The arguments to pass to the function.
80
+ kwargs: The keyword arguments to pass to the function.
81
+ """
72
82
  try:
73
83
  if initializer is not None:
74
84
  initializer(*initargs)
75
- fn(*args, **kwargs)
85
+ result = fn(*args, **kwargs)
86
+ result_queue.put(result)
76
87
  except BaseException as e: # pylint: disable=broad-except
77
- return e
88
+ result_queue.put(e)
78
89
 
79
90
 
80
91
  class DisposableExecutor:
@@ -98,28 +109,52 @@ class DisposableExecutor:
98
109
  self._initializer: Optional[Callable] = initializer
99
110
  self._initargs: Tuple = initargs
100
111
 
101
- def _monitor_worker(self, process: multiprocessing.Process) -> None:
112
+ def _monitor_worker(self, process: multiprocessing.Process,
113
+ future: concurrent.futures.Future,
114
+ result_queue: multiprocessing.Queue) -> None:
102
115
  """Monitor the worker process and cleanup when it's done."""
103
- process.join()
104
- if process.pid:
105
- with self._lock:
106
- if process.pid in self.workers:
107
- del self.workers[process.pid]
108
-
109
- # Submit is not compatible with ProcessPoolExecutor because we does not
110
- # bother to return a Future. Can be improved if needed.
111
- def submit(self, fn, *args, **kwargs) -> bool:
112
- """Submit a task for execution."""
116
+ try:
117
+ process.join()
118
+ if not future.cancelled():
119
+ try:
120
+ # Get result from the queue if process completed
121
+ if not result_queue.empty():
122
+ result = result_queue.get(block=False)
123
+ if isinstance(result, BaseException):
124
+ future.set_exception(result)
125
+ else:
126
+ future.set_result(result)
127
+ else:
128
+ # Process ended but no result
129
+ future.set_result(None)
130
+ except (multiprocessing.TimeoutError, BrokenPipeError,
131
+ EOFError) as e:
132
+ future.set_exception(e)
133
+ finally:
134
+ if process.pid:
135
+ with self._lock:
136
+ if process.pid in self.workers:
137
+ del self.workers[process.pid]
138
+
139
+ def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
140
+ """Submit a task for execution and return a Future."""
141
+ future: concurrent.futures.Future = concurrent.futures.Future()
142
+
113
143
  if self._shutdown:
114
- return False
144
+ raise RuntimeError('Cannot submit task after executor is shutdown')
145
+
115
146
  with self._lock:
116
147
  if (self.max_workers is not None and
117
148
  len(self.workers) >= self.max_workers):
118
- return False
149
+ raise exceptions.ExecutionPoolFullError(
150
+ 'Maximum workers reached')
119
151
 
152
+ result_queue: multiprocessing.Queue = multiprocessing.Queue()
120
153
  process = multiprocessing.Process(target=_disposable_worker,
121
154
  args=(fn, self._initializer,
122
- self._initargs, args, kwargs))
155
+ self._initargs, result_queue,
156
+ args, kwargs))
157
+ process.daemon = True
123
158
  process.start()
124
159
 
125
160
  with self._lock:
@@ -128,13 +163,13 @@ class DisposableExecutor:
128
163
  raise RuntimeError('Failed to start process')
129
164
  self.workers[pid] = process
130
165
 
131
- # Start monitor thread to cleanup the worker process when it's done.
166
+ # Start monitor thread to cleanup the worker process when it's done
132
167
  monitor_thread = threading.Thread(target=self._monitor_worker,
133
- args=(process,),
168
+ args=(process, future, result_queue),
134
169
  daemon=True)
135
170
  monitor_thread.start()
136
171
 
137
- return True
172
+ return future
138
173
 
139
174
  def has_idle_workers(self) -> bool:
140
175
  """Check if there are any idle workers."""
@@ -173,12 +208,14 @@ class BurstableExecutor:
173
208
  self._burst_executor = DisposableExecutor(max_workers=burst_workers,
174
209
  **kwargs)
175
210
 
176
- def submit_until_success(self, fn, *args, **kwargs):
211
+ def submit_until_success(self, fn, *args,
212
+ **kwargs) -> concurrent.futures.Future:
177
213
  """Submit a task for execution until success.
178
214
 
179
215
  Prioritizes submitting to the guaranteed pool. If no idle workers
180
216
  are available in the guaranteed pool, it will submit to the burst
181
- pool.
217
+ pool. If the burst pool is full, it will retry the whole process until
218
+ the task is submitted successfully.
182
219
  TODO(aylei): this is coupled with executor.RequestWorker since we
183
220
  know the worker is dedicated to request scheduling and it either
184
221
  blocks on request polling or request submitting. So it is no harm
@@ -188,17 +225,20 @@ class BurstableExecutor:
188
225
 
189
226
  while True:
190
227
  if self._executor is not None and self._executor.has_idle_workers():
191
- self._executor.submit(fn, *args, **kwargs)
192
- break
228
+ logger.info('Submitting to the guaranteed pool')
229
+ return self._executor.submit(fn, *args, **kwargs)
193
230
  if (self._burst_executor is not None and
194
231
  self._burst_executor.has_idle_workers()):
195
- self._burst_executor.submit(fn, *args, **kwargs)
196
- break
232
+ try:
233
+ fut = self._burst_executor.submit(fn, *args, **kwargs)
234
+ return fut
235
+ except exceptions.ExecutionPoolFullError:
236
+ # The burst pool is full, try the next candidate.
237
+ pass
197
238
  if self._executor is not None:
198
239
  # No idle workers in either pool, still queue the request
199
240
  # to the guaranteed pool to keep behavior consistent.
200
- self._executor.submit(fn, *args, **kwargs)
201
- break
241
+ return self._executor.submit(fn, *args, **kwargs)
202
242
  logger.debug('No guaranteed pool set and the burst pool is full, '
203
243
  'retry later.')
204
244
  time.sleep(0.1)
@@ -11,7 +11,7 @@ import signal
11
11
  import sqlite3
12
12
  import time
13
13
  import traceback
14
- from typing import Any, Callable, Dict, List, Optional, Tuple
14
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
15
15
 
16
16
  import colorama
17
17
  import filelock
@@ -204,7 +204,8 @@ class Request:
204
204
  """
205
205
  assert isinstance(self.request_body,
206
206
  payloads.RequestBody), (self.name, self.request_body)
207
- user_name = global_user_state.get_user(self.user_id).name
207
+ user = global_user_state.get_user(self.user_id)
208
+ user_name = user.name if user is not None else None
208
209
  return RequestPayload(
209
210
  request_id=self.request_id,
210
211
  name=self.name,
@@ -464,7 +465,7 @@ def request_lock_path(request_id: str) -> str:
464
465
 
465
466
  @contextlib.contextmanager
466
467
  @init_db
467
- def update_request(request_id: str):
468
+ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
468
469
  """Get a SkyPilot API request."""
469
470
  request = _get_request_no_lock(request_id)
470
471
  yield request
sky/server/server.py CHANGED
@@ -49,6 +49,7 @@ from sky.server.requests import preconditions
49
49
  from sky.server.requests import requests as requests_lib
50
50
  from sky.skylet import constants
51
51
  from sky.usage import usage_lib
52
+ from sky.users import permission
52
53
  from sky.users import server as users_rest
53
54
  from sky.utils import admin_policy_utils
54
55
  from sky.utils import common as common_lib
@@ -105,17 +106,21 @@ class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
105
106
  """Middleware to handle RBAC."""
106
107
 
107
108
  async def dispatch(self, request: fastapi.Request, call_next):
108
- if request.url.path.startswith('/dashboard/'):
109
+ # TODO(hailong): should have a list of paths
110
+ # that are not checked for RBAC
111
+ if (request.url.path.startswith('/dashboard/') or
112
+ request.url.path.startswith('/api/')):
109
113
  return await call_next(request)
110
114
 
111
115
  auth_user = _get_auth_user_header(request)
112
116
  if auth_user is None:
113
117
  return await call_next(request)
114
118
 
115
- permission_service = users_rest.permission_service
119
+ permission_service = permission.permission_service
116
120
  # Check the role permission
117
- if permission_service.check_permission(auth_user.id, request.url.path,
118
- request.method):
121
+ if permission_service.check_endpoint_permission(auth_user.id,
122
+ request.url.path,
123
+ request.method):
119
124
  return fastapi.responses.JSONResponse(
120
125
  status_code=403, content={'detail': 'Forbidden'})
121
126
 
@@ -154,9 +159,15 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
154
159
  if auth_user is not None:
155
160
  newly_added = global_user_state.add_or_update_user(auth_user)
156
161
  if newly_added:
157
- users_rest.permission_service.add_user_if_not_exists(
162
+ permission.permission_service.add_user_if_not_exists(
158
163
  auth_user.id)
159
164
 
165
+ # Store user info in request.state for access by GET endpoints
166
+ if auth_user is not None:
167
+ request.state.auth_user = auth_user
168
+ else:
169
+ request.state.auth_user = None
170
+
160
171
  body = await request.body()
161
172
  if auth_user and body:
162
173
  try:
@@ -177,6 +188,12 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
177
188
  f'"env_vars" in request body is not a dictionary '
178
189
  f'for request {request.state.request_id}. '
179
190
  'Skipping user info injection into body.')
191
+ else:
192
+ original_json['env_vars'] = {}
193
+ original_json['env_vars'][
194
+ constants.USER_ID_ENV_VAR] = auth_user.id
195
+ original_json['env_vars'][
196
+ constants.USER_ENV_VAR] = auth_user.name
180
197
  request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
181
198
  return await call_next(request)
182
199
 
@@ -676,6 +693,7 @@ async def launch(launch_body: payloads.LaunchBody,
676
693
  func=execution.launch,
677
694
  schedule_type=requests_lib.ScheduleType.LONG,
678
695
  request_cluster_name=launch_body.cluster_name,
696
+ retryable=launch_body.retry_until_up,
679
697
  )
680
698
 
681
699
 
@@ -3,7 +3,7 @@
3
3
  import asyncio
4
4
  import collections
5
5
  import pathlib
6
- from typing import AsyncGenerator, Deque, Optional
6
+ from typing import AsyncGenerator, Deque, List, Optional
7
7
 
8
8
  import aiofiles
9
9
  import fastapi
@@ -15,6 +15,12 @@ from sky.utils import rich_utils
15
15
 
16
16
  logger = sky_logging.init_logger(__name__)
17
17
 
18
+ # When streaming log lines, buffer the lines in memory and flush them in chunks
19
+ # to improve log tailing throughput. Buffer size is the max size bytes of each
20
+ # chunk and the timeout threshold for flushing the buffer to ensure
21
+ # responsiveness.
22
+ _BUFFER_SIZE = 8 * 1024 # 8KB
23
+ _BUFFER_TIMEOUT = 0.02 # 20ms
18
24
  _HEARTBEAT_INTERVAL = 30
19
25
 
20
26
 
@@ -36,7 +42,16 @@ async def log_streamer(request_id: Optional[str],
36
42
  plain_logs: bool = False,
37
43
  tail: Optional[int] = None,
38
44
  follow: bool = True) -> AsyncGenerator[str, None]:
39
- """Streams the logs of a request."""
45
+ """Streams the logs of a request.
46
+
47
+ Args:
48
+ request_id: The request ID to check whether the log tailing process
49
+ should be stopped.
50
+ log_path: The path to the log file.
51
+ plain_logs: Whether to show plain logs.
52
+ tail: The number of lines to tail. If None, tail the whole file.
53
+ follow: Whether to follow the log file.
54
+ """
40
55
 
41
56
  if request_id is not None:
42
57
  status_msg = rich_utils.EncodedStatusMessage(
@@ -80,65 +95,106 @@ async def log_streamer(request_id: Optional[str],
80
95
  if show_request_waiting_spinner:
81
96
  yield status_msg.stop()
82
97
 
83
- # Find last n lines of the log file. Do not read the whole file into memory.
84
98
  async with aiofiles.open(log_path, 'rb') as f:
85
- if tail is not None:
86
- # TODO(zhwu): this will include the control lines for rich status,
87
- # which may not lead to exact tail lines when showing on the client
88
- # side.
89
- lines: Deque[str] = collections.deque(maxlen=tail)
90
- async for line_str in _yield_log_file_with_payloads_skipped(f):
91
- lines.append(line_str)
92
- for line_str in lines:
93
- yield line_str
94
-
95
- last_heartbeat_time = asyncio.get_event_loop().time()
99
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
100
+ follow):
101
+ yield chunk
102
+
103
+
104
+ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
105
+ request_id: Optional[str] = None,
106
+ plain_logs: bool = False,
107
+ tail: Optional[int] = None,
108
+ follow: bool = True) -> AsyncGenerator[str, None]:
109
+ """Tail the opened log file, buffer the lines and flush in chunks."""
110
+
111
+ if tail is not None:
112
+ # Find last n lines of the log file. Do not read the whole file into
113
+ # memory.
114
+ # TODO(zhwu): this will include the control lines for rich status,
115
+ # which may not lead to exact tail lines when showing on the client
116
+ # side.
117
+ lines: Deque[str] = collections.deque(maxlen=tail)
118
+ async for line_str in _yield_log_file_with_payloads_skipped(f):
119
+ lines.append(line_str)
120
+ for line_str in lines:
121
+ yield line_str
96
122
 
97
- while True:
98
- # Sleep 0 to yield control to allow other coroutines to run,
99
- # while keeps the loop tight to make log stream responsive.
100
- await asyncio.sleep(0)
101
- line: Optional[bytes] = await f.readline()
102
- if not line:
103
- if request_id is not None:
104
- request_task = requests_lib.get_request(request_id)
105
- if request_task.status > requests_lib.RequestStatus.RUNNING:
106
- if (request_task.status ==
107
- requests_lib.RequestStatus.CANCELLED):
108
- yield (f'{request_task.name!r} request {request_id}'
109
- ' cancelled\n')
110
- break
111
- if not follow:
123
+ last_heartbeat_time = asyncio.get_event_loop().time()
124
+
125
+ # Buffer the lines in memory and flush them in chunks to improve log
126
+ # tailing throughput.
127
+ buffer: List[str] = []
128
+ buffer_bytes = 0
129
+ last_flush_time = asyncio.get_event_loop().time()
130
+
131
+ async def flush_buffer() -> AsyncGenerator[str, None]:
132
+ nonlocal buffer, buffer_bytes, last_flush_time
133
+ if buffer:
134
+ yield ''.join(buffer)
135
+ buffer.clear()
136
+ buffer_bytes = 0
137
+ last_flush_time = asyncio.get_event_loop().time()
138
+
139
+ while True:
140
+ # Sleep 0 to yield control to allow other coroutines to run,
141
+ # while keeps the loop tight to make log stream responsive.
142
+ await asyncio.sleep(0)
143
+ current_time = asyncio.get_event_loop().time()
144
+ # Flush the buffer when it is not empty and the buffer is full or the
145
+ # flush timeout is reached.
146
+ if buffer and (buffer_bytes >= _BUFFER_SIZE or
147
+ (current_time - last_flush_time) >= _BUFFER_TIMEOUT):
148
+ async for chunk in flush_buffer():
149
+ yield chunk
150
+
151
+ line: Optional[bytes] = await f.readline()
152
+ if not line:
153
+ if request_id is not None:
154
+ request_task = requests_lib.get_request(request_id)
155
+ if request_task.status > requests_lib.RequestStatus.RUNNING:
156
+ if (request_task.status ==
157
+ requests_lib.RequestStatus.CANCELLED):
158
+ buffer.append(
159
+ f'{request_task.name!r} request {request_id}'
160
+ ' cancelled\n')
112
161
  break
162
+ if not follow:
163
+ break
164
+
165
+ if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
166
+ # Currently just used to keep the connection busy, refer to
167
+ # https://github.com/skypilot-org/skypilot/issues/5750 for
168
+ # more details.
169
+ buffer.append(
170
+ message_utils.encode_payload(
171
+ rich_utils.Control.HEARTBEAT.encode('')))
172
+ last_heartbeat_time = current_time
173
+
174
+ # Sleep shortly to avoid storming the DB and CPU, this has
175
+ # little impact on the responsivness here since we are waiting
176
+ # for a new line to come in.
177
+ await asyncio.sleep(0.1)
178
+ continue
113
179
 
114
- current_time = asyncio.get_event_loop().time()
115
- if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
116
- # Currently just used to keep the connection busy, refer to
117
- # https://github.com/skypilot-org/skypilot/issues/5750 for
118
- # more details.
119
- yield message_utils.encode_payload(
120
- rich_utils.Control.HEARTBEAT.encode(''))
121
- last_heartbeat_time = current_time
122
-
123
- # Sleep shortly to avoid storming the DB and CPU, this has
124
- # little impact on the responsivness here since we are waiting
125
- # for a new line to come in.
126
- await asyncio.sleep(0.1)
180
+ # Refresh the heartbeat time, this is a trivial optimization for
181
+ # performance but it helps avoid unnecessary heartbeat strings
182
+ # being printed when the client runs in an old version.
183
+ last_heartbeat_time = asyncio.get_event_loop().time()
184
+ line_str = line.decode('utf-8')
185
+ if plain_logs:
186
+ is_payload, line_str = message_utils.decode_payload(
187
+ line_str, raise_for_mismatch=False)
188
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
189
+ # sending invisible characters might be okay.
190
+ if is_payload:
127
191
  continue
192
+ buffer.append(line_str)
193
+ buffer_bytes += len(line_str.encode('utf-8'))
128
194
 
129
- # Refresh the heartbeat time, this is a trivial optimization for
130
- # performance but it helps avoid unnecessary heartbeat strings
131
- # being printed when the client runs in an old version.
132
- last_heartbeat_time = asyncio.get_event_loop().time()
133
- line_str = line.decode('utf-8')
134
- if plain_logs:
135
- is_payload, line_str = message_utils.decode_payload(
136
- line_str, raise_for_mismatch=False)
137
- # TODO(aylei): implement heartbeat mechanism for plain logs,
138
- # sending invisible characters might be okay.
139
- if is_payload:
140
- continue
141
- yield line_str
195
+ # Flush remaining lines in the buffer.
196
+ async for chunk in flush_buffer():
197
+ yield chunk
142
198
 
143
199
 
144
200
  def stream_response(