skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -126,13 +126,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
126
126
 
127
127
  async def _authenticate(self, request: fastapi.Request, call_next,
128
128
  session: aiohttp.ClientSession):
129
- forwarded_headers = dict(request.headers)
129
+ forwarded_headers = {}
130
130
  auth_url = f'{self.proxy_base}/oauth2/auth'
131
131
  forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
132
- # Remove content-length and content-type headers and drop request body
133
- # to reduce the auth overhead.
134
- forwarded_headers.pop('content-length', None)
135
- forwarded_headers.pop('content-type', None)
132
+ forwarded_headers['Host'] = request.url.hostname
136
133
  logger.debug(f'authenticate request: {auth_url}, '
137
134
  f'headers: {forwarded_headers}')
138
135
 
sky/server/common.py CHANGED
@@ -17,7 +17,6 @@ import time
17
17
  import typing
18
18
  from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
19
19
  Tuple, TypeVar, Union)
20
- from urllib import parse
21
20
  import uuid
22
21
 
23
22
  import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
342
341
  @annotations.lru_cache(scope='global')
343
342
  def get_dashboard_url(server_url: str,
344
343
  starting_page: Optional[str] = None) -> str:
345
- # The server_url may include username or password with the
346
- # format of https://username:password@example.com:8080/path
347
- # We need to remove the username and password and only
348
- # return `https://example.com:8080/path`
349
- parsed = parse.urlparse(server_url)
350
- # Reconstruct the URL without credentials but keeping the scheme
351
- dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
352
- if parsed.port:
353
- dashboard_url = f'{dashboard_url}:{parsed.port}'
354
- if parsed.path:
355
- dashboard_url = f'{dashboard_url}{parsed.path}'
356
- dashboard_url = dashboard_url.rstrip('/')
344
+ dashboard_url = server_url.rstrip('/')
357
345
  dashboard_url = f'{dashboard_url}/dashboard'
358
346
  if starting_page:
359
347
  dashboard_url = f'{dashboard_url}/{starting_page}'
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
490
478
  def handle_request_error(response: 'requests.Response') -> None:
491
479
  # Keep the original HTTPError if the response code >= 400
492
480
  response.raise_for_status()
481
+
493
482
  # Other status codes are not expected either, e.g. we do not expect to
494
483
  # handle redirection here.
495
484
  if response.status_code != 200:
@@ -550,19 +539,27 @@ def _start_api_server(deploy: bool = False,
550
539
  'is not a local URL')
551
540
 
552
541
  # Check available memory before starting the server.
553
- avail_mem_size_gb: float = common_utils.get_mem_size_gb()
554
- # pylint: disable=import-outside-toplevel
555
- import sky.jobs.utils as job_utils
556
- max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
557
- if job_utils.is_consolidation_mode() else
558
- server_constants.MIN_AVAIL_MEM_GB)
559
- if avail_mem_size_gb <= max_memory:
560
- logger.warning(
561
- f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
562
- f'has {avail_mem_size_gb:.1f}GB memory available. '
563
- f'At least {max_memory}GB is recommended to support higher '
564
- 'load with better performance.'
565
- f'{colorama.Style.RESET_ALL}')
542
+ # Skip this warning if postgres is used, as:
543
+ # 1) that's almost certainly a remote API server;
544
+ # 2) the actual consolidation mode config is stashed in the database,
545
+ # and the value of `job_utils.is_consolidation_mode` will not be
546
+ # the actual value in the db, but only None as in this case, the
547
+ # whole YAML config is really just `db: <URI>`.
548
+ if skypilot_config.get_nested(('db',), None) is None:
549
+ avail_mem_size_gb: float = common_utils.get_mem_size_gb()
550
+ # pylint: disable=import-outside-toplevel
551
+ import sky.jobs.utils as job_utils
552
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
553
+ if job_utils.is_consolidation_mode(
554
+ on_api_restart=True) else
555
+ server_constants.MIN_AVAIL_MEM_GB)
556
+ if avail_mem_size_gb <= max_memory:
557
+ logger.warning(
558
+ f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
559
+ f'only has {avail_mem_size_gb:.1f}GB memory available. '
560
+ f'At least {max_memory}GB is recommended to support higher '
561
+ 'load with better performance.'
562
+ f'{colorama.Style.RESET_ALL}')
566
563
 
567
564
  args = [sys.executable, *API_SERVER_CMD.split()]
568
565
  if deploy:
@@ -914,12 +911,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
914
911
  client_command: Optional[str],
915
912
  using_remote_api_server: bool, user: 'models.User',
916
913
  request_id: str) -> None:
917
- """Reload modules, global variables, and usage message for a new request."""
914
+ """Reload modules, global variables, and usage message for a new request.
915
+
916
+ Must be called within the request's context.
917
+ """
918
918
  # This should be called first to make sure the logger is up-to-date.
919
919
  sky_logging.reload_logger()
920
920
 
921
921
  # Reload the skypilot config to make sure the latest config is used.
922
- skypilot_config.safe_reload_config()
922
+ # We don't need to grab the lock here because this function is only
923
+ # run once we are inside the request's context, so there shouldn't
924
+ # be any race conditions when reloading the config.
925
+ skypilot_config.reload_config()
923
926
 
924
927
  # Reset the client entrypoint and command for the usage message.
925
928
  common_utils.set_request_context(
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 20
13
+ API_VERSION = 22
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
@@ -64,3 +64,7 @@ DAEMON_RESTART_INTERVAL_SECONDS = 20
64
64
 
65
65
  # Cookie header for stream request id.
66
66
  STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
67
+
68
+ # Valid empty values for pickled fields (base64-encoded pickled None)
69
+ # base64.b64encode(pickle.dumps(None)).decode('utf-8')
70
+ EMPTY_PICKLED_VALUE = 'gAROLg=='
sky/server/daemons.py CHANGED
@@ -7,6 +7,7 @@ from typing import Callable
7
7
  from sky import sky_logging
8
8
  from sky import skypilot_config
9
9
  from sky.server import constants as server_constants
10
+ from sky.server.requests import request_names
10
11
  from sky.utils import annotations
11
12
  from sky.utils import common_utils
12
13
  from sky.utils import env_options
@@ -26,7 +27,7 @@ class InternalRequestDaemon:
26
27
  """Internal daemon that runs an event in the background."""
27
28
 
28
29
  id: str
29
- name: str
30
+ name: request_names.RequestName
30
31
  event_fn: Callable[[], None]
31
32
  default_log_level: str = 'INFO'
32
33
  should_skip: Callable[[], bool] = _default_should_skip
@@ -38,9 +39,11 @@ class InternalRequestDaemon:
38
39
  try:
39
40
  # Refresh config within the while loop.
40
41
  # Since this is a long running daemon,
41
- # reload_config_for_new_request()
42
+ # reload_for_new_request()
42
43
  # is not called in between the event runs.
43
- skypilot_config.safe_reload_config()
44
+ # We don't need to grab the lock here because each of the daemons
45
+ # runs in its own process and thus has its own request context.
46
+ skypilot_config.reload_config()
44
47
  # Get the configured log level for the daemon inside the event loop
45
48
  # in case the log level changes after the API server is started.
46
49
  level_str = skypilot_config.get_nested(
@@ -193,26 +196,31 @@ INTERNAL_REQUEST_DAEMONS = [
193
196
  # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
194
197
  # set to updated status automatically, without showing users the hint of
195
198
  # cluster being stopped or down when `sky status -r` is called.
196
- InternalRequestDaemon(id='skypilot-status-refresh-daemon',
197
- name='status-refresh',
198
- event_fn=refresh_cluster_status_event,
199
- default_log_level='DEBUG'),
199
+ InternalRequestDaemon(
200
+ id='skypilot-status-refresh-daemon',
201
+ name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
202
+ event_fn=refresh_cluster_status_event,
203
+ default_log_level='DEBUG'),
200
204
  # Volume status refresh daemon to update the volume status periodically.
201
- InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
202
- name='volume-refresh',
203
- event_fn=refresh_volume_status_event),
205
+ InternalRequestDaemon(
206
+ id='skypilot-volume-status-refresh-daemon',
207
+ name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
208
+ event_fn=refresh_volume_status_event),
204
209
  InternalRequestDaemon(id='managed-job-status-refresh-daemon',
205
- name='managed-job-status-refresh',
210
+ name=request_names.RequestName.
211
+ REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
206
212
  event_fn=managed_job_status_refresh_event,
207
213
  should_skip=should_skip_managed_job_status_refresh),
208
- InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
209
- name='sky-serve-status-refresh',
210
- event_fn=sky_serve_status_refresh_event,
211
- should_skip=should_skip_sky_serve_status_refresh),
212
- InternalRequestDaemon(id='pool-status-refresh-daemon',
213
- name='pool-status-refresh',
214
- event_fn=pool_status_refresh_event,
215
- should_skip=should_skip_pool_status_refresh),
214
+ InternalRequestDaemon(
215
+ id='sky-serve-status-refresh-daemon',
216
+ name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
217
+ event_fn=sky_serve_status_refresh_event,
218
+ should_skip=should_skip_sky_serve_status_refresh),
219
+ InternalRequestDaemon(
220
+ id='pool-status-refresh-daemon',
221
+ name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
222
+ event_fn=pool_status_refresh_event,
223
+ should_skip=should_skip_pool_status_refresh),
216
224
  ]
217
225
 
218
226
 
@@ -47,7 +47,9 @@ from sky.server import metrics as metrics_lib
47
47
  from sky.server.requests import payloads
48
48
  from sky.server.requests import preconditions
49
49
  from sky.server.requests import process
50
+ from sky.server.requests import request_names
50
51
  from sky.server.requests import requests as api_requests
52
+ from sky.server.requests import threads
51
53
  from sky.server.requests.queues import local_queue
52
54
  from sky.server.requests.queues import mp_queue
53
55
  from sky.skylet import constants
@@ -81,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
81
83
  # platforms, including macOS.
82
84
  multiprocessing.set_start_method('spawn', force=True)
83
85
 
86
+ # An upper limit of max threads for request execution per server process that
87
+ # is unlikely to be reached, allowing higher concurrency while still
88
+ # preventing the server process from becoming overloaded.
89
+ _REQUEST_THREADS_LIMIT = 128
90
+
91
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
92
+ # A dedicated thread pool executor for synced requests execution in coroutine to
93
+ # avoid:
94
+ # 1. blocking the event loop;
95
+ # 2. exhausting the default thread pool executor of event loop;
96
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
97
+
98
+
99
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
100
+ """Lazy init and return the request thread executor for current process."""
101
+ global _REQUEST_THREAD_EXECUTOR
102
+ if _REQUEST_THREAD_EXECUTOR is not None:
103
+ return _REQUEST_THREAD_EXECUTOR
104
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
105
+ if _REQUEST_THREAD_EXECUTOR is None:
106
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
107
+ name='request_thread_executor',
108
+ max_workers=_REQUEST_THREADS_LIMIT)
109
+ return _REQUEST_THREAD_EXECUTOR
110
+
84
111
 
85
112
  class RequestQueue:
86
113
  """The queue for the requests, either redis or multiprocessing.
@@ -188,10 +215,11 @@ class RequestWorker:
188
215
  time.sleep(0.1)
189
216
  return
190
217
  request_id, ignore_return_value, _ = request_element
191
- request = api_requests.get_request(request_id)
218
+ request = api_requests.get_request(request_id, fields=['status'])
192
219
  assert request is not None, f'Request with ID {request_id} is None'
193
220
  if request.status == api_requests.RequestStatus.CANCELLED:
194
221
  return
222
+ del request
195
223
  logger.info(f'[{self}] Submitting request: {request_id}')
196
224
  # Start additional process to run the request, so that it can be
197
225
  # cancelled when requested by a user.
@@ -302,10 +330,7 @@ def override_request_env_and_config(
302
330
  # through the execution.
303
331
  user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
304
332
  name=request_body.env_vars[constants.USER_ENV_VAR])
305
- global_user_state.add_or_update_user(user)
306
- # Refetch the user to get the latest user info, including the created_at
307
- # field.
308
- user = global_user_state.get_user(user.id)
333
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
309
334
 
310
335
  # Force color to be enabled.
311
336
  os.environ['CLICOLOR_FORCE'] = '1'
@@ -349,32 +374,6 @@ def override_request_env_and_config(
349
374
  os.environ.update(original_env)
350
375
 
351
376
 
352
- def _get_current_output() -> Tuple[int, int]:
353
- """Get the current stdout and stderr file descriptors."""
354
- return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
355
-
356
-
357
- def _redirect_output(file: TextIO) -> None:
358
- """Redirect stdout and stderr to the log file."""
359
- # Get the file descriptor from the file object
360
- fd = file.fileno()
361
- # Copy this fd to stdout and stderr
362
- os.dup2(fd, sys.stdout.fileno())
363
- os.dup2(fd, sys.stderr.fileno())
364
-
365
-
366
- def _restore_output(original_stdout: Optional[int],
367
- original_stderr: Optional[int]) -> None:
368
- """Restore stdout and stderr to their original file descriptors."""
369
- if original_stdout is not None:
370
- os.dup2(original_stdout, sys.stdout.fileno())
371
- os.close(original_stdout)
372
-
373
- if original_stderr is not None:
374
- os.dup2(original_stderr, sys.stderr.fileno())
375
- os.close(original_stderr)
376
-
377
-
378
377
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
379
378
  raise KeyboardInterrupt
380
379
 
@@ -397,11 +396,43 @@ def _request_execution_wrapper(request_id: str,
397
396
  rss_begin = proc.memory_info().rss
398
397
  db_utils.set_max_connections(num_db_connections_per_worker)
399
398
  # Handle the SIGTERM signal to abort the request processing gracefully.
400
- signal.signal(signal.SIGTERM, _sigterm_handler)
399
+ # Only set up signal handlers in the main thread, as signal.signal() raises
400
+ # ValueError if called from a non-main thread (e.g., in tests).
401
+ if threading.current_thread() is threading.main_thread():
402
+ signal.signal(signal.SIGTERM, _sigterm_handler)
401
403
 
402
404
  logger.info(f'Running request {request_id} with pid {pid}')
403
405
 
404
406
  original_stdout = original_stderr = None
407
+
408
+ def _save_current_output() -> None:
409
+ """Save the current stdout and stderr file descriptors."""
410
+ nonlocal original_stdout, original_stderr
411
+ original_stdout = os.dup(sys.stdout.fileno())
412
+ original_stderr = os.dup(sys.stderr.fileno())
413
+
414
+ def _redirect_output(file: TextIO) -> None:
415
+ """Redirect stdout and stderr to the log file."""
416
+ # Get the file descriptor from the file object
417
+ fd = file.fileno()
418
+ # Copy this fd to stdout and stderr
419
+ os.dup2(fd, sys.stdout.fileno())
420
+ os.dup2(fd, sys.stderr.fileno())
421
+
422
+ def _restore_output() -> None:
423
+ """Restore stdout and stderr to their original file descriptors."""
424
+ nonlocal original_stdout, original_stderr
425
+ if original_stdout is not None:
426
+ os.dup2(original_stdout, sys.stdout.fileno())
427
+ os.close(original_stdout)
428
+ original_stdout = None
429
+
430
+ if original_stderr is not None:
431
+ os.dup2(original_stderr, sys.stderr.fileno())
432
+ os.close(original_stderr)
433
+ original_stderr = None
434
+
435
+ request_name = None
405
436
  try:
406
437
  # As soon as the request is updated with the executor PID, we can
407
438
  # receive SIGTERM from cancellation. So, we update the request inside
@@ -422,7 +453,7 @@ def _request_execution_wrapper(request_id: str,
422
453
  # Store copies of the original stdout and stderr file descriptors
423
454
  # We do this in two steps because we should make sure to restore the
424
455
  # original values even if we are cancelled or fail during the redirect.
425
- original_stdout, original_stderr = _get_current_output()
456
+ _save_current_output()
426
457
 
427
458
  # Append to the log file instead of overwriting it since there might be
428
459
  # logs from previous retries.
@@ -464,15 +495,14 @@ def _request_execution_wrapper(request_id: str,
464
495
  # clear the pid of the request.
465
496
  request_task.pid = None
466
497
  # Yield control to the scheduler for uniform handling of retries.
467
- _restore_output(original_stdout, original_stderr)
498
+ _restore_output()
468
499
  raise
469
500
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
470
501
  api_requests.set_request_failed(request_id, e)
471
502
  # Manually reset the original stdout and stderr file descriptors early
472
503
  # so that the "Request xxxx failed due to ..." log message will be
473
504
  # written to the original stdout and stderr file descriptors.
474
- _restore_output(original_stdout, original_stderr)
475
- original_stdout = original_stderr = None
505
+ _restore_output()
476
506
  logger.info(f'Request {request_id} failed due to '
477
507
  f'{common_utils.format_exception(e)}')
478
508
  return
@@ -482,11 +512,10 @@ def _request_execution_wrapper(request_id: str,
482
512
  # Manually reset the original stdout and stderr file descriptors early
483
513
  # so that the "Request xxxx failed due to ..." log message will be
484
514
  # written to the original stdout and stderr file descriptors.
485
- _restore_output(original_stdout, original_stderr)
486
- original_stdout = original_stderr = None
515
+ _restore_output()
487
516
  logger.info(f'Request {request_id} finished')
488
517
  finally:
489
- _restore_output(original_stdout, original_stderr)
518
+ _restore_output()
490
519
  try:
491
520
  # Capture the peak RSS before GC.
492
521
  peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
@@ -495,7 +524,8 @@ def _request_execution_wrapper(request_id: str,
495
524
  annotations.clear_request_level_cache()
496
525
  with metrics_utils.time_it(name='release_memory', group='internal'):
497
526
  common_utils.release_memory()
498
- _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
527
+ if request_name is not None:
528
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
499
529
  except Exception as e: # pylint: disable=broad-except
500
530
  logger.error(f'Failed to record memory metrics: '
501
531
  f'{common_utils.format_exception(e)}')
@@ -539,6 +569,21 @@ class CoroutineTask:
539
569
  pass
540
570
 
541
571
 
572
+ def check_request_thread_executor_available() -> None:
573
+ """Check if the request thread executor is available.
574
+
575
+ This is a best effort check to hint the client to retry other server
576
+ processes when there is no available thread worker in current one. But
577
+ a request may pass this check and still fail to get a worker at execution
578
+ time due to race condition. In this case, the client will see a failed
579
+ request instead of retry.
580
+
581
+ TODO(aylei): this can be refined with a refactor of our coroutine
582
+ execution flow.
583
+ """
584
+ get_request_thread_executor().check_available()
585
+
586
+
542
587
  def execute_request_in_coroutine(
543
588
  request: api_requests.Request) -> CoroutineTask:
544
589
  """Execute a request in current event loop.
@@ -553,6 +598,18 @@ def execute_request_in_coroutine(
553
598
  return CoroutineTask(task)
554
599
 
555
600
 
601
+ def _execute_with_config_override(func: Callable,
602
+ request_body: payloads.RequestBody,
603
+ request_id: str, request_name: str,
604
+ **kwargs) -> Any:
605
+ """Execute a function with env and config override inside a thread."""
606
+ # Override the environment and config within this thread's context,
607
+ # which gets copied when we call to_thread.
608
+ with override_request_env_and_config(request_body, request_id,
609
+ request_name):
610
+ return func(**kwargs)
611
+
612
+
556
613
  async def _execute_request_coroutine(request: api_requests.Request):
557
614
  """Execute a request in current event loop.
558
615
 
@@ -566,39 +623,43 @@ async def _execute_request_coroutine(request: api_requests.Request):
566
623
  logger.info(f'Executing request {request.request_id} in coroutine')
567
624
  func = request.entrypoint
568
625
  request_body = request.request_body
569
- with api_requests.update_request(request.request_id) as request_task:
570
- request_task.status = api_requests.RequestStatus.RUNNING
626
+ await api_requests.update_status_async(request.request_id,
627
+ api_requests.RequestStatus.RUNNING)
571
628
  # Redirect stdout and stderr to the request log path.
572
629
  original_output = ctx.redirect_log(request.log_path)
573
- # Override environment variables that backs env_options.Options
574
- # TODO(aylei): compared to process executor, running task in coroutine has
575
- # two issues to fix:
576
- # 1. skypilot config is not contextual
577
- # 2. envs that read directly from os.environ are not contextual
578
- ctx.override_envs(request_body.env_vars)
579
- fut: asyncio.Future = context_utils.to_thread(func,
580
- **request_body.to_kwargs())
630
+ try:
631
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
632
+ get_request_thread_executor(), _execute_with_config_override, func,
633
+ request_body, request.request_id, request.name,
634
+ **request_body.to_kwargs())
635
+ except Exception as e: # pylint: disable=broad-except
636
+ ctx.redirect_log(original_output)
637
+ await api_requests.set_request_failed_async(request.request_id, e)
638
+ logger.error(f'Failed to run request {request.request_id} due to '
639
+ f'{common_utils.format_exception(e)}')
640
+ return
581
641
 
582
642
  async def poll_task(request_id: str) -> bool:
583
- request = await api_requests.get_request_async(request_id)
584
- if request is None:
643
+ req_status = await api_requests.get_request_status_async(request_id)
644
+ if req_status is None:
585
645
  raise RuntimeError('Request not found')
586
646
 
587
- if request.status == api_requests.RequestStatus.CANCELLED:
647
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
588
648
  ctx.cancel()
589
649
  return True
590
650
 
591
651
  if fut.done():
592
652
  try:
593
653
  result = await fut
594
- api_requests.set_request_succeeded(request_id, result)
654
+ await api_requests.set_request_succeeded_async(
655
+ request_id, result)
595
656
  except asyncio.CancelledError:
596
657
  # The task is cancelled by ctx.cancel(), where the status
597
658
  # should already be set to CANCELLED.
598
659
  pass
599
660
  except Exception as e: # pylint: disable=broad-except
600
661
  ctx.redirect_log(original_output)
601
- api_requests.set_request_failed(request_id, e)
662
+ await api_requests.set_request_failed_async(request_id, e)
602
663
  logger.error(f'Request {request_id} failed due to '
603
664
  f'{common_utils.format_exception(e)}')
604
665
  return True
@@ -613,13 +674,13 @@ async def _execute_request_coroutine(request: api_requests.Request):
613
674
  except asyncio.CancelledError:
614
675
  # Current coroutine is cancelled due to client disconnect, set the
615
676
  # request status for consistency.
616
- api_requests.set_request_cancelled(request.request_id)
677
+ await api_requests.set_request_cancelled_async(request.request_id)
617
678
  pass
618
679
  # pylint: disable=broad-except
619
680
  except (Exception, KeyboardInterrupt, SystemExit) as e:
620
681
  # Handle any other error
621
682
  ctx.redirect_log(original_output)
622
- api_requests.set_request_failed(request.request_id, e)
683
+ await api_requests.set_request_failed_async(request.request_id, e)
623
684
  logger.error(f'Request {request.request_id} interrupted due to '
624
685
  f'unhandled exception: {common_utils.format_exception(e)}')
625
686
  raise
@@ -629,9 +690,9 @@ async def _execute_request_coroutine(request: api_requests.Request):
629
690
  ctx.cancel()
630
691
 
631
692
 
632
- def prepare_request(
693
+ async def prepare_request_async(
633
694
  request_id: str,
634
- request_name: str,
695
+ request_name: request_names.RequestName,
635
696
  request_body: payloads.RequestBody,
636
697
  func: Callable[P, Any],
637
698
  request_cluster_name: Optional[str] = None,
@@ -655,7 +716,7 @@ def prepare_request(
655
716
  user_id=user_id,
656
717
  cluster_name=request_cluster_name)
657
718
 
658
- if not api_requests.create_if_not_exists(request):
719
+ if not await api_requests.create_if_not_exists_async(request):
659
720
  raise exceptions.RequestAlreadyExistsError(
660
721
  f'Request {request_id} already exists.')
661
722
 
@@ -663,17 +724,18 @@ def prepare_request(
663
724
  return request
664
725
 
665
726
 
666
- def schedule_request(request_id: str,
667
- request_name: str,
668
- request_body: payloads.RequestBody,
669
- func: Callable[P, Any],
670
- request_cluster_name: Optional[str] = None,
671
- ignore_return_value: bool = False,
672
- schedule_type: api_requests.ScheduleType = (
673
- api_requests.ScheduleType.LONG),
674
- is_skypilot_system: bool = False,
675
- precondition: Optional[preconditions.Precondition] = None,
676
- retryable: bool = False) -> None:
727
+ async def schedule_request_async(request_id: str,
728
+ request_name: request_names.RequestName,
729
+ request_body: payloads.RequestBody,
730
+ func: Callable[P, Any],
731
+ request_cluster_name: Optional[str] = None,
732
+ ignore_return_value: bool = False,
733
+ schedule_type: api_requests.ScheduleType = (
734
+ api_requests.ScheduleType.LONG),
735
+ is_skypilot_system: bool = False,
736
+ precondition: Optional[
737
+ preconditions.Precondition] = None,
738
+ retryable: bool = False) -> None:
677
739
  """Enqueue a request to the request queue.
678
740
 
679
741
  Args:
@@ -694,9 +756,11 @@ def schedule_request(request_id: str,
694
756
  The precondition is waited asynchronously and does not block the
695
757
  caller.
696
758
  """
697
- request_task = prepare_request(request_id, request_name, request_body, func,
698
- request_cluster_name, schedule_type,
699
- is_skypilot_system)
759
+ request_task = await prepare_request_async(request_id, request_name,
760
+ request_body, func,
761
+ request_cluster_name,
762
+ schedule_type,
763
+ is_skypilot_system)
700
764
  schedule_prepared_request(request_task, ignore_return_value, precondition,
701
765
  retryable)
702
766
 
@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
319
319
  # Only return fields that are needed for the
320
320
  # dashboard / CLI summary response
321
321
  summary_response: bool = False
322
+ # Include the cluster handle in the response
323
+ include_handle: bool = True
322
324
 
323
325
 
324
326
  class StartBody(RequestBody):
@@ -363,9 +365,10 @@ class CancelBody(RequestBody):
363
365
  return kwargs
364
366
 
365
367
 
366
- class ClusterNameBody(RequestBody):
368
+ class ProvisionLogsBody(RequestBody):
367
369
  """Cluster node."""
368
370
  cluster_name: str
371
+ worker: Optional[int] = None
369
372
 
370
373
 
371
374
  class ClusterJobBody(RequestBody):
@@ -541,6 +544,9 @@ class JobsQueueV2Body(RequestBody):
541
544
  page: Optional[int] = None
542
545
  limit: Optional[int] = None
543
546
  statuses: Optional[List[str]] = None
547
+ # The fields to return in the response.
548
+ # Refer to the fields in the `class ManagedJobRecord` in `response.py`
549
+ fields: Optional[List[str]] = None
544
550
 
545
551
 
546
552
  class JobsCancelBody(RequestBody):
@@ -573,6 +579,8 @@ class RequestStatusBody(pydantic.BaseModel):
573
579
  """The request body for the API request status endpoint."""
574
580
  request_ids: Optional[List[str]] = None
575
581
  all_status: bool = False
582
+ limit: Optional[int] = None
583
+ fields: Optional[List[str]] = None
576
584
 
577
585
 
578
586
  class ServeUpBody(RequestBody):