skypilot-nightly 1.0.0.dev20250609__py3-none-any.whl → 1.0.0.dev20250611__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (117)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +134 -5
  3. sky/authentication.py +1 -7
  4. sky/backends/cloud_vm_ray_backend.py +9 -20
  5. sky/benchmark/benchmark_state.py +39 -1
  6. sky/cli.py +3 -5
  7. sky/client/cli.py +3 -5
  8. sky/client/sdk.py +49 -4
  9. sky/clouds/kubernetes.py +15 -24
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
  18. sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/600.15a0009177e86b86.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
  21. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
  25. sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
  26. sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
  27. sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/938-ab185187a63f9cdb.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
  31. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/_app-7bbd9d39d6f9a98a.js +20 -0
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-208a9812ab4f61c9.js +1 -0
  47. sky/dashboard/out/_next/static/css/{8b1c8321d4c02372.css → 5d71bfc09f184bab.css} +1 -1
  48. sky/dashboard/out/_next/static/zJqasksBQ3HcqMpA2wTUZ/_buildManifest.js +1 -0
  49. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  50. sky/dashboard/out/clusters/[cluster].html +1 -1
  51. sky/dashboard/out/clusters.html +1 -1
  52. sky/dashboard/out/config.html +1 -1
  53. sky/dashboard/out/index.html +1 -1
  54. sky/dashboard/out/infra/[context].html +1 -1
  55. sky/dashboard/out/infra.html +1 -1
  56. sky/dashboard/out/jobs/[job].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/workspace/new.html +1 -1
  60. sky/dashboard/out/workspaces/[name].html +1 -1
  61. sky/dashboard/out/workspaces.html +1 -1
  62. sky/exceptions.py +18 -0
  63. sky/global_user_state.py +181 -74
  64. sky/jobs/client/sdk.py +29 -21
  65. sky/jobs/scheduler.py +4 -5
  66. sky/jobs/state.py +104 -11
  67. sky/jobs/utils.py +5 -5
  68. sky/provision/kubernetes/constants.py +9 -0
  69. sky/provision/kubernetes/utils.py +106 -7
  70. sky/serve/client/sdk.py +56 -45
  71. sky/server/common.py +1 -5
  72. sky/server/requests/executor.py +50 -20
  73. sky/server/requests/payloads.py +3 -0
  74. sky/server/requests/process.py +69 -29
  75. sky/server/server.py +1 -0
  76. sky/server/stream_utils.py +111 -55
  77. sky/skylet/constants.py +1 -2
  78. sky/skylet/job_lib.py +95 -40
  79. sky/skypilot_config.py +99 -25
  80. sky/users/permission.py +34 -17
  81. sky/utils/admin_policy_utils.py +41 -16
  82. sky/utils/context.py +21 -1
  83. sky/utils/controller_utils.py +16 -1
  84. sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
  85. sky/utils/schemas.py +11 -3
  86. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/METADATA +1 -1
  87. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/RECORD +92 -81
  88. sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
  91. sky/dashboard/out/_next/static/chunks/470-680c19413b8f808b.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/63-e2d7b1e75e67c713.js +0 -66
  93. sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
  94. sky/dashboard/out/_next/static/chunks/843-16c7194621b2b512.js +0 -11
  95. sky/dashboard/out/_next/static/chunks/856-affc52adf5403a3a.js +0 -1
  96. sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/973-aed916d5b02d2d63.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/pages/_app-5f16aba5794ee8e7.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-d31688d3e52736dd.js +0 -6
  100. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e7d8710a9b0491e5.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/pages/clusters-3c674e5d970e05cb.js +0 -1
  102. sky/dashboard/out/_next/static/chunks/pages/config-3aac7a015c6eede1.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-46d2e4ad6c487260.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/pages/infra-7013d816a2a0e76c.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-f7f0c9e156d328bc.js +0 -16
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-87e60396c376292f.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/users-9355a0f13d1db61d.js +0 -16
  108. sky/dashboard/out/_next/static/chunks/pages/workspace/new-9a749cca1813bd27.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-8eeb628e03902f1b.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces-8fbcc5ab4af316d0.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
  112. sky/dashboard/out/_next/static/xos0euNCptbGAM7_Q3Acl/_buildManifest.js +0 -1
  113. /sky/dashboard/out/_next/static/{xos0euNCptbGAM7_Q3Acl → zJqasksBQ3HcqMpA2wTUZ}/_ssgManifest.js +0 -0
  114. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/WHEEL +0 -0
  115. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/entry_points.txt +0 -0
  116. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/licenses/LICENSE +0 -0
  117. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -1,6 +1,7 @@
 """Kubernetes utilities for SkyPilot."""
 import dataclasses
 import functools
+import hashlib
 import json
 import math
 import os
@@ -1555,11 +1556,11 @@ def is_kubeconfig_exec_auth(
                 == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
             ctx_name = context_obj['name']
             exec_msg = ('exec-based authentication is used for '
-                        f'Kubernetes context {ctx_name!r}.'
-                        ' This may cause issues with autodown or when running '
-                        'Managed Jobs or SkyServe controller on Kubernetes. '
-                        'To fix, configure SkyPilot to create a service account '
-                        'for running pods by setting the following in '
+                        f'Kubernetes context {ctx_name!r}. '
+                        'Make sure that the corresponding cloud provider is '
+                        'also enabled through `sky check` (e.g.: GCP for GKE). '
+                        'Alternatively, configure SkyPilot to create a service '
+                        'account for running pods by setting the following in '
                         '~/.sky/config.yaml:\n'
                         '    kubernetes:\n'
                         '      remote_identity: SERVICE_ACCOUNT\n'
@@ -2877,8 +2878,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
     context = provider_config.get('context',
                                   get_current_kube_config_context_name())
     if context == kubernetes.in_cluster_context_name():
-        # If the context (also used as the region) is in-cluster, we need to
-        # we need to use in-cluster auth by setting the context to None.
+        # If the context (also used as the region) is in-cluster, we need
+        # to use in-cluster auth by setting the context to None.
         context = None
     return context
 
@@ -3135,3 +3136,101 @@ def get_kubeconfig_paths() -> List[str]:
     for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
         expanded.append(os.path.expanduser(path))
     return expanded
+
+
+def format_kubeconfig_exec_auth(config: Any,
+                                output_path: str,
+                                inject_wrapper: bool = True) -> bool:
+    """Reformat the kubeconfig so that exec-based authentication can be used
+    with SkyPilot. Will create a new kubeconfig file under <output_path>
+    regardless of whether a change has been made.
+
+    kubectl internally strips all environment variables except for system
+    defaults. If `inject_wrapper` is true, a wrapper executable is applied
+    to inject the relevant PATH information before exec-auth is executed.
+
+    Contents of sky-kube-exec-wrapper:
+
+        #!/bin/bash
+        export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk:$PATH"
+        exec "$@"
+
+    refer to `skylet/constants.py` for more information.
+
+    Args:
+        config (dict): kubeconfig parsed by yaml.safe_load
+        output_path (str): Path where the potentially modified kubeconfig file
+            will be saved
+        inject_wrapper (bool): Whether to inject the wrapper script
+    Returns: whether config was updated, for logging purposes
+    """
+    updated = False
+    for user in config.get('users', []):
+        exec_info = user.get('user', {}).get('exec', {})
+        current_command = exec_info.get('command', '')
+
+        if current_command:
+            # Strip the path and keep only the executable name
+            executable = os.path.basename(current_command)
+            if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
+                # we don't want this happening recursively.
+                continue
+
+            if inject_wrapper:
+                exec_info[
+                    'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
+                if exec_info.get('args') is None:
+                    exec_info['args'] = []
+                exec_info['args'].insert(0, executable)
+                updated = True
+            elif executable != current_command:
+                exec_info['command'] = executable
+                updated = True
+
+            # Handle Nebius kubeconfigs: change --profile to 'sky'
+            if executable == 'nebius':
+                args = exec_info.get('args', [])
+                if args and '--profile' in args:
+                    try:
+                        profile_index = args.index('--profile')
+                        if profile_index + 1 < len(args):
+                            old_profile = args[profile_index + 1]
+                            if old_profile != 'sky':
+                                args[profile_index + 1] = 'sky'
+                                updated = True
+                    except ValueError:
+                        pass
+
+    os.makedirs(os.path.dirname(os.path.expanduser(output_path)),
+                exist_ok=True)
+    with open(output_path, 'w', encoding='utf-8') as file:
+        yaml.safe_dump(config, file)
+
+    return updated
+
+
+def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
+    """Reformat the kubeconfig file or retrieve it from cache if it has already
+    been formatted before. Store it in the cache directory if necessary.
+
+    Having a cache for this is good if users spawn an extreme number of jobs
+    concurrently.
+
+    Args:
+        kubeconfig_path (str): kubeconfig path
+    Returns: updated kubeconfig path
+    """
+    # TODO(kyuds): GC cache files
+    with open(kubeconfig_path, 'r', encoding='utf-8') as file:
+        config = yaml.safe_load(file)
+    normalized = yaml.dump(config, sort_keys=True)
+    hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
+    path = os.path.expanduser(
+        f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{hashed}.yaml'
+    )
+
+    # If we have already converted the same kubeconfig before, just return.
+    if os.path.isfile(path):
+        return path
+
+    format_kubeconfig_exec_auth(config, path)
+    return path
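
A concrete example makes the wrapper injection easier to follow. The standalone sketch below replays the same rewrite on a parsed kubeconfig; WRAPPER is an assumed stand-in for the real value of kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER, and the GKE plugin path is only illustrative.

# Hedged sketch of the exec-auth rewrite done by format_kubeconfig_exec_auth;
# an illustration, not the SkyPilot implementation itself.
import os

import yaml

WRAPPER = 'sky-kube-exec-wrapper'  # assumption; see kubernetes_constants

kubeconfig = yaml.safe_load('''
users:
- name: gke-user
  user:
    exec:
      apiVersion: client.authentication.k8s.io/v1beta1
      command: /usr/local/bin/gke-gcloud-auth-plugin
''')

for user in kubeconfig.get('users', []):
    exec_info = user.get('user', {}).get('exec', {})
    command = exec_info.get('command', '')
    if not command:
        continue
    executable = os.path.basename(command)  # 'gke-gcloud-auth-plugin'
    if executable == WRAPPER:
        continue  # already wrapped; avoid wrapping recursively
    # The original executable becomes argv[1] of the wrapper, which fixes
    # PATH and then runs `exec "$@"`.
    exec_info['command'] = WRAPPER
    exec_info['args'] = [executable] + (exec_info.get('args') or [])

print(yaml.safe_dump(kubeconfig))

After the rewrite, kubectl invokes the wrapper, which restores PATH for the SkyPilot runtime and then execs the original auth plugin with its original arguments.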
sky/serve/client/sdk.py CHANGED
@@ -10,6 +10,8 @@ from sky.client import common as client_common
 from sky.server import common as server_common
 from sky.server.requests import payloads
 from sky.usage import usage_lib
+from sky.utils import admin_policy_utils
+from sky.utils import context
 from sky.utils import dag_utils
 
 if typing.TYPE_CHECKING:
@@ -23,6 +25,7 @@ else:
     requests = adaptors_common.LazyImport('requests')
 
 
+@context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 def up(
@@ -55,30 +58,36 @@ def up(
     from sky.client import sdk  # pylint: disable=import-outside-toplevel
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
-    sdk.validate(dag)
-    request_id = sdk.optimize(dag)
-    sdk.stream_and_get(request_id)
-    if _need_confirmation:
-        prompt = f'Launching a new service {service_name!r}. Proceed?'
-        if prompt is not None:
-            click.confirm(prompt, default=True, abort=True, show_default=True)
-
-    dag = client_common.upload_mounts_to_api_server(dag)
-    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
-
-    body = payloads.ServeUpBody(
-        task=dag_str,
-        service_name=service_name,
-    )
-    response = requests.post(
-        f'{server_common.get_server_url()}/serve/up',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None),
-        cookies=server_common.get_api_cookie_jar(),
-    )
-    return server_common.get_request_id(response)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            prompt = f'Launching a new service {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+        body = payloads.ServeUpBody(
+            task=dag_str,
+            service_name=service_name,
+        )
+        response = requests.post(
+            f'{server_common.get_server_url()}/serve/up',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None),
+            cookies=server_common.get_api_cookie_jar(),
+        )
+        return server_common.get_request_id(response)
 
 
+@context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 def update(
@@ -112,30 +121,32 @@ def update(
     from sky.client import sdk  # pylint: disable=import-outside-toplevel
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
-    sdk.validate(dag)
-    request_id = sdk.optimize(dag)
-    sdk.stream_and_get(request_id)
-    if _need_confirmation:
-        click.confirm(f'Updating service {service_name!r}. Proceed?',
-                      default=True,
-                      abort=True,
-                      show_default=True)
-
-    dag = client_common.upload_mounts_to_api_server(dag)
-    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
-    body = payloads.ServeUpdateBody(
-        task=dag_str,
-        service_name=service_name,
-        mode=mode,
-    )
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            click.confirm(f'Updating service {service_name!r}. Proceed?',
+                          default=True,
+                          abort=True,
+                          show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+        body = payloads.ServeUpdateBody(
+            task=dag_str,
+            service_name=service_name,
+            mode=mode,
+        )
 
-    response = requests.post(
-        f'{server_common.get_server_url()}/serve/update',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None),
-        cookies=server_common.get_api_cookie_jar(),
-    )
-    return server_common.get_request_id(response)
+        response = requests.post(
+            f'{server_common.get_server_url()}/serve/update',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None),
+            cookies=server_common.get_api_cookie_jar(),
+        )
+        return server_common.get_request_id(response)
 
 
 @usage_lib.entrypoint
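
Both serve entrypoints now run their whole body inside a request-local scope: @context.contextual gives the call its own context, and apply_and_use_config_in_current_request applies the client-side admin policy and yields the (possibly mutated) dag. Below is a self-contained sketch of that general pattern; every name in it is an illustrative stand-in, not SkyPilot's actual API.

# Sketch: request-local config via contextvars plus a policy context manager.
import contextlib
import contextvars
import functools

_active_config: contextvars.ContextVar = contextvars.ContextVar(
    'active_config', default=None)


def contextual(fn):
    """Run fn in a copied context so config overrides stay request-local."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        return contextvars.copy_context().run(fn, *args, **kwargs)

    return wrapper


@contextlib.contextmanager
def apply_and_use_policy(dag):
    # A real admin policy may rewrite the dag and install config overrides.
    mutated_dag = dict(dag, labels={'team': 'infra'})  # pretend mutation
    _active_config.set({'admin_policy_applied': True})
    yield mutated_dag


@contextual
def up(dag):
    with apply_and_use_policy(dag) as dag:
        # Everything in here sees the policy-mutated dag and the
        # request-local config; concurrent requests do not interfere.
        return dag, _active_config.get()


print(up({'name': 'svc'}))
print(_active_config.get())  # None: the override did not leak out

Keeping the upload, the confirmation prompt, and the POST inside the with-block is what guarantees the dag and config sent to the server are the policy-mutated ones.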
sky/server/common.py CHANGED
@@ -420,11 +420,7 @@ def _start_api_server(deploy: bool = False,
             dashboard_msg += (
                 'Dashboard may be stale when installed from source, '
                 'to rebuild: npm --prefix sky/dashboard install '
-                '&& npm --prefix sky/dashboard run build\n')
-        dashboard_msg += (
-            f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-            f'Dashboard: {get_dashboard_url(server_url)}')
-        dashboard_msg += f'{colorama.Style.RESET_ALL}'
+                '&& npm --prefix sky/dashboard run build')
     logger.info(
         ux_utils.finishing_message(
             f'SkyPilot API server started. {dashboard_msg}'))
sky/server/requests/executor.py CHANGED
@@ -19,6 +19,7 @@ The number of the workers is determined by the system resources.
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
 import asyncio
+import concurrent.futures
 import contextlib
 import multiprocessing
 import os
@@ -93,21 +94,21 @@ class RequestQueue:
         else:
             raise RuntimeError(f'Invalid queue backend: {backend}')
 
-    def put(self, request: Tuple[str, bool]) -> None:
+    def put(self, request: Tuple[str, bool, bool]) -> None:
         """Put a request to the queue.
 
         Args:
-            request: A tuple of request_id and ignore_return_value.
+            request: A tuple of request_id, ignore_return_value, and retryable.
         """
         self.queue.put(request)  # type: ignore
 
-    def get(self) -> Optional[Tuple[str, bool]]:
+    def get(self) -> Optional[Tuple[str, bool, bool]]:
         """Get a request from the queue.
 
         It is non-blocking if the queue is empty, and returns None.
 
         Returns:
-            A tuple of request_id and ignore_return_value.
+            A tuple of request_id, ignore_return_value, and retryable.
         """
         try:
             return self.queue.get(block=False)
@@ -159,7 +160,7 @@ class RequestWorker:
             if request_element is None:
                 time.sleep(0.1)
                 return
-            request_id, ignore_return_value = request_element
+            request_id, ignore_return_value, retryable = request_element
             request = api_requests.get_request(request_id)
             assert request is not None, f'Request with ID {request_id} is None'
             if request.status == api_requests.RequestStatus.CANCELLED:
@@ -171,8 +172,14 @@
             # multiple requests can share the same process pid, which may cause
             # issues with SkyPilot core functions if they rely on the exit of
             # the process, such as subprocess_daemon.py.
-            executor.submit_until_success(_request_execution_wrapper,
-                                          request_id, ignore_return_value)
+            fut = executor.submit_until_success(_request_execution_wrapper,
+                                                request_id, ignore_return_value)
+            if retryable:
+                # If the task might fail and be retried, start a thread to
+                # monitor the future and process retry.
+                threading.Thread(target=self.handle_task_result,
+                                 args=(fut, request_element),
+                                 daemon=True).start()
 
             logger.info(f'[{self}] Submitted request: {request_id}')
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
@@ -182,6 +189,16 @@
                 f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
+    def handle_task_result(self, fut: concurrent.futures.Future,
+                           request_element: Tuple[str, bool, bool]) -> None:
+        try:
+            fut.result()
+        except exceptions.ExecutionRetryableError as e:
+            time.sleep(e.retry_wait_seconds)
+            # Reschedule the request.
+            queue = _get_queue(self.schedule_type)
+            queue.put(request_element)
+
     def run(self) -> None:
         # Handle the SIGTERM signal to abort the executor process gracefully.
         proc_group = f'{self.schedule_type.value}'
@@ -316,7 +333,9 @@
     func = request_task.entrypoint
     request_body = request_task.request_body
 
-    with log_path.open('w', encoding='utf-8') as f:
+    # Append to the log file instead of overwriting it since there might be
+    # logs from previous retries.
+    with log_path.open('a', encoding='utf-8') as f:
         # Store copies of the original stdout and stderr file descriptors
         original_stdout, original_stderr = _redirect_output(f)
         # Redirect the stdout/stderr before overriding the environment and
@@ -340,6 +359,17 @@
             subprocess_utils.kill_children_processes()
             _restore_output(original_stdout, original_stderr)
             return
+        except exceptions.ExecutionRetryableError as e:
+            logger.error(e)
+            logger.info(e.hint)
+            with api_requests.update_request(request_id) as request_task:
+                assert request_task is not None, request_id
+                # Retried request will undergo rescheduling and a new execution,
+                # clear the pid of the request.
+                request_task.pid = None
+            # Yield control to the scheduler for uniform handling of retries.
+            _restore_output(original_stdout, original_stderr)
+            raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
             api_requests.set_request_failed(request_id, e)
             _restore_output(original_stdout, original_stderr)
@@ -463,17 +493,17 @@ def prepare_request(
     return request
 
 
-def schedule_request(
-        request_id: str,
-        request_name: str,
-        request_body: payloads.RequestBody,
-        func: Callable[P, Any],
-        request_cluster_name: Optional[str] = None,
-        ignore_return_value: bool = False,
-        schedule_type: api_requests.ScheduleType = (
-            api_requests.ScheduleType.LONG),
-        is_skypilot_system: bool = False,
-        precondition: Optional[preconditions.Precondition] = None) -> None:
+def schedule_request(request_id: str,
+                     request_name: str,
+                     request_body: payloads.RequestBody,
+                     func: Callable[P, Any],
+                     request_cluster_name: Optional[str] = None,
+                     ignore_return_value: bool = False,
+                     schedule_type: api_requests.ScheduleType = (
+                         api_requests.ScheduleType.LONG),
+                     is_skypilot_system: bool = False,
+                     precondition: Optional[preconditions.Precondition] = None,
+                     retryable: bool = False) -> None:
     """Enqueue a request to the request queue.
 
     Args:
@@ -498,7 +528,7 @@ def schedule_request(
         request_cluster_name, schedule_type, is_skypilot_system)
 
     def enqueue():
-        input_tuple = (request_id, ignore_return_value)
+        input_tuple = (request_id, ignore_return_value, retryable)
        logger.info(f'Queuing request: {request_id}')
        _get_queue(schedule_type).put(input_tuple)
sky/server/requests/payloads.py CHANGED
@@ -79,6 +79,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
     # server endpoint on the server side. This avoids the warning at
     # server-side.
     config.pop_nested(('api_server',), default_value=None)
+    # Remove the admin policy, as the policy has been applied on the client
+    # side.
+    config.pop_nested(('admin_policy',), default_value=None)
     return config
 
 
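The pop_nested calls above strip keys the client has already consumed before the override config is shipped to the server. The helper below is a standalone illustration of those semantics only; SkyPilot's real pop_nested is a method on its config object.

# Illustrative pop_nested: remove a nested key path from a plain dict.
from typing import Any, Dict, Tuple


def pop_nested(config: Dict[str, Any], keys: Tuple[str, ...],
               default_value: Any = None) -> Any:
    node: Any = config
    for key in keys[:-1]:
        node = node.get(key)
        if not isinstance(node, dict):
            return default_value
    return node.pop(keys[-1], default_value)


overrides = {
    'api_server': {'endpoint': 'http://localhost:46580'},
    'admin_policy': 'mypkg.MyPolicy',  # hypothetical policy entry
    'kubernetes': {'remote_identity': 'SERVICE_ACCOUNT'},
}
pop_nested(overrides, ('api_server',), default_value=None)
pop_nested(overrides, ('admin_policy',), default_value=None)
print(overrides)  # only the keys the server should still act on remain
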
sky/server/requests/process.py CHANGED
@@ -6,6 +6,7 @@ import threading
 import time
 from typing import Callable, Dict, Optional, Tuple
 
+from sky import exceptions
 from sky.utils import atomic
 from sky.utils import subprocess_utils
 
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
 
 
 # Define the worker function outside of the class to avoid pickling self
-def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
-                       args, kwargs):
+def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
+    """The worker function that is used to run the task.
+
+    Args:
+        fn: The function to run.
+        initializer: The initializer function to run before running the task.
+        initargs: The arguments to pass to the initializer function.
+        result_queue: The queue to put the result and exception into.
+        args: The arguments to pass to the function.
+        kwargs: The keyword arguments to pass to the function.
+    """
     try:
         if initializer is not None:
             initializer(*initargs)
-        fn(*args, **kwargs)
+        result = fn(*args, **kwargs)
+        result_queue.put(result)
     except BaseException as e:  # pylint: disable=broad-except
-        return e
+        result_queue.put(e)
 
 
 class DisposableExecutor:
@@ -98,28 +109,52 @@ class DisposableExecutor:
         self._initializer: Optional[Callable] = initializer
         self._initargs: Tuple = initargs
 
-    def _monitor_worker(self, process: multiprocessing.Process) -> None:
+    def _monitor_worker(self, process: multiprocessing.Process,
+                        future: concurrent.futures.Future,
+                        result_queue: multiprocessing.Queue) -> None:
         """Monitor the worker process and cleanup when it's done."""
-        process.join()
-        if process.pid:
-            with self._lock:
-                if process.pid in self.workers:
-                    del self.workers[process.pid]
-
-    # Submit is not compatible with ProcessPoolExecutor because we does not
-    # bother to return a Future. Can be improved if needed.
-    def submit(self, fn, *args, **kwargs) -> bool:
-        """Submit a task for execution."""
+        try:
+            process.join()
+            if not future.cancelled():
+                try:
+                    # Get result from the queue if process completed
+                    if not result_queue.empty():
+                        result = result_queue.get(block=False)
+                        if isinstance(result, BaseException):
+                            future.set_exception(result)
+                        else:
+                            future.set_result(result)
+                    else:
+                        # Process ended but no result
+                        future.set_result(None)
+                except (multiprocessing.TimeoutError, BrokenPipeError,
+                        EOFError) as e:
+                    future.set_exception(e)
+        finally:
+            if process.pid:
+                with self._lock:
+                    if process.pid in self.workers:
+                        del self.workers[process.pid]
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution and return a Future."""
+        future: concurrent.futures.Future = concurrent.futures.Future()
+
         if self._shutdown:
-            return False
+            raise RuntimeError('Cannot submit task after executor is shutdown')
+
         with self._lock:
             if (self.max_workers is not None and
                     len(self.workers) >= self.max_workers):
-                return False
+                raise exceptions.ExecutionPoolFullError(
+                    'Maximum workers reached')
 
+        result_queue: multiprocessing.Queue = multiprocessing.Queue()
         process = multiprocessing.Process(target=_disposable_worker,
                                           args=(fn, self._initializer,
-                                                self._initargs, args, kwargs))
+                                                self._initargs, result_queue,
+                                                args, kwargs))
+        process.daemon = True
         process.start()
 
         with self._lock:
@@ -128,13 +163,13 @@ class DisposableExecutor:
                 raise RuntimeError('Failed to start process')
             self.workers[pid] = process
 
-        # Start monitor thread to cleanup the worker process when it's done.
+        # Start monitor thread to cleanup the worker process when it's done
         monitor_thread = threading.Thread(target=self._monitor_worker,
-                                          args=(process,),
+                                          args=(process, future, result_queue),
                                           daemon=True)
         monitor_thread.start()
 
-        return True
+        return future
 
     def has_idle_workers(self) -> bool:
         """Check if there are any idle workers."""
@@ -173,12 +208,14 @@ class BurstableExecutor:
         self._burst_executor = DisposableExecutor(max_workers=burst_workers,
                                                   **kwargs)
 
-    def submit_until_success(self, fn, *args, **kwargs):
+    def submit_until_success(self, fn, *args,
+                             **kwargs) -> concurrent.futures.Future:
         """Submit a task for execution until success.
 
         Prioritizes submitting to the guaranteed pool. If no idle workers
         are available in the guaranteed pool, it will submit to the burst
-        pool.
+        pool. If the burst pool is full, it will retry the whole process until
+        the task is submitted successfully.
         TODO(aylei): this is coupled with executor.RequestWorker since we
         know the worker is dedicated to request scheduling and it either
         blocks on request polling or request submitting. So it is no harm
@@ -188,17 +225,20 @@ class BurstableExecutor:
 
         while True:
             if self._executor is not None and self._executor.has_idle_workers():
-                self._executor.submit(fn, *args, **kwargs)
-                break
+                logger.info('Submitting to the guaranteed pool')
+                return self._executor.submit(fn, *args, **kwargs)
             if (self._burst_executor is not None and
                     self._burst_executor.has_idle_workers()):
-                self._burst_executor.submit(fn, *args, **kwargs)
-                break
+                try:
+                    fut = self._burst_executor.submit(fn, *args, **kwargs)
+                    return fut
+                except exceptions.ExecutionPoolFullError:
+                    # The burst pool is full, try the next candidate.
+                    pass
             if self._executor is not None:
                 # No idle workers in either pool, still queue the request
                 # to the guaranteed pool to keep behavior consistent.
-                self._executor.submit(fn, *args, **kwargs)
-                break
+                return self._executor.submit(fn, *args, **kwargs)
             logger.debug('No guaranteed pool set and the burst pool is full, '
                          'retry later.')
             time.sleep(0.1)
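
The core of the DisposableExecutor change is that submit() now returns a real concurrent.futures.Future even though the work runs in a separate process: the child reports its result (or exception) through a multiprocessing.Queue, and a monitor thread joins the process and completes the Future. A self-contained sketch of that plumbing, under the simplifying assumption of one process per task:

# Sketch of Future-over-process plumbing; illustrative, not SkyPilot's code.
import concurrent.futures
import multiprocessing
import threading


def _worker(fn, result_queue, args, kwargs):
    try:
        result_queue.put(fn(*args, **kwargs))
    except BaseException as e:  # pylint: disable=broad-except
        result_queue.put(e)  # exceptions travel the same channel as results


def submit(fn, *args, **kwargs) -> concurrent.futures.Future:
    future: concurrent.futures.Future = concurrent.futures.Future()
    result_queue: multiprocessing.Queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=_worker,
                                      args=(fn, result_queue, args, kwargs))
    process.daemon = True
    process.start()

    def monitor():
        process.join()
        if result_queue.empty():
            future.set_result(None)  # process exited without reporting
            return
        result = result_queue.get(block=False)
        if isinstance(result, BaseException):
            future.set_exception(result)
        else:
            future.set_result(result)

    threading.Thread(target=monitor, daemon=True).start()
    return future


if __name__ == '__main__':
    print(submit(pow, 2, 10).result())  # 1024

Returning a Future instead of a bool is what lets the RequestWorker observe ExecutionRetryableError raised in a worker process, and raising ExecutionPoolFullError instead of returning False lets BurstableExecutor fall back to the guaranteed pool.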
sky/server/server.py CHANGED
@@ -693,6 +693,7 @@ async def launch(launch_body: payloads.LaunchBody,
         func=execution.launch,
         schedule_type=requests_lib.ScheduleType.LONG,
         request_cluster_name=launch_body.cluster_name,
+        retryable=launch_body.retry_until_up,
     )
 
 