skypilot-nightly 1.0.0.dev20250609__py3-none-any.whl → 1.0.0.dev20250611__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +134 -5
- sky/authentication.py +1 -7
- sky/backends/cloud_vm_ray_backend.py +9 -20
- sky/benchmark/benchmark_state.py +39 -1
- sky/cli.py +3 -5
- sky/client/cli.py +3 -5
- sky/client/sdk.py +49 -4
- sky/clouds/kubernetes.py +15 -24
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
- sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
- sky/dashboard/out/_next/static/chunks/600.15a0009177e86b86.js +16 -0
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
- sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
- sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-ab185187a63f9cdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
- sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-7bbd9d39d6f9a98a.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-208a9812ab4f61c9.js +1 -0
- sky/dashboard/out/_next/static/css/{8b1c8321d4c02372.css → 5d71bfc09f184bab.css} +1 -1
- sky/dashboard/out/_next/static/zJqasksBQ3HcqMpA2wTUZ/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +18 -0
- sky/global_user_state.py +181 -74
- sky/jobs/client/sdk.py +29 -21
- sky/jobs/scheduler.py +4 -5
- sky/jobs/state.py +104 -11
- sky/jobs/utils.py +5 -5
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/utils.py +106 -7
- sky/serve/client/sdk.py +56 -45
- sky/server/common.py +1 -5
- sky/server/requests/executor.py +50 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/process.py +69 -29
- sky/server/server.py +1 -0
- sky/server/stream_utils.py +111 -55
- sky/skylet/constants.py +1 -2
- sky/skylet/job_lib.py +95 -40
- sky/skypilot_config.py +99 -25
- sky/users/permission.py +34 -17
- sky/utils/admin_policy_utils.py +41 -16
- sky/utils/context.py +21 -1
- sky/utils/controller_utils.py +16 -1
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
- sky/utils/schemas.py +11 -3
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/RECORD +92 -81
- sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
- sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-680c19413b8f808b.js +0 -1
- sky/dashboard/out/_next/static/chunks/63-e2d7b1e75e67c713.js +0 -66
- sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-16c7194621b2b512.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-affc52adf5403a3a.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-aed916d5b02d2d63.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5f16aba5794ee8e7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-d31688d3e52736dd.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e7d8710a9b0491e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3c674e5d970e05cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-3aac7a015c6eede1.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-46d2e4ad6c487260.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-7013d816a2a0e76c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-f7f0c9e156d328bc.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-87e60396c376292f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-9355a0f13d1db61d.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-9a749cca1813bd27.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-8eeb628e03902f1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8fbcc5ab4af316d0.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
- sky/dashboard/out/_next/static/xos0euNCptbGAM7_Q3Acl/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{xos0euNCptbGAM7_Q3Acl → zJqasksBQ3HcqMpA2wTUZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250611.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Kubernetes utilities for SkyPilot."""
 import dataclasses
 import functools
+import hashlib
 import json
 import math
 import os
@@ -1555,11 +1556,11 @@ def is_kubeconfig_exec_auth(
                 == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
             ctx_name = context_obj['name']
             exec_msg = ('exec-based authentication is used for '
-                        f'Kubernetes context {ctx_name!r}.'
-                        '
-                        '
-                        '
-                        'for running pods by setting the following in '
+                        f'Kubernetes context {ctx_name!r}. '
+                        'Make sure that the corresponding cloud provider is '
+                        'also enabled through `sky check` (e.g.: GCP for GKE). '
+                        'Alternatively, configure SkyPilot to create a service '
+                        'account for running pods by setting the following in '
                         '~/.sky/config.yaml:\n'
                         '    kubernetes:\n'
                         '      remote_identity: SERVICE_ACCOUNT\n'
@@ -2877,8 +2878,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
     context = provider_config.get('context',
                                   get_current_kube_config_context_name())
     if context == kubernetes.in_cluster_context_name():
-        # If the context (also used as the region) is in-cluster, we need
-        #
+        # If the context (also used as the region) is in-cluster, we need
+        # to use in-cluster auth by setting the context to None.
         context = None
     return context
 
@@ -3135,3 +3136,101 @@ def get_kubeconfig_paths() -> List[str]:
     for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
         expanded.append(os.path.expanduser(path))
     return expanded
+
+
+def format_kubeconfig_exec_auth(config: Any,
+                                output_path: str,
+                                inject_wrapper: bool = True) -> bool:
+    """Reformat the kubeconfig so that exec-based authentication can be used
+    with SkyPilot. Will create a new kubeconfig file under <output_path>
+    regardless of whether a change has been made.
+
+    kubectl internally strips all environment variables except for system
+    defaults. If `inject_wrapper` is true, a wrapper executable is applied
+    to inject the relevant PATH information before exec-auth is executed.
+
+    Contents of sky-kube-exec-wrapper:
+
+    #!/bin/bash
+    export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk:$PATH"
+    exec "$@"
+
+    refer to `skylet/constants.py` for more information.
+
+    Args:
+        config (dict): kubeconfig parsed by yaml.safe_load
+        output_path (str): Path where the potentially modified kubeconfig file
+            will be saved
+        inject_wrapper (bool): Whether to inject the wrapper script
+    Returns: whether config was updated, for logging purposes
+    """
+    updated = False
+    for user in config.get('users', []):
+        exec_info = user.get('user', {}).get('exec', {})
+        current_command = exec_info.get('command', '')
+
+        if current_command:
+            # Strip the path and keep only the executable name
+            executable = os.path.basename(current_command)
+            if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
+                # we don't want this happening recursively.
+                continue
+
+            if inject_wrapper:
+                exec_info[
+                    'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
+                if exec_info.get('args') is None:
+                    exec_info['args'] = []
+                exec_info['args'].insert(0, executable)
+                updated = True
+            elif executable != current_command:
+                exec_info['command'] = executable
+                updated = True
+
+            # Handle Nebius kubeconfigs: change --profile to 'sky'
+            if executable == 'nebius':
+                args = exec_info.get('args', [])
+                if args and '--profile' in args:
+                    try:
+                        profile_index = args.index('--profile')
+                        if profile_index + 1 < len(args):
+                            old_profile = args[profile_index + 1]
+                            if old_profile != 'sky':
+                                args[profile_index + 1] = 'sky'
+                                updated = True
+                    except ValueError:
+                        pass
+
+    os.makedirs(os.path.dirname(os.path.expanduser(output_path)), exist_ok=True)
+    with open(output_path, 'w', encoding='utf-8') as file:
+        yaml.safe_dump(config, file)
+
+    return updated
+
+
+def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
+    """Reformat the kubeconfig file or retrieve it from cache if it has already
+    been formatted before. Store it in the cache directory if necessary.
+
+    Having a cache for this is good if users spawn an extreme number of jobs
+    concurrently.
+
+    Args:
+        kubeconfig_path (str): kubeconfig path
+    Returns: updated kubeconfig path
+    """
+    # TODO(kyuds): GC cache files
+    with open(kubeconfig_path, 'r', encoding='utf-8') as file:
+        config = yaml.safe_load(file)
+        normalized = yaml.dump(config, sort_keys=True)
+        hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
+    path = os.path.expanduser(
+        f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{hashed}.yaml'
+    )
+
+    # If we have already converted the same kubeconfig before, just return.
+    if os.path.isfile(path):
+        return path
+
+    format_kubeconfig_exec_auth(config, path)
+    return path
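Note: a minimal standalone sketch of the core rewrite performed by `format_kubeconfig_exec_auth` (illustration only; the kubeconfig entry below is hypothetical, and the real function also handles the Nebius `--profile` rewrite and persists the result with `yaml.safe_dump`):

```python
import os

# Wrapper name per the docstring above
# (kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER).
SKY_K8S_EXEC_AUTH_WRAPPER = 'sky-kube-exec-wrapper'

def rewrite_exec_auth(user_entry: dict) -> dict:
    """Replace an exec-auth command with the PATH-injecting wrapper."""
    exec_info = user_entry.get('user', {}).get('exec', {})
    command = exec_info.get('command', '')
    executable = os.path.basename(command)
    if not command or executable == SKY_K8S_EXEC_AUTH_WRAPPER:
        return user_entry  # nothing to wrap, or already wrapped
    exec_info['command'] = SKY_K8S_EXEC_AUTH_WRAPPER
    if exec_info.get('args') is None:
        exec_info['args'] = []
    # The original executable becomes the wrapper's first argument.
    exec_info['args'].insert(0, executable)
    return user_entry

entry = {'user': {'exec': {'command': '/usr/local/bin/gke-gcloud-auth-plugin',
                           'args': None}}}
print(rewrite_exec_auth(entry)['user']['exec'])
# {'command': 'sky-kube-exec-wrapper', 'args': ['gke-gcloud-auth-plugin']}
```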
sky/serve/client/sdk.py
CHANGED
@@ -10,6 +10,8 @@ from sky.client import common as client_common
 from sky.server import common as server_common
 from sky.server.requests import payloads
 from sky.usage import usage_lib
+from sky.utils import admin_policy_utils
+from sky.utils import context
 from sky.utils import dag_utils
 
 if typing.TYPE_CHECKING:
@@ -23,6 +25,7 @@ else:
     requests = adaptors_common.LazyImport('requests')
 
 
+@context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 def up(
@@ -55,30 +58,36 @@ def up(
     from sky.client import sdk  # pylint: disable=import-outside-toplevel
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
-    ... (original lines 58-79 not captured in this view)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            prompt = f'Launching a new service {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+        body = payloads.ServeUpBody(
+            task=dag_str,
+            service_name=service_name,
+        )
+        response = requests.post(
+            f'{server_common.get_server_url()}/serve/up',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None),
+            cookies=server_common.get_api_cookie_jar(),
+        )
+        return server_common.get_request_id(response)
 
 
+@context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 def update(
@@ -112,30 +121,32 @@ def update(
     from sky.client import sdk  # pylint: disable=import-outside-toplevel
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
-    ... (original lines 115-130 not captured in this view)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, at_client_side=True) as dag:
+        sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            click.confirm(f'Updating service {service_name!r}. Proceed?',
+                          default=True,
+                          abort=True,
+                          show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+        body = payloads.ServeUpdateBody(
+            task=dag_str,
+            service_name=service_name,
+            mode=mode,
+        )
 
-    ... (original lines 132-138 not captured in this view)
+        response = requests.post(
+            f'{server_common.get_server_url()}/serve/update',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None),
+            cookies=server_common.get_api_cookie_jar(),
+        )
+        return server_common.get_request_id(response)
 
 
 @usage_lib.entrypoint
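Note: a hypothetical client-side usage sketch of the updated flow (the task and service name are invented, and `sdk.get` is assumed to be the generic blocking fetch for an async request id):

```python
import sky
from sky.client import sdk
from sky.serve.client import sdk as serve_sdk

# A toy service task; a real service would also define ports, replicas, etc.
task = sky.Task(run='python -m http.server 8000')

# With this change, up() applies any client-side admin policy to the DAG
# before validating/optimizing it and posting to /serve/up; the admin_policy
# override is then stripped from the config sent to the server (see the
# payloads.py change below).
request_id = serve_sdk.up(task, service_name='demo-service',
                          _need_confirmation=False)
result = sdk.get(request_id)  # block until the server finishes the request
```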
sky/server/common.py
CHANGED
@@ -420,11 +420,7 @@ def _start_api_server(deploy: bool = False,
         dashboard_msg += (
             'Dashboard may be stale when installed from source, '
             'to rebuild: npm --prefix sky/dashboard install '
-            '&& npm --prefix sky/dashboard run build
-        dashboard_msg += (
-            f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-            f'Dashboard: {get_dashboard_url(server_url)}')
-        dashboard_msg += f'{colorama.Style.RESET_ALL}'
+            '&& npm --prefix sky/dashboard run build')
     logger.info(
         ux_utils.finishing_message(
             f'SkyPilot API server started. {dashboard_msg}'))
sky/server/requests/executor.py
CHANGED
@@ -19,6 +19,7 @@ The number of the workers is determined by the system resources.
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
 import asyncio
+import concurrent.futures
 import contextlib
 import multiprocessing
 import os
@@ -93,21 +94,21 @@ class RequestQueue:
         else:
             raise RuntimeError(f'Invalid queue backend: {backend}')
 
-    def put(self, request: Tuple[str, bool]) -> None:
+    def put(self, request: Tuple[str, bool, bool]) -> None:
         """Put and request to the queue.
 
         Args:
-            request: A tuple of request_id and
+            request: A tuple of request_id, ignore_return_value, and retryable.
         """
         self.queue.put(request)  # type: ignore
 
-    def get(self) -> Optional[Tuple[str, bool]]:
+    def get(self) -> Optional[Tuple[str, bool, bool]]:
         """Get a request from the queue.
 
         It is non-blocking if the queue is empty, and returns None.
 
         Returns:
-            A tuple of request_id and
+            A tuple of request_id, ignore_return_value, and retryable.
         """
         try:
             return self.queue.get(block=False)
@@ -159,7 +160,7 @@ class RequestWorker:
             if request_element is None:
                 time.sleep(0.1)
                 return
-            request_id, ignore_return_value = request_element
+            request_id, ignore_return_value, retryable = request_element
             request = api_requests.get_request(request_id)
             assert request is not None, f'Request with ID {request_id} is None'
             if request.status == api_requests.RequestStatus.CANCELLED:
@@ -171,8 +172,14 @@ class RequestWorker:
             # multiple requests can share the same process pid, which may cause
             # issues with SkyPilot core functions if they rely on the exit of
             # the process, such as subprocess_daemon.py.
-            executor.submit_until_success(_request_execution_wrapper,
-                                          request_id, ignore_return_value)
+            fut = executor.submit_until_success(_request_execution_wrapper,
+                                                request_id, ignore_return_value)
+            if retryable:
+                # If the task might fail and be retried, start a thread to
+                # monitor the future and process retry.
+                threading.Thread(target=self.handle_task_result,
+                                 args=(fut, request_element),
+                                 daemon=True).start()
 
             logger.info(f'[{self}] Submitted request: {request_id}')
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
@@ -182,6 +189,16 @@ class RequestWorker:
                 f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
+    def handle_task_result(self, fut: concurrent.futures.Future,
+                           request_element: Tuple[str, bool, bool]) -> None:
+        try:
+            fut.result()
+        except exceptions.ExecutionRetryableError as e:
+            time.sleep(e.retry_wait_seconds)
+            # Reschedule the request.
+            queue = _get_queue(self.schedule_type)
+            queue.put(request_element)
+
     def run(self) -> None:
         # Handle the SIGTERM signal to abort the executor process gracefully.
         proc_group = f'{self.schedule_type.value}'
@@ -316,7 +333,9 @@ def _request_execution_wrapper(request_id: str,
     func = request_task.entrypoint
     request_body = request_task.request_body
 
-    with log_path.open('w', encoding='utf-8') as f:
+    # Append to the log file instead of overwriting it since there might be
+    # logs from previous retries.
+    with log_path.open('a', encoding='utf-8') as f:
         # Store copies of the original stdout and stderr file descriptors
         original_stdout, original_stderr = _redirect_output(f)
         # Redirect the stdout/stderr before overriding the environment and
@@ -340,6 +359,17 @@ def _request_execution_wrapper(request_id: str,
         subprocess_utils.kill_children_processes()
         _restore_output(original_stdout, original_stderr)
         return
+    except exceptions.ExecutionRetryableError as e:
+        logger.error(e)
+        logger.info(e.hint)
+        with api_requests.update_request(request_id) as request_task:
+            assert request_task is not None, request_id
+            # Retried request will undergo rescheduling and a new execution,
+            # clear the pid of the request.
+            request_task.pid = None
+        # Yield control to the scheduler for uniform handling of retries.
+        _restore_output(original_stdout, original_stderr)
+        raise
     except (Exception, SystemExit) as e:  # pylint: disable=broad-except
         api_requests.set_request_failed(request_id, e)
         _restore_output(original_stdout, original_stderr)
@@ -463,17 +493,17 @@ def prepare_request(
     return request
 
 
-def schedule_request(
-    ... (original lines 467-476 not captured in this view)
+def schedule_request(request_id: str,
+                     request_name: str,
+                     request_body: payloads.RequestBody,
+                     func: Callable[P, Any],
+                     request_cluster_name: Optional[str] = None,
+                     ignore_return_value: bool = False,
+                     schedule_type: api_requests.ScheduleType = (
+                         api_requests.ScheduleType.LONG),
+                     is_skypilot_system: bool = False,
+                     precondition: Optional[preconditions.Precondition] = None,
+                     retryable: bool = False) -> None:
     """Enqueue a request to the request queue.
 
     Args:
@@ -498,7 +528,7 @@ def schedule_request(
         request_cluster_name, schedule_type, is_skypilot_system)
 
     def enqueue():
-        input_tuple = (request_id, ignore_return_value)
+        input_tuple = (request_id, ignore_return_value, retryable)
         logger.info(f'Queuing request: {request_id}')
         _get_queue(schedule_type).put(input_tuple)
 
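Note: the retry path threaded through RequestQueue, RequestWorker, and `schedule_request` can be sketched standalone as follows, with a stand-in `RetryableError` in place of `exceptions.ExecutionRetryableError` and a thread pool in place of the burstable process executor:

```python
import concurrent.futures
import queue
import threading
import time

class RetryableError(Exception):
    """Stand-in for sky.exceptions.ExecutionRetryableError."""

    def __init__(self, msg: str, retry_wait_seconds: float):
        super().__init__(msg)
        self.retry_wait_seconds = retry_wait_seconds

request_queue: queue.Queue = queue.Queue()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)

def handle_task_result(fut: concurrent.futures.Future, element: tuple) -> None:
    # Mirrors RequestWorker.handle_task_result: wait for the execution and
    # re-enqueue the element if it failed with a retryable error.
    try:
        fut.result()
    except RetryableError as e:
        time.sleep(e.retry_wait_seconds)
        request_queue.put(element)  # reschedule the request

def execute(request_id: str) -> None:
    raise RetryableError(f'{request_id}: transient failure', 0.05)

element = ('req-1', False, True)  # (request_id, ignore_return_value, retryable)
request_queue.put(element)

request_id, _, retryable = request_queue.get()
fut = pool.submit(execute, request_id)
if retryable:
    threading.Thread(target=handle_task_result, args=(fut, element),
                     daemon=True).start()

time.sleep(0.5)
print('rescheduled:', not request_queue.empty())  # rescheduled: True
```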
sky/server/requests/payloads.py
CHANGED
@@ -79,6 +79,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
     # server endpoint on the server side. This avoids the warning at
     # server-side.
     config.pop_nested(('api_server',), default_value=None)
+    # Remove the admin policy, as the policy has been applied on the client
+    # side.
+    config.pop_nested(('admin_policy',), default_value=None)
     return config
 
 
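Note: the intended effect on the client's override payload, sketched with a plain-dict stand-in for `pop_nested` (the real code operates on SkyPilot's nested config object, and the policy path below is hypothetical):

```python
from typing import Any, Tuple

def pop_nested(config: dict, keys: Tuple[str, ...], default_value: Any = None):
    """Plain-dict stand-in for config.pop_nested()."""
    node = config
    for key in keys[:-1]:
        node = node.get(key)
        if not isinstance(node, dict):
            return default_value
    return node.pop(keys[-1], default_value)

override = {
    'api_server': {'endpoint': 'http://localhost:46580'},
    'admin_policy': 'mypolicy.ClientPolicy',  # applied client-side already
    'kubernetes': {'remote_identity': 'SERVICE_ACCOUNT'},
}
pop_nested(override, ('api_server',), default_value=None)
pop_nested(override, ('admin_policy',), default_value=None)
print(override)  # {'kubernetes': {'remote_identity': 'SERVICE_ACCOUNT'}}
```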
sky/server/requests/process.py
CHANGED
@@ -6,6 +6,7 @@ import threading
 import time
 from typing import Callable, Dict, Optional, Tuple
 
+from sky import exceptions
 from sky.utils import atomic
 from sky.utils import subprocess_utils
 
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
 
 
 # Define the worker function outside of the class to avoid pickling self
-def _disposable_worker(fn, initializer
-    ... (continuation line not captured in this view)
+def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
+    """The worker function that is used to run the task.
+
+    Args:
+        fn: The function to run.
+        initializer: The initializer function to run before running the task.
+        initargs: The arguments to pass to the initializer function.
+        result_queue: The queue to put the result and exception into.
+        args: The arguments to pass to the function.
+        kwargs: The keyword arguments to pass to the function.
+    """
     try:
         if initializer is not None:
             initializer(*initargs)
-        fn(*args, **kwargs)
+        result = fn(*args, **kwargs)
+        result_queue.put(result)
     except BaseException as e:  # pylint: disable=broad-except
-        ... (original line not captured in this view)
+        result_queue.put(e)
 
 
 class DisposableExecutor:
@@ -98,28 +109,52 @@ class DisposableExecutor:
         self._initializer: Optional[Callable] = initializer
         self._initargs: Tuple = initargs
 
-    def _monitor_worker(self, process: multiprocessing.Process
+    def _monitor_worker(self, process: multiprocessing.Process,
+                        future: concurrent.futures.Future,
+                        result_queue: multiprocessing.Queue) -> None:
         """Monitor the worker process and cleanup when it's done."""
-        ... (original lines 103-112 not captured in this view)
+        try:
+            process.join()
+            if not future.cancelled():
+                try:
+                    # Get result from the queue if process completed
+                    if not result_queue.empty():
+                        result = result_queue.get(block=False)
+                        if isinstance(result, BaseException):
+                            future.set_exception(result)
+                        else:
+                            future.set_result(result)
+                    else:
+                        # Process ended but no result
+                        future.set_result(None)
+                except (multiprocessing.TimeoutError, BrokenPipeError,
+                        EOFError) as e:
+                    future.set_exception(e)
+        finally:
+            if process.pid:
+                with self._lock:
+                    if process.pid in self.workers:
+                        del self.workers[process.pid]
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution and return a Future."""
+        future: concurrent.futures.Future = concurrent.futures.Future()
 
         if self._shutdown:
-            ... (original line not captured in this view)
+            raise RuntimeError('Cannot submit task after executor is shutdown')
+
         with self._lock:
             if (self.max_workers is not None and
                     len(self.workers) >= self.max_workers):
-                ... (original line not captured in this view)
+                raise exceptions.ExecutionPoolFullError(
+                    'Maximum workers reached')
 
+        result_queue: multiprocessing.Queue = multiprocessing.Queue()
         process = multiprocessing.Process(target=_disposable_worker,
                                           args=(fn, self._initializer,
-                                                self._initargs,
+                                                self._initargs, result_queue,
+                                                args, kwargs))
+        process.daemon = True
         process.start()
 
         with self._lock:
@@ -128,13 +163,13 @@ class DisposableExecutor:
                 raise RuntimeError('Failed to start process')
             self.workers[pid] = process
 
-        # Start monitor thread to cleanup the worker process when it's done
+        # Start monitor thread to cleanup the worker process when it's done
         monitor_thread = threading.Thread(target=self._monitor_worker,
-                                          args=(process,),
+                                          args=(process, future, result_queue),
                                           daemon=True)
         monitor_thread.start()
 
-        return
+        return future
 
     def has_idle_workers(self) -> bool:
         """Check if there are any idle workers."""
@@ -173,12 +208,14 @@ class BurstableExecutor:
         self._burst_executor = DisposableExecutor(max_workers=burst_workers,
                                                   **kwargs)
 
-    def submit_until_success(self, fn, *args, **kwargs):
+    def submit_until_success(self, fn, *args,
+                             **kwargs) -> concurrent.futures.Future:
         """Submit a task for execution until success.
 
         Prioritizes submitting to the guaranteed pool. If no idle workers
         are available in the guaranteed pool, it will submit to the burst
-        pool.
+        pool. If the burst pool is full, it will retry the whole process until
+        the task is submitted successfully.
         TODO(aylei): this is coupled with executor.RequestWorker since we
         know the worker is dedicated to request scheduling and it either
         blocks on request polling or request submitting. So it is no harm
@@ -188,17 +225,20 @@ class BurstableExecutor:
 
         while True:
             if self._executor is not None and self._executor.has_idle_workers():
-                ... (original lines 191-192 not captured in this view)
+                logger.info('Submitting to the guaranteed pool')
+                return self._executor.submit(fn, *args, **kwargs)
             if (self._burst_executor is not None and
                     self._burst_executor.has_idle_workers()):
-                ... (original lines 195-196 not captured in this view)
+                try:
+                    fut = self._burst_executor.submit(fn, *args, **kwargs)
+                    return fut
+                except exceptions.ExecutionPoolFullError:
+                    # The burst pool is full, try the next candidate.
+                    pass
             if self._executor is not None:
                 # No idle workers in either pool, still queue the request
                 # to the guaranteed pool to keep behavior consistent.
-                self._executor.submit(fn, *args, **kwargs)
-                break
+                return self._executor.submit(fn, *args, **kwargs)
             logger.debug('No guaranteed pool set and the burst pool is full, '
                          'retry later.')
             time.sleep(0.1)
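Note: the Future plumbing added to DisposableExecutor follows a standard pattern: the child process pushes its result (or exception) onto a multiprocessing.Queue, and a monitor thread joins the process and completes a concurrent.futures.Future. A simplified, self-contained sketch without the worker bookkeeping:

```python
import concurrent.futures
import multiprocessing
import threading

def _worker(fn, result_queue, args, kwargs):
    # Child process: ship the result (or the raised exception) to the parent.
    try:
        result_queue.put(fn(*args, **kwargs))
    except BaseException as e:  # pylint: disable=broad-except
        result_queue.put(e)

def submit(fn, *args, **kwargs) -> concurrent.futures.Future:
    future: concurrent.futures.Future = concurrent.futures.Future()
    result_queue: multiprocessing.Queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=_worker,
                                      args=(fn, result_queue, args, kwargs))
    process.daemon = True
    process.start()

    def monitor():
        # Parent-side monitor thread: resolve the future once the child exits.
        process.join()
        result = result_queue.get() if not result_queue.empty() else None
        if isinstance(result, BaseException):
            future.set_exception(result)
        else:
            future.set_result(result)

    threading.Thread(target=monitor, daemon=True).start()
    return future

if __name__ == '__main__':
    print(submit(pow, 2, 10).result())  # 1024
```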
sky/server/server.py
CHANGED