skypilot-nightly 1.0.0.dev20250609__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +3 -0
  3. sky/authentication.py +1 -7
  4. sky/backends/cloud_vm_ray_backend.py +9 -20
  5. sky/cli.py +2 -4
  6. sky/client/cli.py +2 -4
  7. sky/client/sdk.py +49 -4
  8. sky/clouds/kubernetes.py +15 -24
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
  18. sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
  21. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
  25. sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
  26. sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
  27. sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
  31. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
  47. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  48. sky/dashboard/out/clusters/[cluster].html +1 -1
  49. sky/dashboard/out/clusters.html +1 -1
  50. sky/dashboard/out/config.html +1 -1
  51. sky/dashboard/out/index.html +1 -1
  52. sky/dashboard/out/infra/[context].html +1 -1
  53. sky/dashboard/out/infra.html +1 -1
  54. sky/dashboard/out/jobs/[job].html +1 -1
  55. sky/dashboard/out/jobs.html +1 -1
  56. sky/dashboard/out/users.html +1 -1
  57. sky/dashboard/out/workspace/new.html +1 -1
  58. sky/dashboard/out/workspaces/[name].html +1 -1
  59. sky/dashboard/out/workspaces.html +1 -1
  60. sky/exceptions.py +18 -0
  61. sky/global_user_state.py +181 -74
  62. sky/jobs/client/sdk.py +29 -21
  63. sky/provision/kubernetes/constants.py +9 -0
  64. sky/provision/kubernetes/utils.py +106 -7
  65. sky/serve/client/sdk.py +56 -45
  66. sky/server/common.py +1 -5
  67. sky/server/requests/executor.py +50 -20
  68. sky/server/requests/payloads.py +3 -0
  69. sky/server/requests/process.py +69 -29
  70. sky/server/server.py +1 -0
  71. sky/server/stream_utils.py +111 -55
  72. sky/skylet/constants.py +1 -2
  73. sky/skypilot_config.py +99 -25
  74. sky/users/permission.py +1 -1
  75. sky/utils/admin_policy_utils.py +9 -3
  76. sky/utils/context.py +21 -1
  77. sky/utils/controller_utils.py +16 -1
  78. sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
  79. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
  80. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +85 -74
  81. sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
  84. sky/dashboard/out/_next/static/chunks/470-680c19413b8f808b.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/63-e2d7b1e75e67c713.js +0 -66
  86. sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/843-16c7194621b2b512.js +0 -11
  88. sky/dashboard/out/_next/static/chunks/856-affc52adf5403a3a.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/973-aed916d5b02d2d63.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/pages/_app-5f16aba5794ee8e7.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-d31688d3e52736dd.js +0 -6
  93. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e7d8710a9b0491e5.js +0 -6
  94. sky/dashboard/out/_next/static/chunks/pages/clusters-3c674e5d970e05cb.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/pages/config-3aac7a015c6eede1.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-46d2e4ad6c487260.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/pages/infra-7013d816a2a0e76c.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-f7f0c9e156d328bc.js +0 -16
  99. sky/dashboard/out/_next/static/chunks/pages/jobs-87e60396c376292f.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/pages/users-9355a0f13d1db61d.js +0 -16
  101. sky/dashboard/out/_next/static/chunks/pages/workspace/new-9a749cca1813bd27.js +0 -1
  102. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-8eeb628e03902f1b.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/pages/workspaces-8fbcc5ab4af316d0.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
  105. sky/dashboard/out/_next/static/xos0euNCptbGAM7_Q3Acl/_buildManifest.js +0 -1
  106. /sky/dashboard/out/_next/static/{xos0euNCptbGAM7_Q3Acl → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
  107. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
  108. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
  109. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
  110. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py CHANGED
@@ -14,7 +14,9 @@ from sky.server import common as server_common
14
14
  from sky.server.requests import payloads
15
15
  from sky.skylet import constants
16
16
  from sky.usage import usage_lib
17
+ from sky.utils import admin_policy_utils
17
18
  from sky.utils import common_utils
19
+ from sky.utils import context
18
20
  from sky.utils import dag_utils
19
21
 
20
22
  if typing.TYPE_CHECKING:
@@ -29,6 +31,7 @@ else:
29
31
  logger = sky_logging.init_logger(__name__)
30
32
 
31
33
 
34
+ @context.contextual
32
35
  @usage_lib.entrypoint
33
36
  @server_common.check_server_healthy_or_start
34
37
  def launch(
@@ -65,27 +68,32 @@ def launch(
65
68
  """
66
69
 
67
70
  dag = dag_utils.convert_entrypoint_to_dag(task)
68
- sdk.validate(dag)
69
- if _need_confirmation:
70
- request_id = sdk.optimize(dag)
71
- sdk.stream_and_get(request_id)
72
- prompt = f'Launching a managed job {dag.name!r}. Proceed?'
73
- if prompt is not None:
74
- click.confirm(prompt, default=True, abort=True, show_default=True)
75
-
76
- dag = client_common.upload_mounts_to_api_server(dag)
77
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
78
- body = payloads.JobsLaunchBody(
79
- task=dag_str,
80
- name=name,
81
- )
82
- response = requests.post(
83
- f'{server_common.get_server_url()}/jobs/launch',
84
- json=json.loads(body.model_dump_json()),
85
- timeout=(5, None),
86
- cookies=server_common.get_api_cookie_jar(),
87
- )
88
- return server_common.get_request_id(response)
71
+ with admin_policy_utils.apply_and_use_config_in_current_request(
72
+ dag, at_client_side=True) as dag:
73
+ sdk.validate(dag)
74
+ if _need_confirmation:
75
+ request_id = sdk.optimize(dag)
76
+ sdk.stream_and_get(request_id)
77
+ prompt = f'Launching a managed job {dag.name!r}. Proceed?'
78
+ if prompt is not None:
79
+ click.confirm(prompt,
80
+ default=True,
81
+ abort=True,
82
+ show_default=True)
83
+
84
+ dag = client_common.upload_mounts_to_api_server(dag)
85
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
86
+ body = payloads.JobsLaunchBody(
87
+ task=dag_str,
88
+ name=name,
89
+ )
90
+ response = requests.post(
91
+ f'{server_common.get_server_url()}/jobs/launch',
92
+ json=json.loads(body.model_dump_json()),
93
+ timeout=(5, None),
94
+ cookies=server_common.get_api_cookie_jar(),
95
+ )
96
+ return server_common.get_request_id(response)
89
97
 
90
98
 
91
99
  @usage_lib.entrypoint
sky/provision/kubernetes/constants.py CHANGED
@@ -6,3 +6,12 @@ NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
6
6
  '(e.g., skypilot.co/accelerator) are setup correctly. ')
7
7
 
8
8
  KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'
9
+
10
+ # Name of kubernetes exec auth wrapper script
11
+ SKY_K8S_EXEC_AUTH_WRAPPER = 'sky-kube-exec-wrapper'
12
+
13
+ # PATH envvar for kubectl exec auth execve
14
+ SKY_K8S_EXEC_AUTH_PATH = '$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:$PATH' # pylint: disable=line-too-long
15
+
16
+ # cache directory for kubeconfig with modified exec auth
17
+ SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
sky/provision/kubernetes/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Kubernetes utilities for SkyPilot."""
2
2
  import dataclasses
3
3
  import functools
4
+ import hashlib
4
5
  import json
5
6
  import math
6
7
  import os
@@ -1555,11 +1556,11 @@ def is_kubeconfig_exec_auth(
1555
1556
  == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
1556
1557
  ctx_name = context_obj['name']
1557
1558
  exec_msg = ('exec-based authentication is used for '
1558
- f'Kubernetes context {ctx_name!r}.'
1559
- ' This may cause issues with autodown or when running '
1560
- 'Managed Jobs or SkyServe controller on Kubernetes. '
1561
- 'To fix, configure SkyPilot to create a service account '
1562
- 'for running pods by setting the following in '
1559
+ f'Kubernetes context {ctx_name!r}. '
1560
+ 'Make sure that the corresponding cloud provider is '
1561
+ 'also enabled through `sky check` (e.g.: GCP for GKE). '
1562
+ 'Alternatively, configure SkyPilot to create a service '
1563
+ 'account for running pods by setting the following in '
1563
1564
  '~/.sky/config.yaml:\n'
1564
1565
  ' kubernetes:\n'
1565
1566
  ' remote_identity: SERVICE_ACCOUNT\n'
@@ -2877,8 +2878,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
2877
2878
  context = provider_config.get('context',
2878
2879
  get_current_kube_config_context_name())
2879
2880
  if context == kubernetes.in_cluster_context_name():
2880
- # If the context (also used as the region) is in-cluster, we need to
2881
- # we need to use in-cluster auth by setting the context to None.
2881
+ # If the context (also used as the region) is in-cluster, we need
2882
+ # to use in-cluster auth by setting the context to None.
2882
2883
  context = None
2883
2884
  return context
2884
2885
 
@@ -3135,3 +3136,101 @@ def get_kubeconfig_paths() -> List[str]:
3135
3136
  for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
3136
3137
  expanded.append(os.path.expanduser(path))
3137
3138
  return expanded
3139
+
3140
+
3141
+ def format_kubeconfig_exec_auth(config: Any,
3142
+ output_path: str,
3143
+ inject_wrapper: bool = True) -> bool:
3144
+ """Reformat the kubeconfig so that exec-based authentication can be used
3145
+ with SkyPilot. Will create a new kubeconfig file under <output_path>
3146
+ regardless of whether a change has been made.
3147
+
3148
+ kubectl internally strips all environment variables except for system
3149
+ defaults. If `inject_wrapper` is true, a wrapper executable is applied
3150
+ to inject the relevant PATH information before exec-auth is executed.
3151
+
3152
+ Contents of sky-kube-exec-wrapper:
3153
+
3154
+ #!/bin/bash
3155
+ export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:$PATH"
3156
+ exec "$@"
3157
+
3158
+ refer to `skylet/constants.py` for more information.
3159
+
3160
+ Args:
3161
+ config (dict): kubeconfig parsed by yaml.safe_load
3162
+ output_path (str): Path where the potentially modified kubeconfig file
3163
+ will be saved
3164
+ inject_wrapper (bool): Whether to inject the wrapper script
3165
+ Returns: whether config was updated, for logging purposes
3166
+ """
3167
+ updated = False
3168
+ for user in config.get('users', []):
3169
+ exec_info = user.get('user', {}).get('exec', {})
3170
+ current_command = exec_info.get('command', '')
3171
+
3172
+ if current_command:
3173
+ # Strip the path and keep only the executable name
3174
+ executable = os.path.basename(current_command)
3175
+ if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
3176
+ # we don't want this happening recursively.
3177
+ continue
3178
+
3179
+ if inject_wrapper:
3180
+ exec_info[
3181
+ 'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
3182
+ if exec_info.get('args') is None:
3183
+ exec_info['args'] = []
3184
+ exec_info['args'].insert(0, executable)
3185
+ updated = True
3186
+ elif executable != current_command:
3187
+ exec_info['command'] = executable
3188
+ updated = True
3189
+
3190
+ # Handle Nebius kubeconfigs: change --profile to 'sky'
3191
+ if executable == 'nebius':
3192
+ args = exec_info.get('args', [])
3193
+ if args and '--profile' in args:
3194
+ try:
3195
+ profile_index = args.index('--profile')
3196
+ if profile_index + 1 < len(args):
3197
+ old_profile = args[profile_index + 1]
3198
+ if old_profile != 'sky':
3199
+ args[profile_index + 1] = 'sky'
3200
+ updated = True
3201
+ except ValueError:
3202
+ pass
3203
+
3204
+ os.makedirs(os.path.dirname(os.path.expanduser(output_path)), exist_ok=True)
3205
+ with open(output_path, 'w', encoding='utf-8') as file:
3206
+ yaml.safe_dump(config, file)
3207
+
3208
+ return updated
3209
+
3210
+
3211
+ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
3212
+ """Reformat the kubeconfig file or retrieve it from cache if it has already
3213
+ been formatted before. Store it in the cache directory if necessary.
3214
+
3215
+ Having a cache for this is good if users spawn an extreme number of jobs
3216
+ concurrently.
3217
+
3218
+ Args:
3219
+ kubeconfig_path (str): kubeconfig path
3220
+ Returns: updated kubeconfig path
3221
+ """
3222
+ # TODO(kyuds): GC cache files
3223
+ with open(kubeconfig_path, 'r', encoding='utf-8') as file:
3224
+ config = yaml.safe_load(file)
3225
+ normalized = yaml.dump(config, sort_keys=True)
3226
+ hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
3227
+ path = os.path.expanduser(
3228
+ f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{hashed}.yaml'
3229
+ )
3230
+
3231
+ # If we have already converted the same kubeconfig before, just return.
3232
+ if os.path.isfile(path):
3233
+ return path
3234
+
3235
+ format_kubeconfig_exec_auth(config, path)
3236
+ return path
sky/serve/client/sdk.py CHANGED
@@ -10,6 +10,8 @@ from sky.client import common as client_common
10
10
  from sky.server import common as server_common
11
11
  from sky.server.requests import payloads
12
12
  from sky.usage import usage_lib
13
+ from sky.utils import admin_policy_utils
14
+ from sky.utils import context
13
15
  from sky.utils import dag_utils
14
16
 
15
17
  if typing.TYPE_CHECKING:
@@ -23,6 +25,7 @@ else:
23
25
  requests = adaptors_common.LazyImport('requests')
24
26
 
25
27
 
28
+ @context.contextual
26
29
  @usage_lib.entrypoint
27
30
  @server_common.check_server_healthy_or_start
28
31
  def up(
@@ -55,30 +58,36 @@ def up(
55
58
  from sky.client import sdk # pylint: disable=import-outside-toplevel
56
59
 
57
60
  dag = dag_utils.convert_entrypoint_to_dag(task)
58
- sdk.validate(dag)
59
- request_id = sdk.optimize(dag)
60
- sdk.stream_and_get(request_id)
61
- if _need_confirmation:
62
- prompt = f'Launching a new service {service_name!r}. Proceed?'
63
- if prompt is not None:
64
- click.confirm(prompt, default=True, abort=True, show_default=True)
65
-
66
- dag = client_common.upload_mounts_to_api_server(dag)
67
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
68
-
69
- body = payloads.ServeUpBody(
70
- task=dag_str,
71
- service_name=service_name,
72
- )
73
- response = requests.post(
74
- f'{server_common.get_server_url()}/serve/up',
75
- json=json.loads(body.model_dump_json()),
76
- timeout=(5, None),
77
- cookies=server_common.get_api_cookie_jar(),
78
- )
79
- return server_common.get_request_id(response)
61
+ with admin_policy_utils.apply_and_use_config_in_current_request(
62
+ dag, at_client_side=True) as dag:
63
+ sdk.validate(dag)
64
+ request_id = sdk.optimize(dag)
65
+ sdk.stream_and_get(request_id)
66
+ if _need_confirmation:
67
+ prompt = f'Launching a new service {service_name!r}. Proceed?'
68
+ if prompt is not None:
69
+ click.confirm(prompt,
70
+ default=True,
71
+ abort=True,
72
+ show_default=True)
73
+
74
+ dag = client_common.upload_mounts_to_api_server(dag)
75
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
76
+
77
+ body = payloads.ServeUpBody(
78
+ task=dag_str,
79
+ service_name=service_name,
80
+ )
81
+ response = requests.post(
82
+ f'{server_common.get_server_url()}/serve/up',
83
+ json=json.loads(body.model_dump_json()),
84
+ timeout=(5, None),
85
+ cookies=server_common.get_api_cookie_jar(),
86
+ )
87
+ return server_common.get_request_id(response)
80
88
 
81
89
 
90
+ @context.contextual
82
91
  @usage_lib.entrypoint
83
92
  @server_common.check_server_healthy_or_start
84
93
  def update(
@@ -112,30 +121,32 @@ def update(
112
121
  from sky.client import sdk # pylint: disable=import-outside-toplevel
113
122
 
114
123
  dag = dag_utils.convert_entrypoint_to_dag(task)
115
- sdk.validate(dag)
116
- request_id = sdk.optimize(dag)
117
- sdk.stream_and_get(request_id)
118
- if _need_confirmation:
119
- click.confirm(f'Updating service {service_name!r}. Proceed?',
120
- default=True,
121
- abort=True,
122
- show_default=True)
123
-
124
- dag = client_common.upload_mounts_to_api_server(dag)
125
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
126
- body = payloads.ServeUpdateBody(
127
- task=dag_str,
128
- service_name=service_name,
129
- mode=mode,
130
- )
124
+ with admin_policy_utils.apply_and_use_config_in_current_request(
125
+ dag, at_client_side=True) as dag:
126
+ sdk.validate(dag)
127
+ request_id = sdk.optimize(dag)
128
+ sdk.stream_and_get(request_id)
129
+ if _need_confirmation:
130
+ click.confirm(f'Updating service {service_name!r}. Proceed?',
131
+ default=True,
132
+ abort=True,
133
+ show_default=True)
134
+
135
+ dag = client_common.upload_mounts_to_api_server(dag)
136
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
137
+ body = payloads.ServeUpdateBody(
138
+ task=dag_str,
139
+ service_name=service_name,
140
+ mode=mode,
141
+ )
131
142
 
132
- response = requests.post(
133
- f'{server_common.get_server_url()}/serve/update',
134
- json=json.loads(body.model_dump_json()),
135
- timeout=(5, None),
136
- cookies=server_common.get_api_cookie_jar(),
137
- )
138
- return server_common.get_request_id(response)
143
+ response = requests.post(
144
+ f'{server_common.get_server_url()}/serve/update',
145
+ json=json.loads(body.model_dump_json()),
146
+ timeout=(5, None),
147
+ cookies=server_common.get_api_cookie_jar(),
148
+ )
149
+ return server_common.get_request_id(response)
139
150
 
140
151
 
141
152
  @usage_lib.entrypoint
sky/server/common.py CHANGED
@@ -420,11 +420,7 @@ def _start_api_server(deploy: bool = False,
420
420
  dashboard_msg += (
421
421
  'Dashboard may be stale when installed from source, '
422
422
  'to rebuild: npm --prefix sky/dashboard install '
423
- '&& npm --prefix sky/dashboard run build\n')
424
- dashboard_msg += (
425
- f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
426
- f'Dashboard: {get_dashboard_url(server_url)}')
427
- dashboard_msg += f'{colorama.Style.RESET_ALL}'
423
+ '&& npm --prefix sky/dashboard run build')
428
424
  logger.info(
429
425
  ux_utils.finishing_message(
430
426
  f'SkyPilot API server started. {dashboard_msg}'))
sky/server/requests/executor.py CHANGED
@@ -19,6 +19,7 @@ The number of the workers is determined by the system resources.
19
19
  See the [README.md](../README.md) for detailed architecture of the executor.
20
20
  """
21
21
  import asyncio
22
+ import concurrent.futures
22
23
  import contextlib
23
24
  import multiprocessing
24
25
  import os
@@ -93,21 +94,21 @@ class RequestQueue:
93
94
  else:
94
95
  raise RuntimeError(f'Invalid queue backend: {backend}')
95
96
 
96
- def put(self, request: Tuple[str, bool]) -> None:
97
+ def put(self, request: Tuple[str, bool, bool]) -> None:
97
98
  """Put a request to the queue.
98
99
 
99
100
  Args:
100
- request: A tuple of request_id and ignore_return_value.
101
+ request: A tuple of request_id, ignore_return_value, and retryable.
101
102
  """
102
103
  self.queue.put(request) # type: ignore
103
104
 
104
- def get(self) -> Optional[Tuple[str, bool]]:
105
+ def get(self) -> Optional[Tuple[str, bool, bool]]:
105
106
  """Get a request from the queue.
106
107
 
107
108
  It is non-blocking if the queue is empty, and returns None.
108
109
 
109
110
  Returns:
110
- A tuple of request_id and ignore_return_value.
111
+ A tuple of request_id, ignore_return_value, and retryable.
111
112
  """
112
113
  try:
113
114
  return self.queue.get(block=False)
@@ -159,7 +160,7 @@ class RequestWorker:
159
160
  if request_element is None:
160
161
  time.sleep(0.1)
161
162
  return
162
- request_id, ignore_return_value = request_element
163
+ request_id, ignore_return_value, retryable = request_element
163
164
  request = api_requests.get_request(request_id)
164
165
  assert request is not None, f'Request with ID {request_id} is None'
165
166
  if request.status == api_requests.RequestStatus.CANCELLED:
@@ -171,8 +172,14 @@ class RequestWorker:
171
172
  # multiple requests can share the same process pid, which may cause
172
173
  # issues with SkyPilot core functions if they rely on the exit of
173
174
  # the process, such as subprocess_daemon.py.
174
- executor.submit_until_success(_request_execution_wrapper,
175
- request_id, ignore_return_value)
175
+ fut = executor.submit_until_success(_request_execution_wrapper,
176
+ request_id, ignore_return_value)
177
+ if retryable:
178
+ # If the task might fail and be retried, start a thread to
179
+ # monitor the future and process retry.
180
+ threading.Thread(target=self.handle_task_result,
181
+ args=(fut, request_element),
182
+ daemon=True).start()
176
183
 
177
184
  logger.info(f'[{self}] Submitted request: {request_id}')
178
185
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
@@ -182,6 +189,16 @@ class RequestWorker:
182
189
  f'{request_id if "request_id" in locals() else ""} '
183
190
  f'{common_utils.format_exception(e, use_bracket=True)}')
184
191
 
192
+ def handle_task_result(self, fut: concurrent.futures.Future,
193
+ request_element: Tuple[str, bool, bool]) -> None:
194
+ try:
195
+ fut.result()
196
+ except exceptions.ExecutionRetryableError as e:
197
+ time.sleep(e.retry_wait_seconds)
198
+ # Reschedule the request.
199
+ queue = _get_queue(self.schedule_type)
200
+ queue.put(request_element)
201
+
185
202
  def run(self) -> None:
186
203
  # Handle the SIGTERM signal to abort the executor process gracefully.
187
204
  proc_group = f'{self.schedule_type.value}'
@@ -316,7 +333,9 @@ def _request_execution_wrapper(request_id: str,
316
333
  func = request_task.entrypoint
317
334
  request_body = request_task.request_body
318
335
 
319
- with log_path.open('w', encoding='utf-8') as f:
336
+ # Append to the log file instead of overwriting it since there might be
337
+ # logs from previous retries.
338
+ with log_path.open('a', encoding='utf-8') as f:
320
339
  # Store copies of the original stdout and stderr file descriptors
321
340
  original_stdout, original_stderr = _redirect_output(f)
322
341
  # Redirect the stdout/stderr before overriding the environment and
@@ -340,6 +359,17 @@ def _request_execution_wrapper(request_id: str,
340
359
  subprocess_utils.kill_children_processes()
341
360
  _restore_output(original_stdout, original_stderr)
342
361
  return
362
+ except exceptions.ExecutionRetryableError as e:
363
+ logger.error(e)
364
+ logger.info(e.hint)
365
+ with api_requests.update_request(request_id) as request_task:
366
+ assert request_task is not None, request_id
367
+ # Retried request will undergo rescheduling and a new execution,
368
+ # clear the pid of the request.
369
+ request_task.pid = None
370
+ # Yield control to the scheduler for uniform handling of retries.
371
+ _restore_output(original_stdout, original_stderr)
372
+ raise
343
373
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
344
374
  api_requests.set_request_failed(request_id, e)
345
375
  _restore_output(original_stdout, original_stderr)
@@ -463,17 +493,17 @@ def prepare_request(
463
493
  return request
464
494
 
465
495
 
466
- def schedule_request(
467
- request_id: str,
468
- request_name: str,
469
- request_body: payloads.RequestBody,
470
- func: Callable[P, Any],
471
- request_cluster_name: Optional[str] = None,
472
- ignore_return_value: bool = False,
473
- schedule_type: api_requests.ScheduleType = (
474
- api_requests.ScheduleType.LONG),
475
- is_skypilot_system: bool = False,
476
- precondition: Optional[preconditions.Precondition] = None) -> None:
496
+ def schedule_request(request_id: str,
497
+ request_name: str,
498
+ request_body: payloads.RequestBody,
499
+ func: Callable[P, Any],
500
+ request_cluster_name: Optional[str] = None,
501
+ ignore_return_value: bool = False,
502
+ schedule_type: api_requests.ScheduleType = (
503
+ api_requests.ScheduleType.LONG),
504
+ is_skypilot_system: bool = False,
505
+ precondition: Optional[preconditions.Precondition] = None,
506
+ retryable: bool = False) -> None:
477
507
  """Enqueue a request to the request queue.
478
508
 
479
509
  Args:
@@ -498,7 +528,7 @@ def schedule_request(
498
528
  request_cluster_name, schedule_type, is_skypilot_system)
499
529
 
500
530
  def enqueue():
501
- input_tuple = (request_id, ignore_return_value)
531
+ input_tuple = (request_id, ignore_return_value, retryable)
502
532
  logger.info(f'Queuing request: {request_id}')
503
533
  _get_queue(schedule_type).put(input_tuple)
504
534
 
sky/server/requests/payloads.py CHANGED
@@ -79,6 +79,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
79
79
  # server endpoint on the server side. This avoids the warning at
80
80
  # server-side.
81
81
  config.pop_nested(('api_server',), default_value=None)
82
+ # Remove the admin policy, as the policy has been applied on the client
83
+ # side.
84
+ config.pop_nested(('admin_policy',), default_value=None)
82
85
  return config
83
86
 
84
87