skypilot-nightly 1.0.0.dev20250607__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +3 -0
- sky/authentication.py +1 -7
- sky/backends/backend_utils.py +18 -2
- sky/backends/cloud_vm_ray_backend.py +9 -20
- sky/check.py +4 -3
- sky/cli.py +6 -9
- sky/client/cli.py +6 -9
- sky/client/sdk.py +49 -4
- sky/clouds/kubernetes.py +15 -24
- sky/core.py +3 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
- sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
- sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
- sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
- sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
- sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
- sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
- sky/dashboard/out/_next/static/css/8b1c8321d4c02372.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +23 -0
- sky/global_user_state.py +192 -80
- sky/jobs/client/sdk.py +29 -21
- sky/jobs/server/core.py +9 -1
- sky/jobs/server/server.py +0 -95
- sky/jobs/utils.py +2 -1
- sky/models.py +18 -0
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/utils.py +106 -7
- sky/serve/client/sdk.py +56 -45
- sky/serve/server/core.py +1 -1
- sky/server/common.py +5 -7
- sky/server/constants.py +0 -2
- sky/server/requests/executor.py +60 -22
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/process.py +69 -29
- sky/server/requests/requests.py +4 -3
- sky/server/server.py +23 -5
- sky/server/stream_utils.py +111 -55
- sky/skylet/constants.py +4 -2
- sky/skylet/job_lib.py +2 -1
- sky/skypilot_config.py +108 -25
- sky/users/model.conf +1 -1
- sky/users/permission.py +149 -32
- sky/users/rbac.py +26 -0
- sky/users/server.py +14 -13
- sky/utils/admin_policy_utils.py +9 -3
- sky/utils/common.py +6 -1
- sky/utils/common_utils.py +21 -3
- sky/utils/context.py +21 -1
- sky/utils/controller_utils.py +16 -1
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
- sky/utils/schemas.py +9 -0
- sky/workspaces/core.py +100 -8
- sky/workspaces/server.py +15 -2
- sky/workspaces/utils.py +56 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +106 -94
- sky/dashboard/out/_next/static/1qG0HTmVilJPxQdBk0fX5/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
- sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-ad1e0db3afcbd9c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/614-635a84e87800f99e.js +0 -66
- sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-c296541442d4af88.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-3a32da4b84176f6d.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-6d78a0814682d771.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-cb81dc4d27f4d009.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-18aed9b56247d074.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b919a73aecdfa78f.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-4f6b9dd9abcb33ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-fe375a56342cf609.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-3a18d0eeb5119fe4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-a1a6abeeb58c1051.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1354e28c81eeb686.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-23bfc8bf373423db.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-5800045bd04e69c2.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-e1f9c0c3ff7ac4bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-686590e0ee4b2412.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-76b07aa5da91b0df.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
- sky/dashboard/out/_next/static/css/667d941a2888ce6e.css +0 -3
- /sky/dashboard/out/_next/static/{1qG0HTmVilJPxQdBk0fX5 → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250607.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py
CHANGED
@@ -14,7 +14,9 @@ from sky.server import common as server_common
|
|
14
14
|
from sky.server.requests import payloads
|
15
15
|
from sky.skylet import constants
|
16
16
|
from sky.usage import usage_lib
|
17
|
+
from sky.utils import admin_policy_utils
|
17
18
|
from sky.utils import common_utils
|
19
|
+
from sky.utils import context
|
18
20
|
from sky.utils import dag_utils
|
19
21
|
|
20
22
|
if typing.TYPE_CHECKING:
|
@@ -29,6 +31,7 @@ else:
|
|
29
31
|
logger = sky_logging.init_logger(__name__)
|
30
32
|
|
31
33
|
|
34
|
+
@context.contextual
|
32
35
|
@usage_lib.entrypoint
|
33
36
|
@server_common.check_server_healthy_or_start
|
34
37
|
def launch(
|
@@ -65,27 +68,32 @@ def launch(
|
|
65
68
|
"""
|
66
69
|
|
67
70
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
71
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
72
|
+
dag, at_client_side=True) as dag:
|
73
|
+
sdk.validate(dag)
|
74
|
+
if _need_confirmation:
|
75
|
+
request_id = sdk.optimize(dag)
|
76
|
+
sdk.stream_and_get(request_id)
|
77
|
+
prompt = f'Launching a managed job {dag.name!r}. Proceed?'
|
78
|
+
if prompt is not None:
|
79
|
+
click.confirm(prompt,
|
80
|
+
default=True,
|
81
|
+
abort=True,
|
82
|
+
show_default=True)
|
83
|
+
|
84
|
+
dag = client_common.upload_mounts_to_api_server(dag)
|
85
|
+
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
86
|
+
body = payloads.JobsLaunchBody(
|
87
|
+
task=dag_str,
|
88
|
+
name=name,
|
89
|
+
)
|
90
|
+
response = requests.post(
|
91
|
+
f'{server_common.get_server_url()}/jobs/launch',
|
92
|
+
json=json.loads(body.model_dump_json()),
|
93
|
+
timeout=(5, None),
|
94
|
+
cookies=server_common.get_api_cookie_jar(),
|
95
|
+
)
|
96
|
+
return server_common.get_request_id(response)
|
89
97
|
|
90
98
|
|
91
99
|
@usage_lib.entrypoint
|
sky/jobs/server/core.py
CHANGED
@@ -37,6 +37,7 @@ from sky.utils import status_lib
|
|
37
37
|
from sky.utils import subprocess_utils
|
38
38
|
from sky.utils import timeline
|
39
39
|
from sky.utils import ux_utils
|
40
|
+
from sky.workspaces import core as workspaces_core
|
40
41
|
|
41
42
|
if typing.TYPE_CHECKING:
|
42
43
|
import sky
|
@@ -244,7 +245,7 @@ def launch(
|
|
244
245
|
|
245
246
|
# Launch with the api server's user hash, so that sky status does not
|
246
247
|
# show the owner of the controller as whatever user launched it first.
|
247
|
-
with common.
|
248
|
+
with common.with_server_user():
|
248
249
|
# Always launch the controller in the default workspace.
|
249
250
|
with skypilot_config.local_active_workspace_ctx(
|
250
251
|
skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
|
@@ -455,6 +456,13 @@ def queue(refresh: bool,
|
|
455
456
|
|
456
457
|
jobs = list(filter(user_hash_matches_or_missing, jobs))
|
457
458
|
|
459
|
+
accessible_workspaces = workspaces_core.get_workspaces()
|
460
|
+
jobs = list(
|
461
|
+
filter(
|
462
|
+
lambda job: job.get('workspace', skylet_constants.
|
463
|
+
SKYPILOT_DEFAULT_WORKSPACE) in
|
464
|
+
accessible_workspaces, jobs))
|
465
|
+
|
458
466
|
if skip_finished:
|
459
467
|
# Filter out the finished jobs. If a multi-task job is partially
|
460
468
|
# finished, we will include all its tasks.
|
sky/jobs/server/server.py
CHANGED
@@ -1,12 +1,9 @@
|
|
1
1
|
"""REST API for managed jobs."""
|
2
|
-
import os
|
3
2
|
|
4
3
|
import fastapi
|
5
|
-
import httpx
|
6
4
|
|
7
5
|
from sky import sky_logging
|
8
6
|
from sky.jobs.server import core
|
9
|
-
from sky.jobs.server import dashboard_utils
|
10
7
|
from sky.server import common as server_common
|
11
8
|
from sky.server import stream_utils
|
12
9
|
from sky.server.requests import executor
|
@@ -14,7 +11,6 @@ from sky.server.requests import payloads
|
|
14
11
|
from sky.server.requests import requests as api_requests
|
15
12
|
from sky.skylet import constants
|
16
13
|
from sky.utils import common
|
17
|
-
from sky.utils import common_utils
|
18
14
|
|
19
15
|
logger = sky_logging.init_logger(__name__)
|
20
16
|
|
@@ -110,94 +106,3 @@ async def download_logs(
|
|
110
106
|
if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
111
107
|
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
112
108
|
)
|
113
|
-
|
114
|
-
|
115
|
-
@router.get('/dashboard')
|
116
|
-
async def dashboard(request: fastapi.Request,
|
117
|
-
user_hash: str) -> fastapi.Response:
|
118
|
-
# TODO(cooperc): Support showing only jobs for a specific user.
|
119
|
-
|
120
|
-
# FIX(zhwu/cooperc/eric): Fix log downloading (assumes global
|
121
|
-
# /download_log/xx route)
|
122
|
-
|
123
|
-
# Note: before #4717, each user had their own controller, and thus their own
|
124
|
-
# dashboard. Now, all users share the same controller, so this isn't really
|
125
|
-
# necessary. TODO(cooperc): clean up.
|
126
|
-
|
127
|
-
# TODO: Put this in an executor to avoid blocking the main server thread.
|
128
|
-
# It can take a long time if it needs to check the controller status.
|
129
|
-
|
130
|
-
# Find the port for the dashboard of the user
|
131
|
-
os.environ[constants.USER_ID_ENV_VAR] = user_hash
|
132
|
-
server_common.reload_for_new_request(client_entrypoint=None,
|
133
|
-
client_command=None,
|
134
|
-
using_remote_api_server=False)
|
135
|
-
logger.info(f'Starting dashboard for user hash: {user_hash}')
|
136
|
-
|
137
|
-
with dashboard_utils.get_dashboard_lock_for_user(user_hash):
|
138
|
-
max_retries = 3
|
139
|
-
for attempt in range(max_retries):
|
140
|
-
port, pid = dashboard_utils.get_dashboard_session(user_hash)
|
141
|
-
if port == 0 or attempt > 0:
|
142
|
-
# Let the client know that we are waiting for starting the
|
143
|
-
# dashboard.
|
144
|
-
try:
|
145
|
-
port, pid = core.start_dashboard_forwarding()
|
146
|
-
except Exception as e: # pylint: disable=broad-except
|
147
|
-
# We catch all exceptions to gracefully handle unknown
|
148
|
-
# errors and raise an HTTPException to the client.
|
149
|
-
msg = (
|
150
|
-
'Dashboard failed to start: '
|
151
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
152
|
-
logger.error(msg)
|
153
|
-
raise fastapi.HTTPException(status_code=503, detail=msg)
|
154
|
-
dashboard_utils.add_dashboard_session(user_hash, port, pid)
|
155
|
-
|
156
|
-
# Assuming the dashboard is forwarded to localhost on the API server
|
157
|
-
dashboard_url = f'http://localhost:{port}'
|
158
|
-
try:
|
159
|
-
# Ping the dashboard to check if it's still running
|
160
|
-
async with httpx.AsyncClient() as client:
|
161
|
-
response = await client.request('GET',
|
162
|
-
dashboard_url,
|
163
|
-
timeout=5)
|
164
|
-
if response.is_success:
|
165
|
-
break # Connection successful, proceed with the request
|
166
|
-
# Raise an HTTPException here which will be caught by the
|
167
|
-
# following except block to retry with new connection
|
168
|
-
response.raise_for_status()
|
169
|
-
except Exception as e: # pylint: disable=broad-except
|
170
|
-
# We catch all exceptions to gracefully handle unknown
|
171
|
-
# errors and retry or raise an HTTPException to the client.
|
172
|
-
# Assume an exception indicates that the dashboard connection
|
173
|
-
# is stale - remove it so that a new one is created.
|
174
|
-
dashboard_utils.remove_dashboard_session(user_hash)
|
175
|
-
msg = (
|
176
|
-
f'Dashboard connection attempt {attempt + 1} failed with '
|
177
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
178
|
-
logger.info(msg)
|
179
|
-
if attempt == max_retries - 1:
|
180
|
-
raise fastapi.HTTPException(status_code=503, detail=msg)
|
181
|
-
|
182
|
-
# Create a client session to forward the request
|
183
|
-
try:
|
184
|
-
async with httpx.AsyncClient() as client:
|
185
|
-
# Make the request and get the response
|
186
|
-
response = await client.request(
|
187
|
-
method='GET',
|
188
|
-
url=f'{dashboard_url}',
|
189
|
-
headers=request.headers.raw,
|
190
|
-
)
|
191
|
-
|
192
|
-
# Create a new response with the content already read
|
193
|
-
content = await response.aread()
|
194
|
-
return fastapi.Response(
|
195
|
-
content=content,
|
196
|
-
status_code=response.status_code,
|
197
|
-
headers=dict(response.headers),
|
198
|
-
media_type=response.headers.get('content-type'))
|
199
|
-
except Exception as e:
|
200
|
-
msg = (f'Failed to forward request to dashboard: '
|
201
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
202
|
-
logger.error(msg)
|
203
|
-
raise fastapi.HTTPException(status_code=502, detail=msg)
|
sky/jobs/utils.py
CHANGED
@@ -1025,7 +1025,8 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
|
|
1025
1025
|
if 'user_hash' in job and job['user_hash'] is not None:
|
1026
1026
|
# Skip jobs that do not have user_hash info.
|
1027
1027
|
# TODO(cooperc): Remove check before 0.12.0.
|
1028
|
-
|
1028
|
+
user = global_user_state.get_user(job['user_hash'])
|
1029
|
+
job['user_name'] = user.name if user is not None else None
|
1029
1030
|
return jobs
|
1030
1031
|
|
1031
1032
|
|
sky/models.py
CHANGED
@@ -2,8 +2,13 @@
|
|
2
2
|
|
3
3
|
import collections
|
4
4
|
import dataclasses
|
5
|
+
import getpass
|
6
|
+
import os
|
5
7
|
from typing import Any, Dict, Optional
|
6
8
|
|
9
|
+
from sky.skylet import constants
|
10
|
+
from sky.utils import common_utils
|
11
|
+
|
7
12
|
|
8
13
|
@dataclasses.dataclass
|
9
14
|
class User:
|
@@ -16,6 +21,19 @@ class User:
|
|
16
21
|
def to_dict(self) -> Dict[str, Any]:
|
17
22
|
return {'id': self.id, 'name': self.name}
|
18
23
|
|
24
|
+
def to_env_vars(self) -> Dict[str, Any]:
|
25
|
+
return {
|
26
|
+
constants.USER_ID_ENV_VAR: self.id,
|
27
|
+
constants.USER_ENV_VAR: self.name,
|
28
|
+
}
|
29
|
+
|
30
|
+
@classmethod
|
31
|
+
def get_current_user(cls) -> 'User':
|
32
|
+
"""Returns the current user."""
|
33
|
+
user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
|
34
|
+
user_hash = common_utils.get_user_hash()
|
35
|
+
return User(id=user_hash, name=user_name)
|
36
|
+
|
19
37
|
|
20
38
|
RealtimeGpuAvailability = collections.namedtuple(
|
21
39
|
'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
|
@@ -6,3 +6,12 @@ NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
|
|
6
6
|
'(e.g., skypilot.co/accelerator) are setup correctly. ')
|
7
7
|
|
8
8
|
KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'
|
9
|
+
|
10
|
+
# Name of kubernetes exec auth wrapper script
|
11
|
+
SKY_K8S_EXEC_AUTH_WRAPPER = 'sky-kube-exec-wrapper'
|
12
|
+
|
13
|
+
# PATH envvar for kubectl exec auth execve
|
14
|
+
SKY_K8S_EXEC_AUTH_PATH = '$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:$PATH' # pylint: disable=line-too-long
|
15
|
+
|
16
|
+
# cache directory for kubeconfig with modified exec auth
|
17
|
+
SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
2
2
|
import dataclasses
|
3
3
|
import functools
|
4
|
+
import hashlib
|
4
5
|
import json
|
5
6
|
import math
|
6
7
|
import os
|
@@ -1555,11 +1556,11 @@ def is_kubeconfig_exec_auth(
|
|
1555
1556
|
== schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
|
1556
1557
|
ctx_name = context_obj['name']
|
1557
1558
|
exec_msg = ('exec-based authentication is used for '
|
1558
|
-
f'Kubernetes context {ctx_name!r}.'
|
1559
|
-
'
|
1560
|
-
'
|
1561
|
-
'
|
1562
|
-
'for running pods by setting the following in '
|
1559
|
+
f'Kubernetes context {ctx_name!r}. '
|
1560
|
+
'Make sure that the corresponding cloud provider is '
|
1561
|
+
'also enabled through `sky check` (e.g.: GCP for GKE). '
|
1562
|
+
'Alternatively, configure SkyPilot to create a service '
|
1563
|
+
'account for running pods by setting the following in '
|
1563
1564
|
'~/.sky/config.yaml:\n'
|
1564
1565
|
' kubernetes:\n'
|
1565
1566
|
' remote_identity: SERVICE_ACCOUNT\n'
|
@@ -2877,8 +2878,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
|
|
2877
2878
|
context = provider_config.get('context',
|
2878
2879
|
get_current_kube_config_context_name())
|
2879
2880
|
if context == kubernetes.in_cluster_context_name():
|
2880
|
-
# If the context (also used as the region) is in-cluster, we need
|
2881
|
-
#
|
2881
|
+
# If the context (also used as the region) is in-cluster, we need
|
2882
|
+
# to use in-cluster auth by setting the context to None.
|
2882
2883
|
context = None
|
2883
2884
|
return context
|
2884
2885
|
|
@@ -3135,3 +3136,101 @@ def get_kubeconfig_paths() -> List[str]:
|
|
3135
3136
|
for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
|
3136
3137
|
expanded.append(os.path.expanduser(path))
|
3137
3138
|
return expanded
|
3139
|
+
|
3140
|
+
|
3141
|
+
def format_kubeconfig_exec_auth(config: Any,
|
3142
|
+
output_path: str,
|
3143
|
+
inject_wrapper: bool = True) -> bool:
|
3144
|
+
"""Reformat the kubeconfig so that exec-based authentication can be used
|
3145
|
+
with SkyPilot. Will create a new kubeconfig file under <output_path>
|
3146
|
+
regardless of whether a change has been made.
|
3147
|
+
|
3148
|
+
kubectl internally strips all environment variables except for system
|
3149
|
+
defaults. If `inject_wrapper` is true, a wrapper executable is applied
|
3150
|
+
to inject the relevant PATH information before exec-auth is executed.
|
3151
|
+
|
3152
|
+
Contents of sky-kube-exec-wrapper:
|
3153
|
+
|
3154
|
+
#!/bin/bash
|
3155
|
+
export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk:$PATH"
|
3156
|
+
exec "$@"
|
3157
|
+
|
3158
|
+
refer to `skylet/constants.py` for more information.
|
3159
|
+
|
3160
|
+
Args:
|
3161
|
+
config (dict): kubeconfig parsed by yaml.safe_load
|
3162
|
+
output_path (str): Path where the potentially modified kubeconfig file
|
3163
|
+
will be saved
|
3164
|
+
inject_wrapper (bool): Whether to inject the wrapper script
|
3165
|
+
Returns: whether config was updated, for logging purposes
|
3166
|
+
"""
|
3167
|
+
updated = False
|
3168
|
+
for user in config.get('users', []):
|
3169
|
+
exec_info = user.get('user', {}).get('exec', {})
|
3170
|
+
current_command = exec_info.get('command', '')
|
3171
|
+
|
3172
|
+
if current_command:
|
3173
|
+
# Strip the path and keep only the executable name
|
3174
|
+
executable = os.path.basename(current_command)
|
3175
|
+
if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
|
3176
|
+
# we don't want this happening recursively.
|
3177
|
+
continue
|
3178
|
+
|
3179
|
+
if inject_wrapper:
|
3180
|
+
exec_info[
|
3181
|
+
'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
|
3182
|
+
if exec_info.get('args') is None:
|
3183
|
+
exec_info['args'] = []
|
3184
|
+
exec_info['args'].insert(0, executable)
|
3185
|
+
updated = True
|
3186
|
+
elif executable != current_command:
|
3187
|
+
exec_info['command'] = executable
|
3188
|
+
updated = True
|
3189
|
+
|
3190
|
+
# Handle Nebius kubeconfigs: change --profile to 'sky'
|
3191
|
+
if executable == 'nebius':
|
3192
|
+
args = exec_info.get('args', [])
|
3193
|
+
if args and '--profile' in args:
|
3194
|
+
try:
|
3195
|
+
profile_index = args.index('--profile')
|
3196
|
+
if profile_index + 1 < len(args):
|
3197
|
+
old_profile = args[profile_index + 1]
|
3198
|
+
if old_profile != 'sky':
|
3199
|
+
args[profile_index + 1] = 'sky'
|
3200
|
+
updated = True
|
3201
|
+
except ValueError:
|
3202
|
+
pass
|
3203
|
+
|
3204
|
+
os.makedirs(os.path.dirname(os.path.expanduser(output_path)), exist_ok=True)
|
3205
|
+
with open(output_path, 'w', encoding='utf-8') as file:
|
3206
|
+
yaml.safe_dump(config, file)
|
3207
|
+
|
3208
|
+
return updated
|
3209
|
+
|
3210
|
+
|
3211
|
+
def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
|
3212
|
+
"""Reformat the kubeconfig file or retrieve it from cache if it has already
|
3213
|
+
been formatted before. Store it in the cache directory if necessary.
|
3214
|
+
|
3215
|
+
Having a cache for this is good if users spawn an extreme number of jobs
|
3216
|
+
concurrently.
|
3217
|
+
|
3218
|
+
Args:
|
3219
|
+
kubeconfig_path (str): kubeconfig path
|
3220
|
+
Returns: updated kubeconfig path
|
3221
|
+
"""
|
3222
|
+
# TODO(kyuds): GC cache files
|
3223
|
+
with open(kubeconfig_path, 'r', encoding='utf-8') as file:
|
3224
|
+
config = yaml.safe_load(file)
|
3225
|
+
normalized = yaml.dump(config, sort_keys=True)
|
3226
|
+
hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
|
3227
|
+
path = os.path.expanduser(
|
3228
|
+
f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{hashed}.yaml'
|
3229
|
+
)
|
3230
|
+
|
3231
|
+
# If we have already converted the same kubeconfig before, just return.
|
3232
|
+
if os.path.isfile(path):
|
3233
|
+
return path
|
3234
|
+
|
3235
|
+
format_kubeconfig_exec_auth(config, path)
|
3236
|
+
return path
|
sky/serve/client/sdk.py
CHANGED
@@ -10,6 +10,8 @@ from sky.client import common as client_common
|
|
10
10
|
from sky.server import common as server_common
|
11
11
|
from sky.server.requests import payloads
|
12
12
|
from sky.usage import usage_lib
|
13
|
+
from sky.utils import admin_policy_utils
|
14
|
+
from sky.utils import context
|
13
15
|
from sky.utils import dag_utils
|
14
16
|
|
15
17
|
if typing.TYPE_CHECKING:
|
@@ -23,6 +25,7 @@ else:
|
|
23
25
|
requests = adaptors_common.LazyImport('requests')
|
24
26
|
|
25
27
|
|
28
|
+
@context.contextual
|
26
29
|
@usage_lib.entrypoint
|
27
30
|
@server_common.check_server_healthy_or_start
|
28
31
|
def up(
|
@@ -55,30 +58,36 @@ def up(
|
|
55
58
|
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
56
59
|
|
57
60
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
if
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
61
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
62
|
+
dag, at_client_side=True) as dag:
|
63
|
+
sdk.validate(dag)
|
64
|
+
request_id = sdk.optimize(dag)
|
65
|
+
sdk.stream_and_get(request_id)
|
66
|
+
if _need_confirmation:
|
67
|
+
prompt = f'Launching a new service {service_name!r}. Proceed?'
|
68
|
+
if prompt is not None:
|
69
|
+
click.confirm(prompt,
|
70
|
+
default=True,
|
71
|
+
abort=True,
|
72
|
+
show_default=True)
|
73
|
+
|
74
|
+
dag = client_common.upload_mounts_to_api_server(dag)
|
75
|
+
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
76
|
+
|
77
|
+
body = payloads.ServeUpBody(
|
78
|
+
task=dag_str,
|
79
|
+
service_name=service_name,
|
80
|
+
)
|
81
|
+
response = requests.post(
|
82
|
+
f'{server_common.get_server_url()}/serve/up',
|
83
|
+
json=json.loads(body.model_dump_json()),
|
84
|
+
timeout=(5, None),
|
85
|
+
cookies=server_common.get_api_cookie_jar(),
|
86
|
+
)
|
87
|
+
return server_common.get_request_id(response)
|
80
88
|
|
81
89
|
|
90
|
+
@context.contextual
|
82
91
|
@usage_lib.entrypoint
|
83
92
|
@server_common.check_server_healthy_or_start
|
84
93
|
def update(
|
@@ -112,30 +121,32 @@ def update(
|
|
112
121
|
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
113
122
|
|
114
123
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
124
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
125
|
+
dag, at_client_side=True) as dag:
|
126
|
+
sdk.validate(dag)
|
127
|
+
request_id = sdk.optimize(dag)
|
128
|
+
sdk.stream_and_get(request_id)
|
129
|
+
if _need_confirmation:
|
130
|
+
click.confirm(f'Updating service {service_name!r}. Proceed?',
|
131
|
+
default=True,
|
132
|
+
abort=True,
|
133
|
+
show_default=True)
|
134
|
+
|
135
|
+
dag = client_common.upload_mounts_to_api_server(dag)
|
136
|
+
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
137
|
+
body = payloads.ServeUpdateBody(
|
138
|
+
task=dag_str,
|
139
|
+
service_name=service_name,
|
140
|
+
mode=mode,
|
141
|
+
)
|
131
142
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
143
|
+
response = requests.post(
|
144
|
+
f'{server_common.get_server_url()}/serve/update',
|
145
|
+
json=json.loads(body.model_dump_json()),
|
146
|
+
timeout=(5, None),
|
147
|
+
cookies=server_common.get_api_cookie_jar(),
|
148
|
+
)
|
149
|
+
return server_common.get_request_id(response)
|
139
150
|
|
140
151
|
|
141
152
|
@usage_lib.entrypoint
|
sky/serve/server/core.py
CHANGED
@@ -221,7 +221,7 @@ def up(
|
|
221
221
|
# for the first time; otherwise it is a name conflict.
|
222
222
|
# Since the controller may be shared among multiple users, launch the
|
223
223
|
# controller with the API server's user hash.
|
224
|
-
with common.
|
224
|
+
with common.with_server_user():
|
225
225
|
with skypilot_config.local_active_workspace_ctx(
|
226
226
|
constants.SKYPILOT_DEFAULT_WORKSPACE):
|
227
227
|
controller_job_id, controller_handle = execution.launch(
|
sky/server/common.py
CHANGED
@@ -39,6 +39,7 @@ if typing.TYPE_CHECKING:
|
|
39
39
|
import requests
|
40
40
|
|
41
41
|
from sky import dag as dag_lib
|
42
|
+
from sky import models
|
42
43
|
else:
|
43
44
|
pydantic = adaptors_common.LazyImport('pydantic')
|
44
45
|
requests = adaptors_common.LazyImport('requests')
|
@@ -419,11 +420,7 @@ def _start_api_server(deploy: bool = False,
|
|
419
420
|
dashboard_msg += (
|
420
421
|
'Dashboard may be stale when installed from source, '
|
421
422
|
'to rebuild: npm --prefix sky/dashboard install '
|
422
|
-
'&& npm --prefix sky/dashboard run build
|
423
|
-
dashboard_msg += (
|
424
|
-
f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
|
425
|
-
f'Dashboard: {get_dashboard_url(server_url)}')
|
426
|
-
dashboard_msg += f'{colorama.Style.RESET_ALL}'
|
423
|
+
'&& npm --prefix sky/dashboard run build')
|
427
424
|
logger.info(
|
428
425
|
ux_utils.finishing_message(
|
429
426
|
f'SkyPilot API server started. {dashboard_msg}'))
|
@@ -710,7 +707,7 @@ def request_body_to_params(body: 'pydantic.BaseModel') -> Dict[str, Any]:
|
|
710
707
|
|
711
708
|
def reload_for_new_request(client_entrypoint: Optional[str],
|
712
709
|
client_command: Optional[str],
|
713
|
-
using_remote_api_server: bool):
|
710
|
+
using_remote_api_server: bool, user: 'models.User'):
|
714
711
|
"""Reload modules, global variables, and usage message for a new request."""
|
715
712
|
# This should be called first to make sure the logger is up-to-date.
|
716
713
|
sky_logging.reload_logger()
|
@@ -719,10 +716,11 @@ def reload_for_new_request(client_entrypoint: Optional[str],
|
|
719
716
|
skypilot_config.safe_reload_config()
|
720
717
|
|
721
718
|
# Reset the client entrypoint and command for the usage message.
|
722
|
-
common_utils.
|
719
|
+
common_utils.set_request_context(
|
723
720
|
client_entrypoint=client_entrypoint,
|
724
721
|
client_command=client_command,
|
725
722
|
using_remote_api_server=using_remote_api_server,
|
723
|
+
user=user,
|
726
724
|
)
|
727
725
|
|
728
726
|
# Clear cache should be called before reload_logger and usage reset,
|
sky/server/constants.py
CHANGED
@@ -11,8 +11,6 @@ API_VERSION = '9'
|
|
11
11
|
|
12
12
|
# Prefix for API request names.
|
13
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
14
|
-
# The user ID of the SkyPilot system.
|
15
|
-
SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
|
16
14
|
# The memory (GB) that SkyPilot tries to not use to prevent OOM.
|
17
15
|
MIN_AVAIL_MEM_GB = 2
|
18
16
|
# Default encoder/decoder handler name.
|