skypilot-nightly 1.0.0.dev20250702__py3-none-any.whl → 1.0.0.dev20250704__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +24 -24
- sky/catalog/data_fetchers/fetch_cudo.py +37 -37
- sky/client/sdk.py +4 -6
- sky/clouds/aws.py +1 -1
- sky/clouds/cudo.py +1 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/jobs/client/sdk.py +3 -1
- sky/jobs/server/core.py +16 -11
- sky/metrics/__init__.py +0 -0
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +71 -70
- sky/server/common.py +59 -94
- sky/server/constants.py +25 -4
- sky/server/requests/payloads.py +55 -10
- sky/server/requests/requests.py +6 -28
- sky/server/rest.py +15 -4
- sky/server/server.py +51 -7
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +0 -1
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/RECORD +43 -41
- /sky/dashboard/out/_next/static/{N5IdFnjR1RaPGBAVYeTIr → 6TieQqyqsJiaJC33q0FfI}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{N5IdFnjR1RaPGBAVYeTIr → 6TieQqyqsJiaJC33q0FfI}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/top_level.txt +0 -0
sky/provision/cudo/cudo_utils.py
CHANGED
@@ -1,22 +1,28 @@
|
|
1
1
|
"""Cudo catalog helper."""
|
2
2
|
|
3
3
|
cudo_gpu_model = {
|
4
|
-
'
|
5
|
-
'
|
6
|
-
'
|
7
|
-
'
|
8
|
-
'
|
4
|
+
'H100 NVL': 'H100',
|
5
|
+
'H100 SXM': 'H100-SXM',
|
6
|
+
'L40S (compute mode)': 'L40S',
|
7
|
+
'L40S (graphics mode)': 'L40S',
|
8
|
+
'A40 (compute mode)': 'A40',
|
9
|
+
'A40 (graphics mode)': 'A40',
|
9
10
|
'RTX A5000': 'RTXA5000',
|
10
11
|
'RTX A6000': 'RTXA6000',
|
12
|
+
'A100 80GB PCIe': 'A100',
|
13
|
+
'A800 PCIe': 'A800',
|
14
|
+
'V100': 'V100',
|
11
15
|
}
|
12
16
|
|
13
17
|
cudo_gpu_mem = {
|
14
|
-
'
|
18
|
+
'H100': 94,
|
19
|
+
'H100-SXM': 80,
|
20
|
+
'L40S': 48,
|
15
21
|
'A40': 48,
|
16
|
-
'RTXA4000': 16,
|
17
|
-
'RTXA4500': 20,
|
18
22
|
'RTXA5000': 24,
|
19
23
|
'RTXA6000': 48,
|
24
|
+
'A100': 80,
|
25
|
+
'A800': 80,
|
20
26
|
'V100': 16,
|
21
27
|
}
|
22
28
|
|
@@ -28,12 +28,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
|
|
28
28
|
size_gib=disk_size),
|
29
29
|
metadata=tags)
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
except cudo.cudo.rest.ApiException as e:
|
36
|
-
raise e
|
31
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
32
|
+
vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
|
33
|
+
|
34
|
+
return vm.to_dict()['id']
|
37
35
|
|
38
36
|
|
39
37
|
def remove(instance_id: str):
|
@@ -54,11 +52,8 @@ def remove(instance_id: str):
|
|
54
52
|
state = 'unknown'
|
55
53
|
project_id = cudo.cudo.cudo_api.project_id_throwable()
|
56
54
|
while retry_count < max_retries:
|
57
|
-
|
58
|
-
|
59
|
-
state = vm.to_dict()['vm']['short_state']
|
60
|
-
except cudo.cudo.rest.ApiException as e:
|
61
|
-
raise e
|
55
|
+
vm = api.get_vm(project_id, instance_id)
|
56
|
+
state = vm.to_dict()['vm']['short_state']
|
62
57
|
|
63
58
|
if state in terminate_ok:
|
64
59
|
break
|
@@ -69,76 +64,82 @@ def remove(instance_id: str):
|
|
69
64
|
'Timeout error, could not terminate due to VM state: {}'.format(
|
70
65
|
state))
|
71
66
|
|
72
|
-
|
73
|
-
api.terminate_vm(project_id, instance_id)
|
74
|
-
except cudo.cudo.rest.ApiException as e:
|
75
|
-
raise e
|
67
|
+
api.terminate_vm(project_id, instance_id)
|
76
68
|
|
77
69
|
|
78
70
|
def set_tags(instance_id: str, tags: Dict):
|
79
71
|
"""Sets the tags for the given instance."""
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
merge=True)) # TODO (skypilot team) merge or overwrite?
|
87
|
-
except cudo.cudo.rest.ApiException as e:
|
88
|
-
raise e
|
72
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
73
|
+
api.update_vm_metadata(
|
74
|
+
cudo.cudo.cudo_api.project_id(), instance_id,
|
75
|
+
cudo.cudo.UpdateVMMetadataBody(
|
76
|
+
metadata=tags,
|
77
|
+
merge=True)) # TODO (skypilot team) merge or overwrite?
|
89
78
|
|
90
79
|
|
91
80
|
def get_instance(vm_id):
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
return vm_dict
|
97
|
-
except cudo.cudo.rest.ApiException as e:
|
98
|
-
raise e
|
81
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
82
|
+
vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
|
83
|
+
vm_dict = vm.to_dict()
|
84
|
+
return vm_dict
|
99
85
|
|
100
86
|
|
101
87
|
def list_instances():
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
return instances
|
122
|
-
except cudo.cudo.rest.ApiException as e:
|
123
|
-
raise e
|
88
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
89
|
+
vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
|
90
|
+
instances = {}
|
91
|
+
for vm in vms.to_dict()['vms']:
|
92
|
+
ex_ip = vm['external_ip_address']
|
93
|
+
in_ip = vm['internal_ip_address']
|
94
|
+
if not in_ip:
|
95
|
+
in_ip = ex_ip
|
96
|
+
instance = {
|
97
|
+
# active_state, init_state, lcm_state, short_state
|
98
|
+
'status': vm['short_state'],
|
99
|
+
'tags': vm['metadata'],
|
100
|
+
'name': vm['id'],
|
101
|
+
'ip': ex_ip,
|
102
|
+
'external_ip': ex_ip,
|
103
|
+
'internal_ip': in_ip
|
104
|
+
}
|
105
|
+
instances[vm['id']] = instance
|
106
|
+
return instances
|
124
107
|
|
125
108
|
|
126
109
|
def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
|
127
110
|
cpus):
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
111
|
+
gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
|
112
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
113
|
+
types = api.list_vm_machine_types2()
|
114
|
+
types_dict = types.to_dict()
|
115
|
+
machine_types = types_dict['machine_types']
|
116
|
+
|
117
|
+
# Filter machine types based on requirements
|
118
|
+
matching_types = []
|
119
|
+
for machine_type in machine_types:
|
120
|
+
# Check if this machine type matches our requirements
|
121
|
+
if (machine_type['data_center_id'] == data_center_id and
|
122
|
+
machine_type['gpu_model'] == gpu_model and
|
123
|
+
machine_type['min_vcpu'] <= cpus <= machine_type.get(
|
124
|
+
'max_vcpu_free', float('inf')) and
|
125
|
+
machine_type['min_memory_gib'] <= mem <= machine_type.get(
|
126
|
+
'max_memory_gib_free', float('inf'))):
|
127
|
+
|
128
|
+
# Calculate available VMs based on resource constraints
|
129
|
+
max_vms_by_vcpu = machine_type[
|
130
|
+
'total_vcpu_free'] // cpus if cpus > 0 else float('inf')
|
131
|
+
max_vms_by_memory = machine_type[
|
132
|
+
'total_memory_gib_free'] // mem if mem > 0 else float('inf')
|
133
|
+
max_vms_by_gpu = machine_type[
|
134
|
+
'total_gpu_free'] // gpu_count if gpu_count > 0 else float(
|
135
|
+
'inf')
|
136
|
+
|
137
|
+
available_vms = min(max_vms_by_vcpu, max_vms_by_memory,
|
138
|
+
max_vms_by_gpu)
|
139
|
+
matching_types.append(available_vms)
|
140
|
+
|
141
|
+
total_count = sum(matching_types)
|
142
|
+
if total_count < to_start_count:
|
143
|
+
raise Exception(
|
144
|
+
'Too many VMs requested, try another gpu type or region')
|
145
|
+
return total_count
|
sky/server/common.py
CHANGED
@@ -22,7 +22,6 @@ import uuid
|
|
22
22
|
import colorama
|
23
23
|
import filelock
|
24
24
|
|
25
|
-
import sky
|
26
25
|
from sky import exceptions
|
27
26
|
from sky import sky_logging
|
28
27
|
from sky import skypilot_config
|
@@ -31,6 +30,7 @@ from sky.client import service_account_auth
|
|
31
30
|
from sky.data import data_utils
|
32
31
|
from sky.server import constants as server_constants
|
33
32
|
from sky.server import rest
|
33
|
+
from sky.server import versions
|
34
34
|
from sky.skylet import constants
|
35
35
|
from sky.usage import usage_lib
|
36
36
|
from sky.utils import annotations
|
@@ -66,34 +66,11 @@ RETRY_COUNT_ON_TIMEOUT = 3
|
|
66
66
|
# (e.g. in high contention env) and we will exit eagerly if server exit.
|
67
67
|
WAIT_APISERVER_START_TIMEOUT_SEC = 60
|
68
68
|
|
69
|
-
_VERSION_INFO = (
|
70
|
-
f'{colorama.Style.RESET_ALL}'
|
71
|
-
f'{colorama.Style.DIM}'
|
72
|
-
'client version: v{client_version} (API version: v{client_api_version})\n'
|
73
|
-
'server version: v{server_version} (API version: v{server_api_version})'
|
74
|
-
f'{colorama.Style.RESET_ALL}')
|
75
69
|
_LOCAL_API_SERVER_RESTART_HINT = (
|
76
|
-
f'{colorama.Fore.YELLOW}
|
70
|
+
f'{colorama.Fore.YELLOW}The local SkyPilot API server is not compatible '
|
71
|
+
'with the client. Please restart the API server with:\n'
|
77
72
|
f'{colorama.Style.BRIGHT}sky api stop; sky api start'
|
78
73
|
f'{colorama.Style.RESET_ALL}')
|
79
|
-
_LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
|
80
|
-
f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
|
81
|
-
'{version_info}\n'
|
82
|
-
f'{_LOCAL_API_SERVER_RESTART_HINT}'
|
83
|
-
f'{colorama.Style.RESET_ALL}')
|
84
|
-
_CLIENT_TOO_OLD_WARNING = (
|
85
|
-
f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
|
86
|
-
'{version_info}\n'
|
87
|
-
f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
|
88
|
-
'{command}'
|
89
|
-
f'{colorama.Style.RESET_ALL}')
|
90
|
-
_REMOTE_SERVER_TOO_OLD_WARNING = (
|
91
|
-
f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
|
92
|
-
'{version_info}\n'
|
93
|
-
f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
|
94
|
-
'remote API server or downgrade your local client with:\n'
|
95
|
-
'{command}\n'
|
96
|
-
f'{colorama.Style.RESET_ALL}')
|
97
74
|
_SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
|
98
75
|
f'{colorama.Fore.YELLOW}SkyPilot API server version does not match the '
|
99
76
|
'installation on disk:\n'
|
@@ -105,10 +82,6 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
|
|
105
82
|
f'{colorama.Fore.YELLOW}This can happen if you upgraded SkyPilot without '
|
106
83
|
'restarting the API server.'
|
107
84
|
f'{colorama.Style.RESET_ALL}')
|
108
|
-
# Parse local API version eagerly to catch version format errors.
|
109
|
-
_LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
|
110
|
-
# SkyPilot dev version.
|
111
|
-
_DEV_VERSION = '1.0.0-dev0'
|
112
85
|
|
113
86
|
RequestId = str
|
114
87
|
ApiVersion = Optional[str]
|
@@ -134,6 +107,7 @@ class ApiServerInfo:
|
|
134
107
|
commit: Optional[str] = None
|
135
108
|
user: Optional[Dict[str, Any]] = None
|
136
109
|
basic_auth_enabled: bool = False
|
110
|
+
error: Optional[str] = None
|
137
111
|
|
138
112
|
|
139
113
|
def get_api_cookie_jar_path() -> pathlib.Path:
|
@@ -165,14 +139,25 @@ def set_api_cookie_jar(cookie_jar: CookieJar,
|
|
165
139
|
if not cookie_path.parent.exists():
|
166
140
|
cookie_path.parent.mkdir(parents=True, exist_ok=True)
|
167
141
|
|
168
|
-
|
142
|
+
# Writing directly to the cookie jar path can race with other processes that
|
143
|
+
# are reading the cookie jar, making it look malformed. Instead, write to a
|
144
|
+
# temporary file and then move it to the final location.
|
145
|
+
# Avoid hardcoding the tmp file path, since it could cause a race with other
|
146
|
+
# processes that are also writing to the tmp file.
|
147
|
+
with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
|
148
|
+
delete=False) as tmp_file:
|
149
|
+
tmp_cookie_path = tmp_file.name
|
150
|
+
file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
|
169
151
|
if cookie_path.exists():
|
170
|
-
file_cookie_jar.load()
|
152
|
+
file_cookie_jar.load(str(cookie_path))
|
171
153
|
|
172
154
|
for cookie in cookie_jar:
|
173
155
|
file_cookie_jar.set_cookie(cookie)
|
174
156
|
file_cookie_jar.save()
|
175
157
|
|
158
|
+
# Move the temporary file to the final location.
|
159
|
+
os.replace(tmp_cookie_path, cookie_path)
|
160
|
+
|
176
161
|
|
177
162
|
def get_cookies_from_response(
|
178
163
|
response: 'requests.Response') -> requests.cookies.RequestsCookieJar:
|
@@ -271,6 +256,23 @@ def is_api_server_local():
|
|
271
256
|
return get_server_url() in AVAILABLE_LOCAL_API_SERVER_URLS
|
272
257
|
|
273
258
|
|
259
|
+
def _handle_non_200_server_status(
|
260
|
+
response: 'requests.Response') -> ApiServerInfo:
|
261
|
+
if response.status_code == 401:
|
262
|
+
return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
|
263
|
+
if response.status_code == 400:
|
264
|
+
# Check if a version mismatch error is returned.
|
265
|
+
try:
|
266
|
+
body = response.json()
|
267
|
+
if (body.get('error',
|
268
|
+
'') == ApiServerStatus.VERSION_MISMATCH.value):
|
269
|
+
return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
|
270
|
+
error=body.get('message', ''))
|
271
|
+
except json.JSONDecodeError:
|
272
|
+
pass
|
273
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
274
|
+
|
275
|
+
|
274
276
|
def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
275
277
|
"""Retrieve the status of the API server.
|
276
278
|
|
@@ -304,10 +306,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
304
306
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
305
307
|
|
306
308
|
logger.debug(f'Health check status: {response.status_code}')
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
309
|
+
|
310
|
+
if response.status_code != 200:
|
311
|
+
return _handle_non_200_server_status(response)
|
312
|
+
|
311
313
|
# The response is 200, so we can parse the response.
|
312
314
|
try:
|
313
315
|
result = response.json()
|
@@ -329,8 +331,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
329
331
|
f'version info. {server_url} may '
|
330
332
|
f'not be running SkyPilot API server.')
|
331
333
|
server_info.status = ApiServerStatus.UNHEALTHY
|
332
|
-
|
333
|
-
|
334
|
+
version_info = versions.check_compatibility_at_client(
|
335
|
+
response.headers)
|
336
|
+
if version_info is None:
|
337
|
+
# Backward compatibility for server prior to v0.11.0 which
|
338
|
+
# does not check compatibility at server side.
|
339
|
+
# TODO(aylei): remove this after v0.13.0 is released.
|
340
|
+
return ApiServerInfo(
|
341
|
+
status=ApiServerStatus.VERSION_MISMATCH,
|
342
|
+
error=versions.SERVER_TOO_OLD_ERROR.format(
|
343
|
+
remote_version=version,
|
344
|
+
local_version=versions.get_local_readable_version(),
|
345
|
+
min_version=server_constants.MIN_COMPATIBLE_VERSION,
|
346
|
+
command=versions.install_version_command(
|
347
|
+
version, commit)))
|
348
|
+
if version_info.error is not None:
|
349
|
+
return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
|
350
|
+
error=version_info.error)
|
351
|
+
|
334
352
|
cookies = get_cookies_from_response(response)
|
335
353
|
set_api_cookie_jar(cookies, create_if_not_exists=False)
|
336
354
|
return server_info
|
@@ -479,7 +497,7 @@ def _start_api_server(deploy: bool = False,
|
|
479
497
|
server_url = get_server_url(host)
|
480
498
|
dashboard_msg = ''
|
481
499
|
api_server_info = get_api_server_status(server_url)
|
482
|
-
if api_server_info.version ==
|
500
|
+
if api_server_info.version == versions.DEV_VERSION:
|
483
501
|
dashboard_msg += (
|
484
502
|
f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
|
485
503
|
f'{colorama.Fore.YELLOW}')
|
@@ -543,33 +561,11 @@ def check_server_healthy(
|
|
543
561
|
api_server_info = get_api_server_status(endpoint)
|
544
562
|
api_server_status = api_server_info.status
|
545
563
|
if api_server_status == ApiServerStatus.VERSION_MISMATCH:
|
546
|
-
|
547
|
-
assert sv is not None, 'Server API version is None'
|
548
|
-
try:
|
549
|
-
server_is_older = int(sv) < _LOCAL_API_VERSION
|
550
|
-
except ValueError:
|
551
|
-
# Raised when the server version using an unknown scheme.
|
552
|
-
# Version compatibility checking is expected to handle all legacy
|
553
|
-
# cases so we safely assume the server is newer when the version
|
554
|
-
# scheme is unknown.
|
555
|
-
logger.debug('API server version using unknown scheme: %s', sv)
|
556
|
-
server_is_older = False
|
557
|
-
version_info = _get_version_info_hint(api_server_info)
|
564
|
+
msg = api_server_info.error
|
558
565
|
if is_api_server_local():
|
559
566
|
# For local server, just hint user to restart the server to get
|
560
567
|
# a consistent version.
|
561
|
-
msg =
|
562
|
-
version_info=version_info)
|
563
|
-
else:
|
564
|
-
assert api_server_info.version is not None, 'Server version is None'
|
565
|
-
if server_is_older:
|
566
|
-
msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
|
567
|
-
version_info=version_info,
|
568
|
-
command=_install_server_version_command(api_server_info))
|
569
|
-
else:
|
570
|
-
msg = _CLIENT_TOO_OLD_WARNING.format(
|
571
|
-
version_info=version_info,
|
572
|
-
command=_install_server_version_command(api_server_info))
|
568
|
+
msg = _LOCAL_API_SERVER_RESTART_HINT
|
573
569
|
with ux_utils.print_exception_no_traceback():
|
574
570
|
raise exceptions.APIVersionMismatchError(msg)
|
575
571
|
elif api_server_status == ApiServerStatus.UNHEALTHY:
|
@@ -603,37 +599,6 @@ def check_server_healthy(
|
|
603
599
|
return api_server_status, api_server_info
|
604
600
|
|
605
601
|
|
606
|
-
def _get_version_info_hint(server_info: ApiServerInfo) -> str:
|
607
|
-
assert server_info.version is not None, 'Server version is None'
|
608
|
-
# version_on_disk may be None if the server is older
|
609
|
-
assert server_info.commit is not None, 'Server commit is None'
|
610
|
-
sv = server_info.version
|
611
|
-
cv = sky.__version__
|
612
|
-
if server_info.version == _DEV_VERSION:
|
613
|
-
sv = f'{sv} with commit {server_info.commit}'
|
614
|
-
if cv == _DEV_VERSION:
|
615
|
-
cv = f'{cv} with commit {sky.__commit__}'
|
616
|
-
return _VERSION_INFO.format(client_version=cv,
|
617
|
-
server_version=sv,
|
618
|
-
client_api_version=server_constants.API_VERSION,
|
619
|
-
server_api_version=server_info.api_version)
|
620
|
-
|
621
|
-
|
622
|
-
def _install_server_version_command(server_info: ApiServerInfo) -> str:
|
623
|
-
assert server_info.version is not None, 'Server version is None'
|
624
|
-
assert server_info.commit is not None, 'Server commit is None'
|
625
|
-
if server_info.version == _DEV_VERSION:
|
626
|
-
# Dev build without valid version.
|
627
|
-
return ('pip install git+https://github.com/skypilot-org/skypilot@'
|
628
|
-
f'{server_info.commit}')
|
629
|
-
elif 'dev' in server_info.version:
|
630
|
-
# Nightly version.
|
631
|
-
return f'pip install -U "skypilot-nightly=={server_info.version}"'
|
632
|
-
else:
|
633
|
-
# Stable version.
|
634
|
-
return f'pip install -U "skypilot=={server_info.version}"'
|
635
|
-
|
636
|
-
|
637
602
|
# Keep in sync with sky/setup_files/setup.py find_version()
|
638
603
|
def get_skypilot_version_on_disk() -> str:
|
639
604
|
"""Get the version of the SkyPilot code on disk."""
|
sky/server/constants.py
CHANGED
@@ -4,10 +4,31 @@ import os
|
|
4
4
|
|
5
5
|
from sky.skylet import constants
|
6
6
|
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# the
|
10
|
-
|
7
|
+
# pylint: disable=line-too-long
|
8
|
+
# The SkyPilot API version that the code currently uses.
|
9
|
+
# Bump this version when the API is changed and special compatibility handling
|
10
|
+
# based on version info is needed.
|
11
|
+
# For more details and code guidelines, refer to:
|
12
|
+
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
13
|
+
API_VERSION = 11
|
14
|
+
|
15
|
+
# The minimum peer API version that the code should still work with.
|
16
|
+
# Notes (dev):
|
17
|
+
# - This value is maintained by the CI pipeline, DO NOT EDIT this manually.
|
18
|
+
# - Compatibility code for versions lower than this can be safely removed.
|
19
|
+
# Refer to API_VERSION for more details.
|
20
|
+
MIN_COMPATIBLE_API_VERSION = 11
|
21
|
+
|
22
|
+
# The semantic version of the minimum compatible API version.
|
23
|
+
# Refer to MIN_COMPATIBLE_API_VERSION for more details.
|
24
|
+
# Note (dev): DO NOT EDIT this constant manually.
|
25
|
+
MIN_COMPATIBLE_VERSION = '0.10.0'
|
26
|
+
|
27
|
+
# The HTTP header name for the API version of the sender.
|
28
|
+
API_VERSION_HEADER = 'X-SkyPilot-API-Version'
|
29
|
+
|
30
|
+
# The HTTP header name for the SkyPilot version of the sender.
|
31
|
+
VERSION_HEADER = 'X-SkyPilot-Version'
|
11
32
|
|
12
33
|
# Prefix for API request names.
|
13
34
|
REQUEST_NAME_PREFIX = 'sky.'
|
sky/server/requests/payloads.py
CHANGED
@@ -1,9 +1,27 @@
|
|
1
1
|
"""Payloads for the Sky API requests.
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
All the payloads that will be used between the client and server communication
|
4
|
+
must be defined here to make sure they get covered by our API compatibility tests.
|
5
|
+
|
6
|
+
Compatibility note:
|
7
|
+
- Adding a new body for new API is compatible as long as the SDK method using
|
8
|
+
the new API is properly decorated with `versions.minimal_api_version`.
|
9
|
+
- Adding a new field with default value to an existing body is compatible at
|
10
|
+
API level, but the business logic must handle the case where the field is
|
11
|
+
not processed by an old version of remote client/server. This can usually
|
12
|
+
be done by checking `versions.get_remote_api_version()`.
|
13
|
+
- Other changes are not compatible at API level, so must be handled specially.
|
14
|
+
A common pattern is to keep both the old and new version of the body and
|
15
|
+
check `versions.get_remote_api_version()` to decide which body to use. For
|
16
|
+
example, say we refactor the `LaunchBody`, the original `LaunchBody` must be
|
17
|
+
kept in the codebase and the new body should be added via `LaunchBodyV2`.
|
18
|
+
Then if the remote runs in an old version, the local code should still send
|
19
|
+
`LaunchBody` to keep the backward compatibility. `LaunchBody` can be removed
|
20
|
+
later when constants.MIN_COMPATIBLE_API_VERSION is updated to a version that
|
21
|
+
supports `LaunchBodyV2`
|
22
|
+
|
23
|
+
Also refer to sky.server.constants.MIN_COMPATIBLE_API_VERSION and the
|
24
|
+
sky.server.versions module for more details.
|
7
25
|
"""
|
8
26
|
import os
|
9
27
|
import typing
|
@@ -94,7 +112,18 @@ def get_override_skypilot_config_path_from_client() -> Optional[str]:
|
|
94
112
|
return skypilot_config.loaded_config_path_serialized()
|
95
113
|
|
96
114
|
|
97
|
-
class
|
115
|
+
class BasePayload(pydantic.BaseModel):
|
116
|
+
"""The base payload for the SkyPilot API."""
|
117
|
+
# Ignore extra fields in the request body, which is useful for backward
|
118
|
+
# compatibility. The difference with `allow` is that `ignore` will not
|
119
|
+
# include the unknown fields when dumping the model, i.e., we can add new
|
120
|
+
# fields to the request body without breaking the existing old API server
|
121
|
+
# where the handler function does not accept the new field in function
|
122
|
+
# signature.
|
123
|
+
model_config = pydantic.ConfigDict(extra='ignore')
|
124
|
+
|
125
|
+
|
126
|
+
class RequestBody(BasePayload):
|
98
127
|
"""The request body for the SkyPilot API."""
|
99
128
|
env_vars: Dict[str, str] = {}
|
100
129
|
entrypoint: str = ''
|
@@ -103,11 +132,6 @@ class RequestBody(pydantic.BaseModel):
|
|
103
132
|
override_skypilot_config: Optional[Dict[str, Any]] = {}
|
104
133
|
override_skypilot_config_path: Optional[str] = None
|
105
134
|
|
106
|
-
# Allow extra fields in the request body, which is useful for backward
|
107
|
-
# compatibility, i.e., we can add new fields to the request body without
|
108
|
-
# breaking the existing old API server.
|
109
|
-
model_config = pydantic.ConfigDict(extra='allow')
|
110
|
-
|
111
135
|
def __init__(self, **data):
|
112
136
|
data['env_vars'] = data.get('env_vars', request_body_env_vars())
|
113
137
|
usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
|
@@ -665,3 +689,24 @@ class GetConfigBody(RequestBody):
|
|
665
689
|
class CostReportBody(RequestBody):
|
666
690
|
"""The request body for the cost report endpoint."""
|
667
691
|
days: Optional[int] = 30
|
692
|
+
|
693
|
+
|
694
|
+
class RequestPayload(BasePayload):
|
695
|
+
"""The payload for the requests."""
|
696
|
+
|
697
|
+
request_id: str
|
698
|
+
name: str
|
699
|
+
entrypoint: str
|
700
|
+
request_body: str
|
701
|
+
status: str
|
702
|
+
created_at: float
|
703
|
+
user_id: str
|
704
|
+
return_value: str
|
705
|
+
error: str
|
706
|
+
pid: Optional[int]
|
707
|
+
schedule_type: str
|
708
|
+
user_name: Optional[str] = None
|
709
|
+
# Resources the request operates on.
|
710
|
+
cluster_name: Optional[str] = None
|
711
|
+
status_msg: Optional[str] = None
|
712
|
+
should_retry: bool = False
|
sky/server/requests/requests.py
CHANGED
@@ -98,28 +98,6 @@ class ScheduleType(enum.Enum):
|
|
98
98
|
SHORT = 'short'
|
99
99
|
|
100
100
|
|
101
|
-
@dataclasses.dataclass
|
102
|
-
class RequestPayload:
|
103
|
-
"""The payload for the requests."""
|
104
|
-
|
105
|
-
request_id: str
|
106
|
-
name: str
|
107
|
-
entrypoint: str
|
108
|
-
request_body: str
|
109
|
-
status: str
|
110
|
-
created_at: float
|
111
|
-
user_id: str
|
112
|
-
return_value: str
|
113
|
-
error: str
|
114
|
-
pid: Optional[int]
|
115
|
-
schedule_type: str
|
116
|
-
user_name: Optional[str] = None
|
117
|
-
# Resources the request operates on.
|
118
|
-
cluster_name: Optional[str] = None
|
119
|
-
status_msg: Optional[str] = None
|
120
|
-
should_retry: bool = False
|
121
|
-
|
122
|
-
|
123
101
|
@dataclasses.dataclass
|
124
102
|
class Request:
|
125
103
|
"""A SkyPilot API request."""
|
@@ -185,7 +163,7 @@ class Request:
|
|
185
163
|
@classmethod
|
186
164
|
def from_row(cls, row: Tuple[Any, ...]) -> 'Request':
|
187
165
|
content = dict(zip(REQUEST_COLUMNS, row))
|
188
|
-
return cls.decode(RequestPayload(**content))
|
166
|
+
return cls.decode(payloads.RequestPayload(**content))
|
189
167
|
|
190
168
|
def to_row(self) -> Tuple[Any, ...]:
|
191
169
|
payload = self.encode()
|
@@ -194,7 +172,7 @@ class Request:
|
|
194
172
|
row.append(getattr(payload, k))
|
195
173
|
return tuple(row)
|
196
174
|
|
197
|
-
def readable_encode(self) -> RequestPayload:
|
175
|
+
def readable_encode(self) -> payloads.RequestPayload:
|
198
176
|
"""Serialize the SkyPilot API request for display purposes.
|
199
177
|
|
200
178
|
This function should be called on the server side to serialize the
|
@@ -212,7 +190,7 @@ class Request:
|
|
212
190
|
payloads.RequestBody), (self.name, self.request_body)
|
213
191
|
user = global_user_state.get_user(self.user_id)
|
214
192
|
user_name = user.name if user is not None else None
|
215
|
-
return RequestPayload(
|
193
|
+
return payloads.RequestPayload(
|
216
194
|
request_id=self.request_id,
|
217
195
|
name=self.name,
|
218
196
|
entrypoint=self.entrypoint.__name__,
|
@@ -230,12 +208,12 @@ class Request:
|
|
230
208
|
should_retry=self.should_retry,
|
231
209
|
)
|
232
210
|
|
233
|
-
def encode(self) -> RequestPayload:
|
211
|
+
def encode(self) -> payloads.RequestPayload:
|
234
212
|
"""Serialize the SkyPilot API request."""
|
235
213
|
assert isinstance(self.request_body,
|
236
214
|
payloads.RequestBody), (self.name, self.request_body)
|
237
215
|
try:
|
238
|
-
return RequestPayload(
|
216
|
+
return payloads.RequestPayload(
|
239
217
|
request_id=self.request_id,
|
240
218
|
name=self.name,
|
241
219
|
entrypoint=encoders.pickle_and_encode(self.entrypoint),
|
@@ -264,7 +242,7 @@ class Request:
|
|
264
242
|
raise
|
265
243
|
|
266
244
|
@classmethod
|
267
|
-
def decode(cls, payload: RequestPayload) -> 'Request':
|
245
|
+
def decode(cls, payload: payloads.RequestPayload) -> 'Request':
|
268
246
|
"""Deserialize the SkyPilot API request."""
|
269
247
|
try:
|
270
248
|
return cls(
|