skypilot-nightly 1.0.0.dev20250702__py3-none-any.whl → 1.0.0.dev20250704__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +24 -24
  3. sky/catalog/data_fetchers/fetch_cudo.py +37 -37
  4. sky/client/sdk.py +4 -6
  5. sky/clouds/aws.py +1 -1
  6. sky/clouds/cudo.py +1 -1
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs.html +1 -1
  17. sky/dashboard/out/users.html +1 -1
  18. sky/dashboard/out/volumes.html +1 -1
  19. sky/dashboard/out/workspace/new.html +1 -1
  20. sky/dashboard/out/workspaces/[name].html +1 -1
  21. sky/dashboard/out/workspaces.html +1 -1
  22. sky/exceptions.py +5 -0
  23. sky/jobs/client/sdk.py +3 -1
  24. sky/jobs/server/core.py +16 -11
  25. sky/metrics/__init__.py +0 -0
  26. sky/provision/cudo/cudo_utils.py +14 -8
  27. sky/provision/cudo/cudo_wrapper.py +71 -70
  28. sky/server/common.py +59 -94
  29. sky/server/constants.py +25 -4
  30. sky/server/requests/payloads.py +55 -10
  31. sky/server/requests/requests.py +6 -28
  32. sky/server/rest.py +15 -4
  33. sky/server/server.py +51 -7
  34. sky/server/versions.py +270 -0
  35. sky/setup_files/MANIFEST.in +0 -1
  36. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/METADATA +1 -1
  37. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/RECORD +43 -41
  38. /sky/dashboard/out/_next/static/{N5IdFnjR1RaPGBAVYeTIr → 6TieQqyqsJiaJC33q0FfI}/_buildManifest.js +0 -0
  39. /sky/dashboard/out/_next/static/{N5IdFnjR1RaPGBAVYeTIr → 6TieQqyqsJiaJC33q0FfI}/_ssgManifest.js +0 -0
  40. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250702.dist-info → skypilot_nightly-1.0.0.dev20250704.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,28 @@
1
1
  """Cudo catalog helper."""
2
2
 
3
3
  cudo_gpu_model = {
4
- 'NVIDIA V100': 'V100',
5
- 'NVIDIA A40': 'A40',
6
- 'RTX 3080': 'RTX3080',
7
- 'RTX A4000': 'RTXA4000',
8
- 'RTX A4500': 'RTXA4500',
4
+ 'H100 NVL': 'H100',
5
+ 'H100 SXM': 'H100-SXM',
6
+ 'L40S (compute mode)': 'L40S',
7
+ 'L40S (graphics mode)': 'L40S',
8
+ 'A40 (compute mode)': 'A40',
9
+ 'A40 (graphics mode)': 'A40',
9
10
  'RTX A5000': 'RTXA5000',
10
11
  'RTX A6000': 'RTXA6000',
12
+ 'A100 80GB PCIe': 'A100',
13
+ 'A800 PCIe': 'A800',
14
+ 'V100': 'V100',
11
15
  }
12
16
 
13
17
  cudo_gpu_mem = {
14
- 'RTX3080': 12,
18
+ 'H100': 94,
19
+ 'H100-SXM': 80,
20
+ 'L40S': 48,
15
21
  'A40': 48,
16
- 'RTXA4000': 16,
17
- 'RTXA4500': 20,
18
22
  'RTXA5000': 24,
19
23
  'RTXA6000': 48,
24
+ 'A100': 80,
25
+ 'A800': 80,
20
26
  'V100': 16,
21
27
  }
22
28
 
@@ -28,12 +28,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
28
28
  size_gib=disk_size),
29
29
  metadata=tags)
30
30
 
31
- try:
32
- api = cudo.cudo.cudo_api.virtual_machines()
33
- vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
34
- return vm.to_dict()['id']
35
- except cudo.cudo.rest.ApiException as e:
36
- raise e
31
+ api = cudo.cudo.cudo_api.virtual_machines()
32
+ vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
33
+
34
+ return vm.to_dict()['id']
37
35
 
38
36
 
39
37
  def remove(instance_id: str):
@@ -54,11 +52,8 @@ def remove(instance_id: str):
54
52
  state = 'unknown'
55
53
  project_id = cudo.cudo.cudo_api.project_id_throwable()
56
54
  while retry_count < max_retries:
57
- try:
58
- vm = api.get_vm(project_id, instance_id)
59
- state = vm.to_dict()['vm']['short_state']
60
- except cudo.cudo.rest.ApiException as e:
61
- raise e
55
+ vm = api.get_vm(project_id, instance_id)
56
+ state = vm.to_dict()['vm']['short_state']
62
57
 
63
58
  if state in terminate_ok:
64
59
  break
@@ -69,76 +64,82 @@ def remove(instance_id: str):
69
64
  'Timeout error, could not terminate due to VM state: {}'.format(
70
65
  state))
71
66
 
72
- try:
73
- api.terminate_vm(project_id, instance_id)
74
- except cudo.cudo.rest.ApiException as e:
75
- raise e
67
+ api.terminate_vm(project_id, instance_id)
76
68
 
77
69
 
78
70
  def set_tags(instance_id: str, tags: Dict):
79
71
  """Sets the tags for the given instance."""
80
- try:
81
- api = cudo.cudo.cudo_api.virtual_machines()
82
- api.update_vm_metadata(
83
- cudo.cudo.cudo_api.project_id(), instance_id,
84
- cudo.cudo.UpdateVMMetadataBody(
85
- metadata=tags,
86
- merge=True)) # TODO (skypilot team) merge or overwrite?
87
- except cudo.cudo.rest.ApiException as e:
88
- raise e
72
+ api = cudo.cudo.cudo_api.virtual_machines()
73
+ api.update_vm_metadata(
74
+ cudo.cudo.cudo_api.project_id(), instance_id,
75
+ cudo.cudo.UpdateVMMetadataBody(
76
+ metadata=tags,
77
+ merge=True)) # TODO (skypilot team) merge or overwrite?
89
78
 
90
79
 
91
80
  def get_instance(vm_id):
92
- try:
93
- api = cudo.cudo.cudo_api.virtual_machines()
94
- vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
95
- vm_dict = vm.to_dict()
96
- return vm_dict
97
- except cudo.cudo.rest.ApiException as e:
98
- raise e
81
+ api = cudo.cudo.cudo_api.virtual_machines()
82
+ vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
83
+ vm_dict = vm.to_dict()
84
+ return vm_dict
99
85
 
100
86
 
101
87
  def list_instances():
102
- try:
103
- api = cudo.cudo.cudo_api.virtual_machines()
104
- vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
105
- instances = {}
106
- for vm in vms.to_dict()['vms']:
107
- ex_ip = vm['external_ip_address']
108
- in_ip = vm['internal_ip_address']
109
- if not in_ip:
110
- in_ip = ex_ip
111
- instance = {
112
- # active_state, init_state, lcm_state, short_state
113
- 'status': vm['short_state'],
114
- 'tags': vm['metadata'],
115
- 'name': vm['id'],
116
- 'ip': ex_ip,
117
- 'external_ip': ex_ip,
118
- 'internal_ip': in_ip
119
- }
120
- instances[vm['id']] = instance
121
- return instances
122
- except cudo.cudo.rest.ApiException as e:
123
- raise e
88
+ api = cudo.cudo.cudo_api.virtual_machines()
89
+ vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
90
+ instances = {}
91
+ for vm in vms.to_dict()['vms']:
92
+ ex_ip = vm['external_ip_address']
93
+ in_ip = vm['internal_ip_address']
94
+ if not in_ip:
95
+ in_ip = ex_ip
96
+ instance = {
97
+ # active_state, init_state, lcm_state, short_state
98
+ 'status': vm['short_state'],
99
+ 'tags': vm['metadata'],
100
+ 'name': vm['id'],
101
+ 'ip': ex_ip,
102
+ 'external_ip': ex_ip,
103
+ 'internal_ip': in_ip
104
+ }
105
+ instances[vm['id']] = instance
106
+ return instances
124
107
 
125
108
 
126
109
  def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
127
110
  cpus):
128
- try:
129
- gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
130
- api = cudo.cudo.cudo_api.virtual_machines()
131
- types = api.list_vm_machine_types(mem,
132
- cpus,
133
- gpu=gpu_count,
134
- gpu_model=gpu_model,
135
- data_center_id=data_center_id)
136
- types_dict = types.to_dict()
137
- hc = types_dict['host_configs']
138
- total_count = sum(item['count_vm_available'] for item in hc)
139
- if total_count < to_start_count:
140
- raise Exception(
141
- 'Too many VMs requested, try another gpu type or region')
142
- return total_count
143
- except cudo.cudo.rest.ApiException as e:
144
- raise e
111
+ gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
112
+ api = cudo.cudo.cudo_api.virtual_machines()
113
+ types = api.list_vm_machine_types2()
114
+ types_dict = types.to_dict()
115
+ machine_types = types_dict['machine_types']
116
+
117
+ # Filter machine types based on requirements
118
+ matching_types = []
119
+ for machine_type in machine_types:
120
+ # Check if this machine type matches our requirements
121
+ if (machine_type['data_center_id'] == data_center_id and
122
+ machine_type['gpu_model'] == gpu_model and
123
+ machine_type['min_vcpu'] <= cpus <= machine_type.get(
124
+ 'max_vcpu_free', float('inf')) and
125
+ machine_type['min_memory_gib'] <= mem <= machine_type.get(
126
+ 'max_memory_gib_free', float('inf'))):
127
+
128
+ # Calculate available VMs based on resource constraints
129
+ max_vms_by_vcpu = machine_type[
130
+ 'total_vcpu_free'] // cpus if cpus > 0 else float('inf')
131
+ max_vms_by_memory = machine_type[
132
+ 'total_memory_gib_free'] // mem if mem > 0 else float('inf')
133
+ max_vms_by_gpu = machine_type[
134
+ 'total_gpu_free'] // gpu_count if gpu_count > 0 else float(
135
+ 'inf')
136
+
137
+ available_vms = min(max_vms_by_vcpu, max_vms_by_memory,
138
+ max_vms_by_gpu)
139
+ matching_types.append(available_vms)
140
+
141
+ total_count = sum(matching_types)
142
+ if total_count < to_start_count:
143
+ raise Exception(
144
+ 'Too many VMs requested, try another gpu type or region')
145
+ return total_count
sky/server/common.py CHANGED
@@ -22,7 +22,6 @@ import uuid
22
22
  import colorama
23
23
  import filelock
24
24
 
25
- import sky
26
25
  from sky import exceptions
27
26
  from sky import sky_logging
28
27
  from sky import skypilot_config
@@ -31,6 +30,7 @@ from sky.client import service_account_auth
31
30
  from sky.data import data_utils
32
31
  from sky.server import constants as server_constants
33
32
  from sky.server import rest
33
+ from sky.server import versions
34
34
  from sky.skylet import constants
35
35
  from sky.usage import usage_lib
36
36
  from sky.utils import annotations
@@ -66,34 +66,11 @@ RETRY_COUNT_ON_TIMEOUT = 3
66
66
  # (e.g. in high contention env) and we will exit eagerly if server exit.
67
67
  WAIT_APISERVER_START_TIMEOUT_SEC = 60
68
68
 
69
- _VERSION_INFO = (
70
- f'{colorama.Style.RESET_ALL}'
71
- f'{colorama.Style.DIM}'
72
- 'client version: v{client_version} (API version: v{client_api_version})\n'
73
- 'server version: v{server_version} (API version: v{server_api_version})'
74
- f'{colorama.Style.RESET_ALL}')
75
69
  _LOCAL_API_SERVER_RESTART_HINT = (
76
- f'{colorama.Fore.YELLOW}Please restart the SkyPilot API server with:\n'
70
+ f'{colorama.Fore.YELLOW}The local SkyPilot API server is not compatible '
71
+ 'with the client. Please restart the API server with:\n'
77
72
  f'{colorama.Style.BRIGHT}sky api stop; sky api start'
78
73
  f'{colorama.Style.RESET_ALL}')
79
- _LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
80
- f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
81
- '{version_info}\n'
82
- f'{_LOCAL_API_SERVER_RESTART_HINT}'
83
- f'{colorama.Style.RESET_ALL}')
84
- _CLIENT_TOO_OLD_WARNING = (
85
- f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
86
- '{version_info}\n'
87
- f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
88
- '{command}'
89
- f'{colorama.Style.RESET_ALL}')
90
- _REMOTE_SERVER_TOO_OLD_WARNING = (
91
- f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
92
- '{version_info}\n'
93
- f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
94
- 'remote API server or downgrade your local client with:\n'
95
- '{command}\n'
96
- f'{colorama.Style.RESET_ALL}')
97
74
  _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
98
75
  f'{colorama.Fore.YELLOW}SkyPilot API server version does not match the '
99
76
  'installation on disk:\n'
@@ -105,10 +82,6 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
105
82
  f'{colorama.Fore.YELLOW}This can happen if you upgraded SkyPilot without '
106
83
  'restarting the API server.'
107
84
  f'{colorama.Style.RESET_ALL}')
108
- # Parse local API version eargly to catch version format errors.
109
- _LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
110
- # SkyPilot dev version.
111
- _DEV_VERSION = '1.0.0-dev0'
112
85
 
113
86
  RequestId = str
114
87
  ApiVersion = Optional[str]
@@ -134,6 +107,7 @@ class ApiServerInfo:
134
107
  commit: Optional[str] = None
135
108
  user: Optional[Dict[str, Any]] = None
136
109
  basic_auth_enabled: bool = False
110
+ error: Optional[str] = None
137
111
 
138
112
 
139
113
  def get_api_cookie_jar_path() -> pathlib.Path:
@@ -165,14 +139,25 @@ def set_api_cookie_jar(cookie_jar: CookieJar,
165
139
  if not cookie_path.parent.exists():
166
140
  cookie_path.parent.mkdir(parents=True, exist_ok=True)
167
141
 
168
- file_cookie_jar = MozillaCookieJar(cookie_path)
142
+ # Writing directly to the cookie jar path can race with other processes that
143
+ # are reading the cookie jar, making it look malformed. Instead, write to a
144
+ # temporary file and then move it to the final location.
145
+ # Avoid hardcoding the tmp file path, since it could cause a race with other
146
+ # processes that are also writing to the tmp file.
147
+ with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
148
+ delete=False) as tmp_file:
149
+ tmp_cookie_path = tmp_file.name
150
+ file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
169
151
  if cookie_path.exists():
170
- file_cookie_jar.load()
152
+ file_cookie_jar.load(str(cookie_path))
171
153
 
172
154
  for cookie in cookie_jar:
173
155
  file_cookie_jar.set_cookie(cookie)
174
156
  file_cookie_jar.save()
175
157
 
158
+ # Move the temporary file to the final location.
159
+ os.replace(tmp_cookie_path, cookie_path)
160
+
176
161
 
177
162
  def get_cookies_from_response(
178
163
  response: 'requests.Response') -> requests.cookies.RequestsCookieJar:
@@ -271,6 +256,23 @@ def is_api_server_local():
271
256
  return get_server_url() in AVAILABLE_LOCAL_API_SERVER_URLS
272
257
 
273
258
 
259
+ def _handle_non_200_server_status(
260
+ response: 'requests.Response') -> ApiServerInfo:
261
+ if response.status_code == 401:
262
+ return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
263
+ if response.status_code == 400:
264
+ # Check if a version mismatch error is returned.
265
+ try:
266
+ body = response.json()
267
+ if (body.get('error',
268
+ '') == ApiServerStatus.VERSION_MISMATCH.value):
269
+ return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
270
+ error=body.get('message', ''))
271
+ except json.JSONDecodeError:
272
+ pass
273
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
274
+
275
+
274
276
  def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
275
277
  """Retrieve the status of the API server.
276
278
 
@@ -304,10 +306,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
304
306
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
305
307
 
306
308
  logger.debug(f'Health check status: {response.status_code}')
307
- if response.status_code == 401:
308
- return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
309
- elif response.status_code != 200:
310
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
309
+
310
+ if response.status_code != 200:
311
+ return _handle_non_200_server_status(response)
312
+
311
313
  # The response is 200, so we can parse the response.
312
314
  try:
313
315
  result = response.json()
@@ -329,8 +331,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
329
331
  f'version info. {server_url} may '
330
332
  f'not be running SkyPilot API server.')
331
333
  server_info.status = ApiServerStatus.UNHEALTHY
332
- elif api_version != server_constants.API_VERSION:
333
- server_info.status = ApiServerStatus.VERSION_MISMATCH
334
+ version_info = versions.check_compatibility_at_client(
335
+ response.headers)
336
+ if version_info is None:
337
+ # Backward compatibility for server prior to v0.11.0 which
338
+ # does not check compatibility at server side.
339
+ # TODO(aylei): remove this after v0.13.0 is released.
340
+ return ApiServerInfo(
341
+ status=ApiServerStatus.VERSION_MISMATCH,
342
+ error=versions.SERVER_TOO_OLD_ERROR.format(
343
+ remote_version=version,
344
+ local_version=versions.get_local_readable_version(),
345
+ min_version=server_constants.MIN_COMPATIBLE_VERSION,
346
+ command=versions.install_version_command(
347
+ version, commit)))
348
+ if version_info.error is not None:
349
+ return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
350
+ error=version_info.error)
351
+
334
352
  cookies = get_cookies_from_response(response)
335
353
  set_api_cookie_jar(cookies, create_if_not_exists=False)
336
354
  return server_info
@@ -479,7 +497,7 @@ def _start_api_server(deploy: bool = False,
479
497
  server_url = get_server_url(host)
480
498
  dashboard_msg = ''
481
499
  api_server_info = get_api_server_status(server_url)
482
- if api_server_info.version == _DEV_VERSION:
500
+ if api_server_info.version == versions.DEV_VERSION:
483
501
  dashboard_msg += (
484
502
  f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
485
503
  f'{colorama.Fore.YELLOW}')
@@ -543,33 +561,11 @@ def check_server_healthy(
543
561
  api_server_info = get_api_server_status(endpoint)
544
562
  api_server_status = api_server_info.status
545
563
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
546
- sv = api_server_info.api_version
547
- assert sv is not None, 'Server API version is None'
548
- try:
549
- server_is_older = int(sv) < _LOCAL_API_VERSION
550
- except ValueError:
551
- # Raised when the server version using an unknown scheme.
552
- # Version compatibility checking is expected to handle all legacy
553
- # cases so we safely assume the server is newer when the version
554
- # scheme is unknown.
555
- logger.debug('API server version using unknown scheme: %s', sv)
556
- server_is_older = False
557
- version_info = _get_version_info_hint(api_server_info)
564
+ msg = api_server_info.error
558
565
  if is_api_server_local():
559
566
  # For local server, just hint user to restart the server to get
560
567
  # a consistent version.
561
- msg = _LOCAL_SERVER_VERSION_MISMATCH_WARNING.format(
562
- version_info=version_info)
563
- else:
564
- assert api_server_info.version is not None, 'Server version is None'
565
- if server_is_older:
566
- msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
567
- version_info=version_info,
568
- command=_install_server_version_command(api_server_info))
569
- else:
570
- msg = _CLIENT_TOO_OLD_WARNING.format(
571
- version_info=version_info,
572
- command=_install_server_version_command(api_server_info))
568
+ msg = _LOCAL_API_SERVER_RESTART_HINT
573
569
  with ux_utils.print_exception_no_traceback():
574
570
  raise exceptions.APIVersionMismatchError(msg)
575
571
  elif api_server_status == ApiServerStatus.UNHEALTHY:
@@ -603,37 +599,6 @@ def check_server_healthy(
603
599
  return api_server_status, api_server_info
604
600
 
605
601
 
606
- def _get_version_info_hint(server_info: ApiServerInfo) -> str:
607
- assert server_info.version is not None, 'Server version is None'
608
- # version_on_disk may be None if the server is older
609
- assert server_info.commit is not None, 'Server commit is None'
610
- sv = server_info.version
611
- cv = sky.__version__
612
- if server_info.version == _DEV_VERSION:
613
- sv = f'{sv} with commit {server_info.commit}'
614
- if cv == _DEV_VERSION:
615
- cv = f'{cv} with commit {sky.__commit__}'
616
- return _VERSION_INFO.format(client_version=cv,
617
- server_version=sv,
618
- client_api_version=server_constants.API_VERSION,
619
- server_api_version=server_info.api_version)
620
-
621
-
622
- def _install_server_version_command(server_info: ApiServerInfo) -> str:
623
- assert server_info.version is not None, 'Server version is None'
624
- assert server_info.commit is not None, 'Server commit is None'
625
- if server_info.version == _DEV_VERSION:
626
- # Dev build without valid version.
627
- return ('pip install git+https://github.com/skypilot-org/skypilot@'
628
- f'{server_info.commit}')
629
- elif 'dev' in server_info.version:
630
- # Nightly version.
631
- return f'pip install -U "skypilot-nightly=={server_info.version}"'
632
- else:
633
- # Stable version.
634
- return f'pip install -U "skypilot=={server_info.version}"'
635
-
636
-
637
602
  # Keep in sync with sky/setup_files/setup.py find_version()
638
603
  def get_skypilot_version_on_disk() -> str:
639
604
  """Get the version of the SkyPilot code on disk."""
sky/server/constants.py CHANGED
@@ -4,10 +4,31 @@ import os
4
4
 
5
5
  from sky.skylet import constants
6
6
 
7
- # API server version, whenever there is a change in API server that requires a
8
- # restart of the local API server or error out when the client does not match
9
- # the server version.
10
- API_VERSION = '10'
7
+ # pylint: disable=line-too-long
8
+ # The SkyPilot API version that the code currently uses.
9
+ # Bump this version when the API is changed and special compatibility handling
10
+ # based on version info is needed.
11
+ # For more details and code guidelines, refer to:
12
+ # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
+ API_VERSION = 11
14
+
15
+ # The minimum peer API version that the code should still work with.
16
+ # Notes (dev):
17
+ # - This value is maintained by the CI pipeline, DO NOT EDIT this manually.
18
+ # - Compatibility code for versions lower than this can be safely removed.
19
+ # Refer to API_VERSION for more details.
20
+ MIN_COMPATIBLE_API_VERSION = 11
21
+
22
+ # The semantic version of the minimum compatible API version.
23
+ # Refer to MIN_COMPATIBLE_API_VERSION for more details.
24
+ # Note (dev): DO NOT EDIT this constant manually.
25
+ MIN_COMPATIBLE_VERSION = '0.10.0'
26
+
27
+ # The HTTP header name for the API version of the sender.
28
+ API_VERSION_HEADER = 'X-SkyPilot-API-Version'
29
+
30
+ # The HTTP header name for the SkyPilot version of the sender.
31
+ VERSION_HEADER = 'X-SkyPilot-Version'
11
32
 
12
33
  # Prefix for API request names.
13
34
  REQUEST_NAME_PREFIX = 'sky.'
@@ -1,9 +1,27 @@
1
1
  """Payloads for the Sky API requests.
2
2
 
3
- TODO(zhwu): We can consider a better way to handle the default values of the
4
- kwargs for the payloads, otherwise, we have to keep the default values the sync
5
- with the backend functions. The benefit of having the default values in the
6
- payloads is that a user can find the default values in the Restful API docs.
3
+ All the payloads that will be used between the client and server communication
4
+ must be defined here to make sure they get covered by our API compatibility tests.
5
+
6
+ Compatibility note:
7
+ - Adding a new body for new API is compatible as long as the SDK method using
8
+ the new API is properly decorated with `versions.minimal_api_version`.
9
+ - Adding a new field with default value to an existing body is compatible at
10
+ API level, but the business logic must handle the case where the field is
11
+ not proccessed by an old version of remote client/server. This can usually
12
+ be done by checking `versions.get_remote_api_version()`.
13
+ - Other changes are not compatible at API level, so must be handled specially.
14
+ A common pattern is to keep both the old and new version of the body and
15
+ checking `versions.get_remote_api_version()` to decide which body to use. For
16
+ example, say we refactor the `LaunchBody`, the original `LaunchBody` must be
17
+ kept in the codebase and the new body should be added via `LaunchBodyV2`.
18
+ Then if the remote runs in an old version, the local code should still send
19
+ `LaunchBody` to keep the backward compatibility. `LaunchBody` can be removed
20
+ later when constants.MIN_COMPATIBLE_API_VERSION is updated to a version that
21
+ supports `LaunchBodyV2`
22
+
23
+ Also refer to sky.server.constants.MIN_COMPATIBLE_API_VERSION and the
24
+ sky.server.versions module for more details.
7
25
  """
8
26
  import os
9
27
  import typing
@@ -94,7 +112,18 @@ def get_override_skypilot_config_path_from_client() -> Optional[str]:
94
112
  return skypilot_config.loaded_config_path_serialized()
95
113
 
96
114
 
97
- class RequestBody(pydantic.BaseModel):
115
+ class BasePayload(pydantic.BaseModel):
116
+ """The base payload for the SkyPilot API."""
117
+ # Ignore extra fields in the request body, which is useful for backward
118
+ # compatibility. The difference with `allow` is that `ignore` will not
119
+ # include the unknown fields when dumping the model, i.e., we can add new
120
+ # fields to the request body without breaking the existing old API server
121
+ # where the handler function does not accept the new field in function
122
+ # signature.
123
+ model_config = pydantic.ConfigDict(extra='ignore')
124
+
125
+
126
+ class RequestBody(BasePayload):
98
127
  """The request body for the SkyPilot API."""
99
128
  env_vars: Dict[str, str] = {}
100
129
  entrypoint: str = ''
@@ -103,11 +132,6 @@ class RequestBody(pydantic.BaseModel):
103
132
  override_skypilot_config: Optional[Dict[str, Any]] = {}
104
133
  override_skypilot_config_path: Optional[str] = None
105
134
 
106
- # Allow extra fields in the request body, which is useful for backward
107
- # compatibility, i.e., we can add new fields to the request body without
108
- # breaking the existing old API server.
109
- model_config = pydantic.ConfigDict(extra='allow')
110
-
111
135
  def __init__(self, **data):
112
136
  data['env_vars'] = data.get('env_vars', request_body_env_vars())
113
137
  usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
@@ -665,3 +689,24 @@ class GetConfigBody(RequestBody):
665
689
  class CostReportBody(RequestBody):
666
690
  """The request body for the cost report endpoint."""
667
691
  days: Optional[int] = 30
692
+
693
+
694
+ class RequestPayload(BasePayload):
695
+ """The payload for the requests."""
696
+
697
+ request_id: str
698
+ name: str
699
+ entrypoint: str
700
+ request_body: str
701
+ status: str
702
+ created_at: float
703
+ user_id: str
704
+ return_value: str
705
+ error: str
706
+ pid: Optional[int]
707
+ schedule_type: str
708
+ user_name: Optional[str] = None
709
+ # Resources the request operates on.
710
+ cluster_name: Optional[str] = None
711
+ status_msg: Optional[str] = None
712
+ should_retry: bool = False
@@ -98,28 +98,6 @@ class ScheduleType(enum.Enum):
98
98
  SHORT = 'short'
99
99
 
100
100
 
101
- @dataclasses.dataclass
102
- class RequestPayload:
103
- """The payload for the requests."""
104
-
105
- request_id: str
106
- name: str
107
- entrypoint: str
108
- request_body: str
109
- status: str
110
- created_at: float
111
- user_id: str
112
- return_value: str
113
- error: str
114
- pid: Optional[int]
115
- schedule_type: str
116
- user_name: Optional[str] = None
117
- # Resources the request operates on.
118
- cluster_name: Optional[str] = None
119
- status_msg: Optional[str] = None
120
- should_retry: bool = False
121
-
122
-
123
101
  @dataclasses.dataclass
124
102
  class Request:
125
103
  """A SkyPilot API request."""
@@ -185,7 +163,7 @@ class Request:
185
163
  @classmethod
186
164
  def from_row(cls, row: Tuple[Any, ...]) -> 'Request':
187
165
  content = dict(zip(REQUEST_COLUMNS, row))
188
- return cls.decode(RequestPayload(**content))
166
+ return cls.decode(payloads.RequestPayload(**content))
189
167
 
190
168
  def to_row(self) -> Tuple[Any, ...]:
191
169
  payload = self.encode()
@@ -194,7 +172,7 @@ class Request:
194
172
  row.append(getattr(payload, k))
195
173
  return tuple(row)
196
174
 
197
- def readable_encode(self) -> RequestPayload:
175
+ def readable_encode(self) -> payloads.RequestPayload:
198
176
  """Serialize the SkyPilot API request for display purposes.
199
177
 
200
178
  This function should be called on the server side to serialize the
@@ -212,7 +190,7 @@ class Request:
212
190
  payloads.RequestBody), (self.name, self.request_body)
213
191
  user = global_user_state.get_user(self.user_id)
214
192
  user_name = user.name if user is not None else None
215
- return RequestPayload(
193
+ return payloads.RequestPayload(
216
194
  request_id=self.request_id,
217
195
  name=self.name,
218
196
  entrypoint=self.entrypoint.__name__,
@@ -230,12 +208,12 @@ class Request:
230
208
  should_retry=self.should_retry,
231
209
  )
232
210
 
233
- def encode(self) -> RequestPayload:
211
+ def encode(self) -> payloads.RequestPayload:
234
212
  """Serialize the SkyPilot API request."""
235
213
  assert isinstance(self.request_body,
236
214
  payloads.RequestBody), (self.name, self.request_body)
237
215
  try:
238
- return RequestPayload(
216
+ return payloads.RequestPayload(
239
217
  request_id=self.request_id,
240
218
  name=self.name,
241
219
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
@@ -264,7 +242,7 @@ class Request:
264
242
  raise
265
243
 
266
244
  @classmethod
267
- def decode(cls, payload: RequestPayload) -> 'Request':
245
+ def decode(cls, payload: payloads.RequestPayload) -> 'Request':
268
246
  """Deserialize the SkyPilot API request."""
269
247
  try:
270
248
  return cls(