skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (70)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +37 -6
  4. sky/backends/cloud_vm_ray_backend.py +41 -6
  5. sky/client/cli/command.py +22 -2
  6. sky/core.py +5 -0
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-7fd0cf9dbecff10f.js} +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/config.html +1 -1
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/infra/[context].html +1 -1
  19. sky/dashboard/out/infra.html +1 -1
  20. sky/dashboard/out/jobs/[job].html +1 -1
  21. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  22. sky/dashboard/out/jobs.html +1 -1
  23. sky/dashboard/out/users.html +1 -1
  24. sky/dashboard/out/volumes.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/execution.py +15 -0
  29. sky/global_user_state.py +102 -0
  30. sky/jobs/recovery_strategy.py +3 -0
  31. sky/jobs/server/core.py +4 -0
  32. sky/jobs/utils.py +9 -2
  33. sky/provision/__init__.py +3 -2
  34. sky/provision/aws/instance.py +5 -4
  35. sky/provision/azure/instance.py +5 -4
  36. sky/provision/cudo/instance.py +5 -4
  37. sky/provision/do/instance.py +5 -4
  38. sky/provision/fluidstack/instance.py +5 -4
  39. sky/provision/gcp/instance.py +5 -4
  40. sky/provision/hyperbolic/instance.py +5 -4
  41. sky/provision/kubernetes/instance.py +36 -6
  42. sky/provision/lambda_cloud/instance.py +5 -4
  43. sky/provision/nebius/instance.py +5 -4
  44. sky/provision/oci/instance.py +5 -4
  45. sky/provision/paperspace/instance.py +5 -4
  46. sky/provision/provisioner.py +6 -0
  47. sky/provision/runpod/instance.py +5 -4
  48. sky/provision/scp/instance.py +5 -5
  49. sky/provision/vast/instance.py +5 -5
  50. sky/provision/vsphere/instance.py +5 -4
  51. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  52. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  53. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  54. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  55. sky/serve/serve_utils.py +37 -3
  56. sky/skypilot_config.py +4 -4
  57. sky/users/permission.py +1 -1
  58. sky/utils/cli_utils/status_utils.py +9 -0
  59. sky/utils/db/db_utils.py +22 -1
  60. sky/utils/db/migration_utils.py +1 -1
  61. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +1 -1
  62. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +67 -66
  63. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  64. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  65. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  66. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
  67. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
  68. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
  69. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
  70. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/provision/gcp/instance.py CHANGED
@@ -4,7 +4,7 @@ import copy
  from multiprocessing import pool
  import re
  import time
- from typing import Any, Callable, Dict, Iterable, List, Optional, Type
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type

  from sky import sky_logging
  from sky.adaptors import gcp
@@ -61,7 +61,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  zone = provider_config['availability_zone']
@@ -84,7 +84,8 @@ def query_instances(
  )

  raw_statuses = {}
- statuses = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst_id, instance in instances.items():
  raw_status = instance[handler.STATUS_FIELD]
  raw_statuses[inst_id] = raw_status
@@ -98,7 +99,7 @@ def query_instances(
  status = None
  if non_terminated_only and status is None:
  continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)

  # GCP does not clean up preempted TPU VMs. We remove it ourselves.
  if handler == instance_utils.GCPTPUVMInstance:
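Across the provider backends touched in this release, query_instances() changes its return type from Dict[str, Optional[ClusterStatus]] to Dict[str, Tuple[Optional[ClusterStatus], Optional[str]]]: each instance now maps to a (status, reason) pair, and only backends that can supply a failure reason fill in the second element (the Kubernetes hunk below does; the other providers pass None). A minimal caller-side sketch of consuming that shape, where summarize_statuses and the plain strings standing in for status_lib.ClusterStatus values are illustrative assumptions rather than SkyPilot APIs:

from typing import Dict, Optional, Tuple

# Plain strings stand in for status_lib.ClusterStatus members here.
def summarize_statuses(
        statuses: Dict[str, Tuple[Optional[str], Optional[str]]]) -> None:
    """Print one line per instance, appending a failure reason if present."""
    for instance_id, (status, reason) in statuses.items():
        line = f'{instance_id}: {status}'
        if reason is not None:
            line += f' ({reason})'
        print(line)

summarize_statuses({
    'instance-1': ('UP', None),
    'instance-2': ('INIT', 'pod-xyz: OOMKilled'),
})
# instance-1: UP
# instance-2: INIT (pod-xyz: OOMKilled)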
sky/provision/hyperbolic/instance.py CHANGED
@@ -1,6 +1,6 @@
  """Hyperbolic instance provisioning."""
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -307,7 +307,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[dict] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """Returns the status of the specified instances for Hyperbolic."""
  del provider_config # unused
  # Fetch all instances for this cluster
@@ -319,7 +319,8 @@ def query_instances(
  # No instances found: return empty dict to indicate fully deleted
  return {}

- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for instance_id, instance in instances.items():
  try:
  raw_status = instance.get('status', 'unknown').lower()
@@ -328,7 +329,7 @@ def query_instances(
  status = hyperbolic_status.to_cluster_status()
  if non_terminated_only and status is None:
  continue
- statuses[instance_id] = status
+ statuses[instance_id] = (status, None)
  except utils.HyperbolicError as e:
  logger.warning(
  f'Failed to parse status for instance {instance_id}: {e}')
sky/provision/kubernetes/instance.py CHANGED
@@ -2,7 +2,7 @@
  import copy
  import json
  import time
- from typing import Any, Callable, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  from sky import exceptions
  from sky import sky_logging
@@ -1248,15 +1248,37 @@ def get_cluster_info(
  provider_config=provider_config)


+ def _get_pod_termination_reason(pod: Any) -> str:
+ reasons = []
+ if pod.status.container_statuses:
+ for container_status in pod.status.container_statuses:
+ terminated = container_status.state.terminated
+ if terminated:
+ exit_code = terminated.exit_code
+ reason = terminated.reason
+ if exit_code == 0:
+ # skip exit 0 (non-failed) just for sanity
+ continue
+ if reason is None:
+ # just in-case reason is None, have default for debugging
+ reason = f'exit({exit_code})'
+ reasons.append(reason)
+ # TODO (kyuds): later, if needed, query `last_state` too.
+
+ # Normally we will have a single container per pod for skypilot
+ # but doing this just in-case there are multiple containers.
+ return ' | '.join(reasons)
+
+
  def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  status_map = {
  'Pending': status_lib.ClusterStatus.INIT,
  'Running': status_lib.ClusterStatus.UP,
- 'Failed': None,
+ 'Failed': status_lib.ClusterStatus.INIT,
  'Unknown': None,
  'Succeeded': None,
  'Terminating': None,
@@ -1298,12 +1320,20 @@ def query_instances(
  f'status: {common_utils.format_exception(e)}')

  # Check if the pods are running or pending
- cluster_status = {}
+ cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for pod in pods:
- pod_status = status_map[pod.status.phase]
+ phase = pod.status.phase
+ pod_status = status_map[phase]
  if non_terminated_only and pod_status is None:
  continue
- cluster_status[pod.metadata.name] = pod_status
+ reason = None
+ if phase == 'Failed':
+ reason = _get_pod_termination_reason(pod)
+ logger.debug(f'Pod Status Reason(s): {reason}')
+ pod_name = pod.metadata.name
+ reason = f'{pod_name}: {reason}' if reason is not None else None
+ cluster_status[pod_name] = (pod_status, reason)
  return cluster_status


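The new _get_pod_termination_reason() helper above walks pod.status.container_statuses, skips containers that exited with code 0, and joins the remaining termination reasons, falling back to exit(<code>) when the API reports no reason; query_instances() then attaches that string (prefixed with the pod name) to pods in the Failed phase, which now map to INIT instead of None. A self-contained sketch of the same extraction logic against a mocked pod object, where SimpleNamespace merely stands in for the Kubernetes client's pod structure:

from types import SimpleNamespace

def pod_termination_reason(pod) -> str:
    """Mirrors the joining logic in the hunk above."""
    reasons = []
    if pod.status.container_statuses:
        for container_status in pod.status.container_statuses:
            terminated = container_status.state.terminated
            if terminated and terminated.exit_code != 0:
                # Fall back to the exit code when no reason is reported.
                reasons.append(terminated.reason or
                               f'exit({terminated.exit_code})')
    return ' | '.join(reasons)

mock_pod = SimpleNamespace(status=SimpleNamespace(container_statuses=[
    SimpleNamespace(state=SimpleNamespace(
        terminated=SimpleNamespace(exit_code=137, reason='OOMKilled'))),
]))
print(pod_termination_reason(mock_pod))  # OOMKilled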
sky/provision/lambda_cloud/instance.py CHANGED
@@ -1,7 +1,7 @@
  """Lambda Cloud instance provisioning."""

  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -229,7 +229,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -240,12 +240,13 @@ def query_instances(
  'unhealthy': status_lib.ClusterStatus.INIT,
  'terminating': None,
  }
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for instance_id, instance in instances.items():
  status = status_map.get(instance['status'])
  if non_terminated_only and status is None:
  continue
- statuses[instance_id] = status
+ statuses[instance_id] = (status, None)
  return statuses


sky/provision/nebius/instance.py CHANGED
@@ -1,6 +1,6 @@
  """Nebius instance provisioning."""
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -250,7 +250,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  instances = _filter_instances(provider_config['region'],
@@ -263,12 +263,13 @@ def query_instances(
  'STOPPING': status_lib.ClusterStatus.STOPPED,
  'DELETING': status_lib.ClusterStatus.STOPPED,
  }
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst_id, inst in instances.items():
  status = status_map[inst['status']]
  if non_terminated_only and status is None:
  continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
  return statuses


sky/provision/oci/instance.py CHANGED
@@ -10,7 +10,7 @@ import copy
  from datetime import datetime
  import time
  import typing
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import exceptions
  from sky import sky_logging
@@ -35,7 +35,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """Query instances.

  Returns a dictionary of instance IDs and status.
@@ -47,7 +47,8 @@ def query_instances(
  region = provider_config['region']

  status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}

  instances = _get_filtered_nodes(region, filters)
@@ -56,7 +57,7 @@ def query_instances(
  sky_status = status_map[vm_status]
  if non_terminated_only and sky_status is None:
  continue
- statuses[node['inst_id']] = sky_status
+ statuses[node['inst_id']] = (sky_status, None)

  return statuses

sky/provision/paperspace/instance.py CHANGED
@@ -1,7 +1,7 @@
  """Paperspace instance provisioning."""

  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -280,7 +280,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  del non_terminated_only
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
@@ -297,10 +297,11 @@ def query_instances(
  'ready': status_lib.ClusterStatus.UP,
  'off': status_lib.ClusterStatus.STOPPED,
  }
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst_id, inst in instances.items():
  status = status_map[inst['state']]
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
  return statuses


sky/provision/provisioner.py CHANGED
@@ -100,6 +100,12 @@ def _bulk_provision(
  f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
  f'seconds.')

+ # Add cluster event for provisioning completion.
+ global_user_state.add_cluster_event(
+ str(cluster_name), status_lib.ClusterStatus.INIT,
+ f'Instances launched on {cloud.display_name()} in {region}',
+ global_user_state.ClusterEventType.STATUS_CHANGE)
+
  return provision_record


sky/provision/runpod/instance.py CHANGED
@@ -1,6 +1,6 @@
  """RunPod instance provisioning."""
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -204,7 +204,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -215,12 +215,13 @@ def query_instances(
  'PAUSED': status_lib.ClusterStatus.INIT,
  'RUNNING': status_lib.ClusterStatus.UP,
  }
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst_id, inst in instances.items():
  status = status_map[inst['status']]
  if non_terminated_only and status is None:
  continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
  return statuses


sky/provision/scp/instance.py CHANGED
@@ -4,7 +4,7 @@ import logging
  import random
  import string
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky.clouds.utils import scp_utils
  from sky.provision import common
@@ -430,8 +430,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
-
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  instances = _filter_instances(cluster_name_on_cloud, None)

@@ -447,12 +446,13 @@ def query_instances(
  'TERMINATED': None,
  }

- statuses = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for instance in instances:
  status = status_map[instance['virtualServerState']]
  if non_terminated_only and status is None:
  continue
- statuses[instance['virtualServerId']] = status
+ statuses[instance['virtualServerId']] = (status, None)
  return statuses


sky/provision/vast/instance.py CHANGED
@@ -1,6 +1,6 @@
  """Vast instance provisioning."""
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.provision import common
@@ -219,9 +219,8 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
-
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
  instances = _filter_instances(cluster_name_on_cloud, None)
  # "running", "frozen", "stopped", "unknown", "loading"
@@ -231,12 +230,13 @@ def query_instances(
  'STOPPED': status_lib.ClusterStatus.STOPPED,
  'RUNNING': status_lib.ClusterStatus.UP,
  }
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst_id, inst in instances.items():
  status = status_map[inst['status']]
  if non_terminated_only and status is None:
  continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
  return statuses


sky/provision/vsphere/instance.py CHANGED
@@ -1,7 +1,7 @@
  """Vsphere instance provisioning."""
  import json
  import typing
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  from sky import sky_logging
  from sky.adaptors import common as adaptors_common
@@ -396,7 +396,7 @@ def query_instances(
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
  """See sky/provision/__init__.py"""
  logger.info('New provision of Vsphere: query_instances().')
  assert provider_config is not None, cluster_name_on_cloud
@@ -413,12 +413,13 @@ def query_instances(
  'suspended': None,
  }

- status = {}
+ status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
  for inst in instances:
  stat = status_map[inst.runtime.powerState]
  if non_terminated_only and stat is None:
  continue
- status[inst.summary.config.instanceUuid] = stat
+ status[inst.summary.config.instanceUuid] = (stat, None)
  vc_object.disconnect()
  return status

sky/schemas/db/global_user_state/001_initial_schema.py CHANGED
@@ -22,7 +22,7 @@ depends_on = None
  def upgrade():
  with op.get_context().autocommit_block():
  # Create any missing tables with current schema first
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())

  # Add all missing columns to clusters table
  # This allows each column addition to fail independently without rolling
sky/schemas/db/global_user_state/005_cluster_event.py ADDED
@@ -0,0 +1,32 @@
+ """Columns for whether the cluster is managed.
+
+ Revision ID: 005
+ Revises: 004
+ Create Date: 2025-08-08
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+
+ from sky.global_user_state import Base
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '005'
+ down_revision: Union[str, Sequence[str], None] = '004'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+ """Add new table for cluster events."""
+ with op.get_context().autocommit_block():
+ # Add new table for cluster events.
+ db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
+ 'cluster_events')
+
+
+ def downgrade():
+ pass
sky/schemas/db/serve_state/001_initial_schema.py CHANGED
@@ -26,7 +26,7 @@ def upgrade():
  """Create initial schema and add all backwards compatibility columns"""
  with op.get_context().autocommit_block():
  # Create all tables with their current schema
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())

  # Add backwards compatibility columns using helper function that matches
  # original add_column_to_table_sqlalchemy behavior exactly
sky/schemas/db/spot_jobs/001_initial_schema.py CHANGED
@@ -26,7 +26,7 @@ def upgrade():
  """Create initial schema and add all backwards compatibility columns"""
  with op.get_context().autocommit_block():
  # Create all tables with their current schema
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())

  # Add backwards compatibility columns using helper function that matches
  # original add_column_to_table_sqlalchemy behavior exactly
sky/serve/serve_utils.py CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
  from sky.utils import annotations
  from sky.utils import command_runner
  from sky.utils import common_utils
+ from sky.utils import controller_utils
  from sky.utils import log_utils
  from sky.utils import message_utils
  from sky.utils import resources_utils
@@ -259,14 +260,47 @@ def get_service_filelock_path(pool: str) -> str:
  return str(path)


+ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+ pool: bool) -> None:
+ """Validate the consolidation mode config."""
+ # Check whether the consolidation mode config is changed.
+ controller = controller_utils.get_controller_for_pool(pool).value
+ if current_is_consolidation_mode:
+ controller_cn = controller.cluster_name
+ if global_user_state.get_cluster_from_name(controller_cn) is not None:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.InconsistentConsolidationModeError(
+ f'{colorama.Fore.RED}Consolidation mode for '
+ f'{controller.controller_type} is enabled, but the '
+ f'controller cluster {controller_cn} is still running. '
+ 'Please terminate the controller cluster first.'
+ f'{colorama.Style.RESET_ALL}')
+ else:
+ noun = 'pool' if pool else 'service'
+ all_services = [
+ svc for svc in serve_state.get_services() if svc['pool'] == pool
+ ]
+ if all_services:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.InconsistentConsolidationModeError(
+ f'{colorama.Fore.RED}Consolidation mode for '
+ f'{controller.controller_type} is disabled, but there are '
+ f'still {len(all_services)} {noun}s running. Please '
+ f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
  @annotations.lru_cache(scope='request', maxsize=1)
  def is_consolidation_mode(pool: bool = False) -> bool:
  # Use jobs config for pool consolidation mode.
- controller_type = 'jobs' if pool else 'serve'
+ controller = controller_utils.get_controller_for_pool(pool).value
  consolidation_mode = skypilot_config.get_nested(
- (controller_type, 'controller', 'consolidation_mode'),
+ (controller.controller_type, 'controller', 'consolidation_mode'),
  default_value=False)
- # _check_consolidation_mode_consistency(consolidation_mode, pool)
+ # We should only do this check on API server, as the controller will not
+ # have related config and will always seemingly disabled for consolidation
+ # mode. Check #6611 for more details.
+ if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+ _validate_consolidation_mode_config(consolidation_mode, pool)
  return consolidation_mode


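is_consolidation_mode() still reads the flag from the (<controller_type>, 'controller', 'consolidation_mode') path in the SkyPilot config, now resolving the controller through controller_utils.get_controller_for_pool(), and the new consistency check only runs when skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER marks the process as the API server. A toy sketch of that nested lookup over a plain dict; get_nested below is a local stand-in, not the real skypilot_config.get_nested:

from collections.abc import Mapping
from typing import Any, Tuple

def get_nested(config: Mapping, keys: Tuple[str, ...],
               default_value: Any = None) -> Any:
    """Walk a nested mapping, returning default_value on any missing key."""
    node: Any = config
    for key in keys:
        if not isinstance(node, Mapping) or key not in node:
            return default_value
        node = node[key]
    return node

# 'serve' for services; per the removed line, pools read the 'jobs' section.
config = {'serve': {'controller': {'consolidation_mode': True}}}
print(get_nested(config, ('serve', 'controller', 'consolidation_mode'),
                 default_value=False))  # True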
sky/skypilot_config.py CHANGED
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
  with _DB_USE_LOCK:
  sqlalchemy_engine = sqlalchemy.create_engine(db_url,
  poolclass=NullPool)
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
- sqlalchemy_engine)
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
+ sqlalchemy_engine)

  def _get_config_yaml_from_db(
  key: str) -> Optional[config_utils.Config]:
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
  with _DB_USE_LOCK:
  sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
  poolclass=NullPool)
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
- sqlalchemy_engine)
+ db_utils.add_all_tables_to_db_sqlalchemy(
+ Base.metadata, sqlalchemy_engine)

  def _set_config_yaml_to_db(key: str,
  config: config_utils.Config):
sky/users/permission.py CHANGED
@@ -44,7 +44,7 @@ class PermissionService:
  if _enforcer_instance is None:
  _enforcer_instance = self
  engine = global_user_state.initialize_and_get_db()
- db_utils.add_tables_to_db_sqlalchemy(
+ db_utils.add_all_tables_to_db_sqlalchemy(
  sqlalchemy_adapter.Base.metadata, engine)
  adapter = sqlalchemy_adapter.Adapter(engine)
  model_path = os.path.join(os.path.dirname(__file__),
sky/utils/cli_utils/status_utils.py CHANGED
@@ -81,6 +81,7 @@ def show_status_table(cluster_records: List[_ClusterRecord],
  _get_command,
  truncate=not show_all,
  show_by_default=False),
+ StatusColumn('LAST_EVENT', _get_last_event, show_by_default=False),
  ]

  columns = []
@@ -314,6 +315,14 @@ def _get_head_ip(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
  return handle.head_ip


+ def _get_last_event(cluster_record: _ClusterRecord,
+ truncate: bool = True) -> str:
+ del truncate
+ if cluster_record.get('last_event', None) is None:
+ return 'No recorded events.'
+ return cluster_record['last_event']
+
+
  def _is_pending_autostop(cluster_record: _ClusterRecord) -> bool:
  # autostop < 0 means nothing scheduled.
  return cluster_record['autostop'] >= 0 and _get_status(
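The new LAST_EVENT column is hidden unless explicitly requested (show_by_default=False), and its getter falls back to a fixed message when the cluster record carries no last_event field. A stripped-down rendition against plain dict records, using neither SkyPilot's _ClusterRecord nor StatusColumn types, with example event text only:

from typing import Any, Dict

def get_last_event(cluster_record: Dict[str, Any]) -> str:
    """Same fallback behavior as _get_last_event in the hunk above."""
    if cluster_record.get('last_event', None) is None:
        return 'No recorded events.'
    return cluster_record['last_event']

print(get_last_event({'name': 'my-cluster'}))
# No recorded events.
print(get_last_event({'name': 'my-cluster',
                      'last_event': 'Instances launched on AWS in us-east-1'}))
# Instances launched on AWS in us-east-1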
sky/utils/db/db_utils.py CHANGED
@@ -87,7 +87,7 @@ def add_column_to_table(
  conn.commit()


- def add_tables_to_db_sqlalchemy(
+ def add_all_tables_to_db_sqlalchemy(
  metadata: sqlalchemy.MetaData,
  engine: sqlalchemy.Engine,
  ):
@@ -103,6 +103,27 @@ def add_tables_to_db_sqlalchemy(
  raise


+ def add_table_to_db_sqlalchemy(
+ metadata: sqlalchemy.MetaData,
+ engine: sqlalchemy.Engine,
+ table_name: str,
+ ):
+ """Add a specific table to the database."""
+ try:
+ table = metadata.tables[table_name]
+ except KeyError as e:
+ raise e
+
+ try:
+ table.create(bind=engine, checkfirst=True)
+ except (sqlalchemy_exc.OperationalError,
+ sqlalchemy_exc.ProgrammingError) as e:
+ if 'already exists' in str(e):
+ pass
+ else:
+ raise
+
+
  def add_column_to_table_sqlalchemy(
  session: 'Session',
  table_name: str,
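The new add_table_to_db_sqlalchemy() creates exactly one table from the given metadata (the 005 migration above uses it for cluster_events) and tolerates an 'already exists' error instead of failing. A minimal usage sketch against an in-memory SQLite database; the two-column cluster_events definition here is a placeholder, not SkyPilot's real schema:

import sqlalchemy

metadata = sqlalchemy.MetaData()
sqlalchemy.Table(
    'cluster_events', metadata,
    sqlalchemy.Column('cluster_name', sqlalchemy.Text),
    sqlalchemy.Column('message', sqlalchemy.Text),
)
engine = sqlalchemy.create_engine('sqlite:///:memory:')

# Inline equivalent of:
#   db_utils.add_table_to_db_sqlalchemy(metadata, engine, 'cluster_events')
table = metadata.tables['cluster_events']
table.create(bind=engine, checkfirst=True)  # no-op if the table exists

print(sqlalchemy.inspect(engine).get_table_names())  # ['cluster_events']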
sky/utils/db/migration_utils.py CHANGED
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
  DB_INIT_LOCK_TIMEOUT_SECONDS = 10

  GLOBAL_USER_STATE_DB_NAME = 'state_db'
- GLOBAL_USER_STATE_VERSION = '004'
+ GLOBAL_USER_STATE_VERSION = '005'
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'

  SPOT_JOBS_DB_NAME = 'spot_jobs_db'