skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (91)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +57 -7
  4. sky/backends/cloud_vm_ray_backend.py +50 -8
  5. sky/client/cli/command.py +60 -26
  6. sky/client/sdk.py +132 -65
  7. sky/client/sdk_async.py +1 -1
  8. sky/core.py +10 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +21 -4
  34. sky/global_user_state.py +110 -1
  35. sky/jobs/client/sdk.py +27 -20
  36. sky/jobs/controller.py +2 -1
  37. sky/jobs/recovery_strategy.py +3 -0
  38. sky/jobs/server/core.py +4 -0
  39. sky/jobs/utils.py +9 -2
  40. sky/provision/__init__.py +3 -2
  41. sky/provision/aws/instance.py +5 -4
  42. sky/provision/azure/instance.py +5 -4
  43. sky/provision/cudo/instance.py +5 -4
  44. sky/provision/do/instance.py +5 -4
  45. sky/provision/fluidstack/instance.py +5 -4
  46. sky/provision/gcp/instance.py +5 -4
  47. sky/provision/hyperbolic/instance.py +5 -4
  48. sky/provision/kubernetes/instance.py +36 -6
  49. sky/provision/lambda_cloud/instance.py +5 -4
  50. sky/provision/nebius/instance.py +5 -4
  51. sky/provision/oci/instance.py +5 -4
  52. sky/provision/paperspace/instance.py +5 -4
  53. sky/provision/provisioner.py +6 -0
  54. sky/provision/runpod/instance.py +5 -4
  55. sky/provision/scp/instance.py +5 -5
  56. sky/provision/vast/instance.py +5 -5
  57. sky/provision/vsphere/instance.py +5 -4
  58. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  59. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  60. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  61. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  62. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  63. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  64. sky/serve/client/impl.py +11 -8
  65. sky/serve/client/sdk.py +7 -7
  66. sky/serve/serve_state.py +437 -340
  67. sky/serve/serve_utils.py +37 -3
  68. sky/serve/server/impl.py +2 -2
  69. sky/server/common.py +12 -8
  70. sky/server/constants.py +1 -1
  71. sky/setup_files/alembic.ini +4 -0
  72. sky/skypilot_config.py +4 -4
  73. sky/users/permission.py +1 -1
  74. sky/utils/cli_utils/status_utils.py +10 -1
  75. sky/utils/db/db_utils.py +53 -1
  76. sky/utils/db/migration_utils.py +5 -1
  77. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  78. sky/utils/resource_checker.py +162 -21
  79. sky/volumes/client/sdk.py +4 -4
  80. sky/workspaces/core.py +210 -6
  81. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
  82. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
  83. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
  85. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
  87. /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
  88. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
  89. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
  90. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -141,7 +141,7 @@ def _validate_consolidation_mode_config(
141
141
  if global_user_state.get_cluster_from_name(controller_cn) is not None:
142
142
  with ux_utils.print_exception_no_traceback():
143
143
  raise exceptions.InconsistentConsolidationModeError(
144
- f'{colorama.Fore.RED}Consolidation mode is '
144
+ f'{colorama.Fore.RED}Consolidation mode for jobs is '
145
145
  f'enabled, but the controller cluster '
146
146
  f'{controller_cn} is still running. Please '
147
147
  'terminate the controller cluster first.'
@@ -179,7 +179,11 @@ def _validate_consolidation_mode_config(
179
179
  def is_consolidation_mode() -> bool:
180
180
  consolidation_mode = skypilot_config.get_nested(
181
181
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
182
- _validate_consolidation_mode_config(consolidation_mode)
182
+ # We should only do this check on API server, as the controller will not
183
+ # have the related config, so consolidation mode will always appear
184
+ # disabled there. Check #6611 for more details.
185
+ if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
186
+ _validate_consolidation_mode_config(consolidation_mode)
183
187
  return consolidation_mode
184
188
 
185
189
 
@@ -333,6 +337,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
333
337
  if handle is not None:
334
338
  try:
335
339
  if pool is None:
340
+ global_user_state.add_cluster_event(
341
+ cluster_name, None, 'Cluster was cleaned up.',
342
+ global_user_state.ClusterEventType.STATUS_CHANGE)
336
343
  terminate_cluster(cluster_name)
337
344
  except Exception as e: # pylint: disable=broad-except
338
345
  error_msg = (
sky/provision/__init__.py CHANGED
@@ -76,10 +76,11 @@ def query_instances(
76
76
  cluster_name_on_cloud: str,
77
77
  provider_config: Optional[Dict[str, Any]] = None,
78
78
  non_terminated_only: bool = True,
79
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
79
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
80
80
  """Query instances.
81
81
 
82
- Returns a dictionary of instance IDs and status.
82
+ Returns a dictionary of instance IDs and a tuple of (status, reason for
83
+ being in status if any).
83
84
 
84
85
  A None status means the instance is marked as "terminated"
85
86
  or "terminating".
@@ -10,7 +10,7 @@ from multiprocessing import pool
10
10
  import re
11
11
  import time
12
12
  import typing
13
- from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
13
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
14
14
 
15
15
  from sky import sky_logging
16
16
  from sky.adaptors import aws
@@ -588,7 +588,7 @@ def query_instances(
588
588
  cluster_name_on_cloud: str,
589
589
  provider_config: Optional[Dict[str, Any]] = None,
590
590
  non_terminated_only: bool = True,
591
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
591
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
592
592
  """See sky/provision/__init__.py"""
593
593
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
594
594
  region = provider_config['region']
@@ -608,12 +608,13 @@ def query_instances(
608
608
  'shutting-down': None,
609
609
  'terminated': None,
610
610
  }
611
- statuses = {}
611
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
612
+ Optional[str]]] = {}
612
613
  for inst in instances:
613
614
  status = status_map[inst.state['Name']]
614
615
  if non_terminated_only and status is None:
615
616
  continue
616
- statuses[inst.id] = status
617
+ statuses[inst.id] = (status, None)
617
618
  return statuses
618
619
 
619
620
 
@@ -955,7 +955,7 @@ def query_instances(
955
955
  cluster_name_on_cloud: str,
956
956
  provider_config: Optional[Dict[str, Any]] = None,
957
957
  non_terminated_only: bool = True,
958
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
958
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
959
959
  """See sky/provision/__init__.py"""
960
960
  assert provider_config is not None, cluster_name_on_cloud
961
961
 
@@ -964,7 +964,8 @@ def query_instances(
964
964
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
965
965
  compute_client = azure.get_client('compute', subscription_id)
966
966
  nodes = _filter_instances(compute_client, resource_group, filters)
967
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
967
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
968
+ Optional[str]]] = {}
968
969
 
969
970
  def _fetch_and_map_status(node, resource_group: str) -> None:
970
971
  compute_client = azure.get_client('compute', subscription_id)
@@ -972,8 +973,8 @@ def query_instances(
972
973
 
973
974
  if status is None and non_terminated_only:
974
975
  return
975
- statuses[node.name] = (None if status is None else
976
- status.to_cluster_status())
976
+ statuses[node.name] = ((None if status is None else
977
+ status.to_cluster_status()), None)
977
978
 
978
979
  with pool.ThreadPool() as p:
979
980
  p.starmap(_fetch_and_map_status,
@@ -1,7 +1,7 @@
1
1
  """Cudo Compute instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -194,7 +194,7 @@ def query_instances(
194
194
  cluster_name_on_cloud: str,
195
195
  provider_config: Optional[Dict[str, Any]] = None,
196
196
  non_terminated_only: bool = True,
197
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
197
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
198
198
  """See sky/provision/__init__.py"""
199
199
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
200
200
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -210,12 +210,13 @@ def query_instances(
210
210
  'done': status_lib.ClusterStatus.STOPPED,
211
211
  'poff': status_lib.ClusterStatus.STOPPED,
212
212
  }
213
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
213
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
214
+ Optional[str]]] = {}
214
215
  for inst_id, inst in instances.items():
215
216
  status = status_map[inst['status']]
216
217
  if non_terminated_only and status is None:
217
218
  continue
218
- statuses[inst_id] = status
219
+ statuses[inst_id] = (status, None)
219
220
  return statuses
220
221
 
221
222
 
@@ -1,7 +1,7 @@
1
1
  """DigitalOcean instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
  import uuid
6
6
 
7
7
  from sky import sky_logging
@@ -245,7 +245,7 @@ def query_instances(
245
245
  cluster_name_on_cloud: str,
246
246
  provider_config: Optional[Dict[str, Any]] = None,
247
247
  non_terminated_only: bool = True,
248
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
248
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
249
249
  """See sky/provision/__init__.py"""
250
250
  # terminated instances are not retrieved by the
251
251
  # API making `non_terminated_only` argument moot.
@@ -260,10 +260,11 @@ def query_instances(
260
260
  'active': status_lib.ClusterStatus.UP,
261
261
  'off': status_lib.ClusterStatus.STOPPED,
262
262
  }
263
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
263
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
264
+ Optional[str]]] = {}
264
265
  for instance_meta in instances.values():
265
266
  status = status_map[instance_meta['status']]
266
- statuses[instance_meta['name']] = status
267
+ statuses[instance_meta['name']] = (status, None)
267
268
  return statuses
268
269
 
269
270
 
@@ -1,7 +1,7 @@
1
1
  """FluidStack instance provisioning."""
2
2
  import os
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import authentication as auth
7
7
  from sky import exceptions
@@ -290,7 +290,7 @@ def query_instances(
290
290
  cluster_name_on_cloud: str,
291
291
  provider_config: Optional[Dict[str, Any]] = None,
292
292
  non_terminated_only: bool = True,
293
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
293
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
294
294
  """See sky/provision/__init__.py"""
295
295
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
296
296
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -302,7 +302,8 @@ def query_instances(
302
302
  'failed': status_lib.ClusterStatus.INIT,
303
303
  'terminated': None,
304
304
  }
305
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
305
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
306
+ Optional[str]]] = {}
306
307
  for inst_id, inst in instances.items():
307
308
  if inst['status'] not in status_map:
308
309
  with ux_utils.print_exception_no_traceback():
@@ -311,7 +312,7 @@ def query_instances(
311
312
  status = status_map.get(inst['status'], None)
312
313
  if non_terminated_only and status is None:
313
314
  continue
314
- statuses[inst_id] = status
315
+ statuses[inst_id] = (status, None)
315
316
  return statuses
316
317
 
317
318
 
@@ -4,7 +4,7 @@ import copy
4
4
  from multiprocessing import pool
5
5
  import re
6
6
  import time
7
- from typing import Any, Callable, Dict, Iterable, List, Optional, Type
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import gcp
@@ -61,7 +61,7 @@ def query_instances(
61
61
  cluster_name_on_cloud: str,
62
62
  provider_config: Optional[Dict[str, Any]] = None,
63
63
  non_terminated_only: bool = True,
64
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
64
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
65
65
  """See sky/provision/__init__.py"""
66
66
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
67
67
  zone = provider_config['availability_zone']
@@ -84,7 +84,8 @@ def query_instances(
84
84
  )
85
85
 
86
86
  raw_statuses = {}
87
- statuses = {}
87
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
88
+ Optional[str]]] = {}
88
89
  for inst_id, instance in instances.items():
89
90
  raw_status = instance[handler.STATUS_FIELD]
90
91
  raw_statuses[inst_id] = raw_status
@@ -98,7 +99,7 @@ def query_instances(
98
99
  status = None
99
100
  if non_terminated_only and status is None:
100
101
  continue
101
- statuses[inst_id] = status
102
+ statuses[inst_id] = (status, None)
102
103
 
103
104
  # GCP does not clean up preempted TPU VMs. We remove it ourselves.
104
105
  if handler == instance_utils.GCPTPUVMInstance:
@@ -1,6 +1,6 @@
1
1
  """Hyperbolic instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -307,7 +307,7 @@ def query_instances(
307
307
  cluster_name_on_cloud: str,
308
308
  provider_config: Optional[dict] = None,
309
309
  non_terminated_only: bool = True,
310
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
310
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
311
311
  """Returns the status of the specified instances for Hyperbolic."""
312
312
  del provider_config # unused
313
313
  # Fetch all instances for this cluster
@@ -319,7 +319,8 @@ def query_instances(
319
319
  # No instances found: return empty dict to indicate fully deleted
320
320
  return {}
321
321
 
322
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
322
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
323
+ Optional[str]]] = {}
323
324
  for instance_id, instance in instances.items():
324
325
  try:
325
326
  raw_status = instance.get('status', 'unknown').lower()
@@ -328,7 +329,7 @@ def query_instances(
328
329
  status = hyperbolic_status.to_cluster_status()
329
330
  if non_terminated_only and status is None:
330
331
  continue
331
- statuses[instance_id] = status
332
+ statuses[instance_id] = (status, None)
332
333
  except utils.HyperbolicError as e:
333
334
  logger.warning(
334
335
  f'Failed to parse status for instance {instance_id}: {e}')
@@ -2,7 +2,7 @@
2
2
  import copy
3
3
  import json
4
4
  import time
5
- from typing import Any, Callable, Dict, List, Optional, Union
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky import exceptions
8
8
  from sky import sky_logging
@@ -1248,15 +1248,37 @@ def get_cluster_info(
1248
1248
  provider_config=provider_config)
1249
1249
 
1250
1250
 
1251
+ def _get_pod_termination_reason(pod: Any) -> str:
1252
+ reasons = []
1253
+ if pod.status.container_statuses:
1254
+ for container_status in pod.status.container_statuses:
1255
+ terminated = container_status.state.terminated
1256
+ if terminated:
1257
+ exit_code = terminated.exit_code
1258
+ reason = terminated.reason
1259
+ if exit_code == 0:
1260
+ # skip exit 0 (non-failed) just for sanity
1261
+ continue
1262
+ if reason is None:
1263
+ # In case reason is None, fall back to the exit code for debugging.
1264
+ reason = f'exit({exit_code})'
1265
+ reasons.append(reason)
1266
+ # TODO (kyuds): later, if needed, query `last_state` too.
1267
+
1268
+ # Normally we will have a single container per pod for skypilot
1269
+ # but doing this just in case there are multiple containers.
1270
+ return ' | '.join(reasons)
1271
+
1272
+
1251
1273
  def query_instances(
1252
1274
  cluster_name_on_cloud: str,
1253
1275
  provider_config: Optional[Dict[str, Any]] = None,
1254
1276
  non_terminated_only: bool = True
1255
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
1277
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
1256
1278
  status_map = {
1257
1279
  'Pending': status_lib.ClusterStatus.INIT,
1258
1280
  'Running': status_lib.ClusterStatus.UP,
1259
- 'Failed': None,
1281
+ 'Failed': status_lib.ClusterStatus.INIT,
1260
1282
  'Unknown': None,
1261
1283
  'Succeeded': None,
1262
1284
  'Terminating': None,
@@ -1298,12 +1320,20 @@ def query_instances(
1298
1320
  f'status: {common_utils.format_exception(e)}')
1299
1321
 
1300
1322
  # Check if the pods are running or pending
1301
- cluster_status = {}
1323
+ cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
1324
+ Optional[str]]] = {}
1302
1325
  for pod in pods:
1303
- pod_status = status_map[pod.status.phase]
1326
+ phase = pod.status.phase
1327
+ pod_status = status_map[phase]
1304
1328
  if non_terminated_only and pod_status is None:
1305
1329
  continue
1306
- cluster_status[pod.metadata.name] = pod_status
1330
+ reason = None
1331
+ if phase == 'Failed':
1332
+ reason = _get_pod_termination_reason(pod)
1333
+ logger.debug(f'Pod Status Reason(s): {reason}')
1334
+ pod_name = pod.metadata.name
1335
+ reason = f'{pod_name}: {reason}' if reason is not None else None
1336
+ cluster_status[pod_name] = (pod_status, reason)
1307
1337
  return cluster_status
1308
1338
 
1309
1339
 
@@ -1,7 +1,7 @@
1
1
  """Lambda Cloud instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -229,7 +229,7 @@ def query_instances(
229
229
  cluster_name_on_cloud: str,
230
230
  provider_config: Optional[Dict[str, Any]] = None,
231
231
  non_terminated_only: bool = True,
232
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
232
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
233
233
  """See sky/provision/__init__.py"""
234
234
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
235
235
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -240,12 +240,13 @@ def query_instances(
240
240
  'unhealthy': status_lib.ClusterStatus.INIT,
241
241
  'terminating': None,
242
242
  }
243
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
243
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
244
+ Optional[str]]] = {}
244
245
  for instance_id, instance in instances.items():
245
246
  status = status_map.get(instance['status'])
246
247
  if non_terminated_only and status is None:
247
248
  continue
248
- statuses[instance_id] = status
249
+ statuses[instance_id] = (status, None)
249
250
  return statuses
250
251
 
251
252
 
@@ -1,6 +1,6 @@
1
1
  """Nebius instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -250,7 +250,7 @@ def query_instances(
250
250
  cluster_name_on_cloud: str,
251
251
  provider_config: Optional[Dict[str, Any]] = None,
252
252
  non_terminated_only: bool = True,
253
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
253
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
254
254
  """See sky/provision/__init__.py"""
255
255
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
256
256
  instances = _filter_instances(provider_config['region'],
@@ -263,12 +263,13 @@ def query_instances(
263
263
  'STOPPING': status_lib.ClusterStatus.STOPPED,
264
264
  'DELETING': status_lib.ClusterStatus.STOPPED,
265
265
  }
266
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
266
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
267
+ Optional[str]]] = {}
267
268
  for inst_id, inst in instances.items():
268
269
  status = status_map[inst['status']]
269
270
  if non_terminated_only and status is None:
270
271
  continue
271
- statuses[inst_id] = status
272
+ statuses[inst_id] = (status, None)
272
273
  return statuses
273
274
 
274
275
 
@@ -10,7 +10,7 @@ import copy
10
10
  from datetime import datetime
11
11
  import time
12
12
  import typing
13
- from typing import Any, Dict, List, Optional
13
+ from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  from sky import exceptions
16
16
  from sky import sky_logging
@@ -35,7 +35,7 @@ def query_instances(
35
35
  cluster_name_on_cloud: str,
36
36
  provider_config: Optional[Dict[str, Any]] = None,
37
37
  non_terminated_only: bool = True,
38
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
38
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
39
39
  """Query instances.
40
40
 
41
41
  Returns a dictionary of instance IDs and status.
@@ -47,7 +47,8 @@ def query_instances(
47
47
  region = provider_config['region']
48
48
 
49
49
  status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
50
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
50
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
51
+ Optional[str]]] = {}
51
52
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
52
53
 
53
54
  instances = _get_filtered_nodes(region, filters)
@@ -56,7 +57,7 @@ def query_instances(
56
57
  sky_status = status_map[vm_status]
57
58
  if non_terminated_only and sky_status is None:
58
59
  continue
59
- statuses[node['inst_id']] = sky_status
60
+ statuses[node['inst_id']] = (sky_status, None)
60
61
 
61
62
  return statuses
62
63
 
@@ -1,7 +1,7 @@
1
1
  """Paperspace instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -280,7 +280,7 @@ def query_instances(
280
280
  cluster_name_on_cloud: str,
281
281
  provider_config: Optional[Dict[str, Any]] = None,
282
282
  non_terminated_only: bool = True,
283
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
283
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
284
284
  """See sky/provision/__init__.py"""
285
285
  del non_terminated_only
286
286
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
@@ -297,10 +297,11 @@ def query_instances(
297
297
  'ready': status_lib.ClusterStatus.UP,
298
298
  'off': status_lib.ClusterStatus.STOPPED,
299
299
  }
300
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
300
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
301
+ Optional[str]]] = {}
301
302
  for inst_id, inst in instances.items():
302
303
  status = status_map[inst['state']]
303
- statuses[inst_id] = status
304
+ statuses[inst_id] = (status, None)
304
305
  return statuses
305
306
 
306
307
 
@@ -100,6 +100,12 @@ def _bulk_provision(
100
100
  f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
101
101
  f'seconds.')
102
102
 
103
+ # Add cluster event for provisioning completion.
104
+ global_user_state.add_cluster_event(
105
+ str(cluster_name), status_lib.ClusterStatus.INIT,
106
+ f'Instances launched on {cloud.display_name()} in {region}',
107
+ global_user_state.ClusterEventType.STATUS_CHANGE)
108
+
103
109
  return provision_record
104
110
 
105
111
 
@@ -1,6 +1,6 @@
1
1
  """RunPod instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -204,7 +204,7 @@ def query_instances(
204
204
  cluster_name_on_cloud: str,
205
205
  provider_config: Optional[Dict[str, Any]] = None,
206
206
  non_terminated_only: bool = True,
207
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
207
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
208
208
  """See sky/provision/__init__.py"""
209
209
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
210
210
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -215,12 +215,13 @@ def query_instances(
215
215
  'PAUSED': status_lib.ClusterStatus.INIT,
216
216
  'RUNNING': status_lib.ClusterStatus.UP,
217
217
  }
218
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
218
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
219
+ Optional[str]]] = {}
219
220
  for inst_id, inst in instances.items():
220
221
  status = status_map[inst['status']]
221
222
  if non_terminated_only and status is None:
222
223
  continue
223
- statuses[inst_id] = status
224
+ statuses[inst_id] = (status, None)
224
225
  return statuses
225
226
 
226
227
 
@@ -4,7 +4,7 @@ import logging
4
4
  import random
5
5
  import string
6
6
  import time
7
- from typing import Any, Dict, List, Optional
7
+ from typing import Any, Dict, List, Optional, Tuple
8
8
 
9
9
  from sky.clouds.utils import scp_utils
10
10
  from sky.provision import common
@@ -430,8 +430,7 @@ def query_instances(
430
430
  cluster_name_on_cloud: str,
431
431
  provider_config: Optional[Dict[str, Any]] = None,
432
432
  non_terminated_only: bool = True,
433
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
434
-
433
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
435
434
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
436
435
  instances = _filter_instances(cluster_name_on_cloud, None)
437
436
 
@@ -447,12 +446,13 @@ def query_instances(
447
446
  'TERMINATED': None,
448
447
  }
449
448
 
450
- statuses = {}
449
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
450
+ Optional[str]]] = {}
451
451
  for instance in instances:
452
452
  status = status_map[instance['virtualServerState']]
453
453
  if non_terminated_only and status is None:
454
454
  continue
455
- statuses[instance['virtualServerId']] = status
455
+ statuses[instance['virtualServerId']] = (status, None)
456
456
  return statuses
457
457
 
458
458
 
@@ -1,6 +1,6 @@
1
1
  """Vast instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -219,9 +219,8 @@ def query_instances(
219
219
  cluster_name_on_cloud: str,
220
220
  provider_config: Optional[Dict[str, Any]] = None,
221
221
  non_terminated_only: bool = True,
222
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
222
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
223
223
  """See sky/provision/__init__.py"""
224
-
225
224
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
226
225
  instances = _filter_instances(cluster_name_on_cloud, None)
227
226
  # "running", "frozen", "stopped", "unknown", "loading"
@@ -231,12 +230,13 @@ def query_instances(
231
230
  'STOPPED': status_lib.ClusterStatus.STOPPED,
232
231
  'RUNNING': status_lib.ClusterStatus.UP,
233
232
  }
234
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
233
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
234
+ Optional[str]]] = {}
235
235
  for inst_id, inst in instances.items():
236
236
  status = status_map[inst['status']]
237
237
  if non_terminated_only and status is None:
238
238
  continue
239
- statuses[inst_id] = status
239
+ statuses[inst_id] = (status, None)
240
240
  return statuses
241
241
 
242
242
 
@@ -1,7 +1,7 @@
1
1
  """Vsphere instance provisioning."""
2
2
  import json
3
3
  import typing
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.adaptors import common as adaptors_common
@@ -396,7 +396,7 @@ def query_instances(
396
396
  cluster_name_on_cloud: str,
397
397
  provider_config: Optional[Dict[str, Any]] = None,
398
398
  non_terminated_only: bool = True,
399
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
399
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
400
400
  """See sky/provision/__init__.py"""
401
401
  logger.info('New provision of Vsphere: query_instances().')
402
402
  assert provider_config is not None, cluster_name_on_cloud
@@ -413,12 +413,13 @@ def query_instances(
413
413
  'suspended': None,
414
414
  }
415
415
 
416
- status = {}
416
+ status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
417
+ Optional[str]]] = {}
417
418
  for inst in instances:
418
419
  stat = status_map[inst.runtime.powerState]
419
420
  if non_terminated_only and stat is None:
420
421
  continue
421
- status[inst.summary.config.instanceUuid] = stat
422
+ status[inst.summary.config.instanceUuid] = (stat, None)
422
423
  vc_object.disconnect()
423
424
  return status
424
425
 
@@ -22,7 +22,7 @@ depends_on = None
22
22
  def upgrade():
23
23
  with op.get_context().autocommit_block():
24
24
  # Create any missing tables with current schema first
25
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
25
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
26
 
27
27
  # Add all missing columns to clusters table
28
28
  # This allows each column addition to fail independently without rolling