skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ import copy
4
4
  from multiprocessing import pool
5
5
  import re
6
6
  import time
7
- from typing import Any, Callable, Dict, Iterable, List, Optional, Type
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import gcp
@@ -58,11 +58,13 @@ def _filter_instances(
58
58
  # for terminated instances, if they have already been fully deleted.
59
59
  @common_utils.retry
60
60
  def query_instances(
61
+ cluster_name: str,
61
62
  cluster_name_on_cloud: str,
62
63
  provider_config: Optional[Dict[str, Any]] = None,
63
64
  non_terminated_only: bool = True,
64
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
65
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
65
66
  """See sky/provision/__init__.py"""
67
+ del cluster_name # unused
66
68
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
67
69
  zone = provider_config['availability_zone']
68
70
  project_id = provider_config['project_id']
@@ -84,7 +86,8 @@ def query_instances(
84
86
  )
85
87
 
86
88
  raw_statuses = {}
87
- statuses = {}
89
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
90
+ Optional[str]]] = {}
88
91
  for inst_id, instance in instances.items():
89
92
  raw_status = instance[handler.STATUS_FIELD]
90
93
  raw_statuses[inst_id] = raw_status
@@ -98,7 +101,7 @@ def query_instances(
98
101
  status = None
99
102
  if non_terminated_only and status is None:
100
103
  continue
101
- statuses[inst_id] = status
104
+ statuses[inst_id] = (status, None)
102
105
 
103
106
  # GCP does not clean up preempted TPU VMs. We remove it ourselves.
104
107
  if handler == instance_utils.GCPTPUVMInstance:
@@ -1,6 +1,6 @@
1
1
  """Hyperbolic instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -304,12 +304,13 @@ def get_cluster_info(
304
304
 
305
305
 
306
306
  def query_instances(
307
+ cluster_name: str,
307
308
  cluster_name_on_cloud: str,
308
309
  provider_config: Optional[dict] = None,
309
310
  non_terminated_only: bool = True,
310
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
311
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
311
312
  """Returns the status of the specified instances for Hyperbolic."""
312
- del provider_config # unused
313
+ del cluster_name, provider_config # unused
313
314
  # Fetch all instances for this cluster
314
315
  instances = utils.list_instances(
315
316
  metadata={'skypilot': {
@@ -319,7 +320,8 @@ def query_instances(
319
320
  # No instances found: return empty dict to indicate fully deleted
320
321
  return {}
321
322
 
322
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
323
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
324
+ Optional[str]]] = {}
323
325
  for instance_id, instance in instances.items():
324
326
  try:
325
327
  raw_status = instance.get('status', 'unknown').lower()
@@ -328,7 +330,7 @@ def query_instances(
328
330
  status = hyperbolic_status.to_cluster_status()
329
331
  if non_terminated_only and status is None:
330
332
  continue
331
- statuses[instance_id] = status
333
+ statuses[instance_id] = (status, None)
332
334
  except utils.HyperbolicError as e:
333
335
  logger.warning(
334
336
  f'Failed to parse status for instance {instance_id}: {e}')
@@ -1,10 +1,12 @@
1
1
  """Kubernetes instance provisioning."""
2
2
  import copy
3
3
  import json
4
+ import re
4
5
  import time
5
- from typing import Any, Callable, Dict, List, Optional, Union
6
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
7
 
7
8
  from sky import exceptions
9
+ from sky import global_user_state
8
10
  from sky import sky_logging
9
11
  from sky import skypilot_config
10
12
  from sky.adaptors import kubernetes
@@ -24,6 +26,7 @@ from sky.utils import status_lib
24
26
  from sky.utils import subprocess_utils
25
27
  from sky.utils import timeline
26
28
  from sky.utils import ux_utils
29
+ from sky.utils.db import db_utils
27
30
 
28
31
  POLL_INTERVAL = 2
29
32
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
@@ -1248,15 +1251,146 @@ def get_cluster_info(
1248
1251
  provider_config=provider_config)
1249
1252
 
1250
1253
 
1254
+ def _get_pod_termination_reason(pod: Any) -> str:
1255
+ reasons = []
1256
+ if pod.status.container_statuses:
1257
+ for container_status in pod.status.container_statuses:
1258
+ terminated = container_status.state.terminated
1259
+ if terminated:
1260
+ exit_code = terminated.exit_code
1261
+ reason = terminated.reason
1262
+ if exit_code == 0:
1263
+ # skip exit 0 (non-failed) just for sanity
1264
+ continue
1265
+ if reason is None:
1266
+ # just in case reason is None, fall back to a default for debugging
1267
+ reason = f'exit({exit_code})'
1268
+ reasons.append(reason)
1269
+ # TODO (kyuds): later, if needed, query `last_state` too.
1270
+
1271
+ # Normally we will have a single container per pod for skypilot
1272
+ # but doing this just in case there are multiple containers.
1273
+ return ' | '.join(reasons)
1274
+
1275
+
1276
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1277
+ cluster_name: str, pod_name: str) -> Optional[str]:
1278
+ logger.debug(f'Analyzing events for pod {pod_name}')
1279
+ pod_field_selector = (
1280
+ f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
1281
+ pod_events = kubernetes.core_api(context).list_namespaced_event(
1282
+ namespace,
1283
+ field_selector=pod_field_selector,
1284
+ _request_timeout=kubernetes.API_TIMEOUT).items
1285
+ pod_events = sorted(
1286
+ pod_events,
1287
+ key=lambda event: event.metadata.creation_timestamp,
1288
+ # latest event appears first
1289
+ reverse=True)
1290
+ last_scheduled_node = None
1291
+ insert_new_pod_event = True
1292
+ new_event_inserted = False
1293
+ for event in pod_events:
1294
+ if event.reason == 'Scheduled':
1295
+ pattern = r'Successfully assigned (\S+) to (\S+)'
1296
+ match = re.search(pattern, event.message)
1297
+ if match:
1298
+ scheduled_node = match.group(2)
1299
+ last_scheduled_node = scheduled_node
1300
+ if insert_new_pod_event:
1301
+ # Try inserting the latest events first. If the event is a
1302
+ # duplicate, it means the event (and any previous events) have
1303
+ # already been inserted - so do not insert further events.
1304
+ try:
1305
+ global_user_state.add_cluster_event(
1306
+ cluster_name,
1307
+ None, f'[kubernetes pod {pod_name}] '
1308
+ f'{event.reason} {event.message}',
1309
+ global_user_state.ClusterEventType.DEBUG,
1310
+ transitioned_at=int(
1311
+ event.metadata.creation_timestamp.timestamp()),
1312
+ expose_duplicate_error=True)
1313
+ except db_utils.UniqueConstraintViolationError:
1314
+ insert_new_pod_event = False
1315
+ else:
1316
+ new_event_inserted = True
1317
+
1318
+ if last_scheduled_node is not None:
1319
+ node_field_selector = ('involvedObject.kind=Node,'
1320
+ f'involvedObject.name={last_scheduled_node}')
1321
+ node_events = kubernetes.core_api(context).list_namespaced_event(
1322
+ namespace,
1323
+ field_selector=node_field_selector,
1324
+ _request_timeout=kubernetes.API_TIMEOUT).items
1325
+ node_events = sorted(
1326
+ node_events,
1327
+ key=lambda event: event.metadata.creation_timestamp,
1328
+ # latest event appears first
1329
+ reverse=True)
1330
+ insert_new_node_event = True
1331
+ for event in node_events:
1332
+ if insert_new_node_event:
1333
+ # Try inserting the latest events first. If the event is a
1334
+ # duplicate, it means the event (and any previous events) have
1335
+ # already been inserted - so do not insert further events.
1336
+ try:
1337
+ global_user_state.add_cluster_event(
1338
+ cluster_name,
1339
+ None, f'[kubernetes node {last_scheduled_node}] '
1340
+ f'{event.reason} {event.message}',
1341
+ global_user_state.ClusterEventType.DEBUG,
1342
+ transitioned_at=int(
1343
+ event.metadata.creation_timestamp.timestamp()),
1344
+ expose_duplicate_error=True)
1345
+ except db_utils.UniqueConstraintViolationError:
1346
+ insert_new_node_event = False
1347
+ else:
1348
+ new_event_inserted = True
1349
+
1350
+ if not new_event_inserted:
1351
+ # If new event is not inserted, there is no useful information to
1352
+ # return. Return None.
1353
+ return None
1354
+
1355
+ # Analyze the events for failure
1356
+ failure_reason = None
1357
+ failure_decisiveness = 0
1358
+
1359
+ def _record_failure_reason(reason: str, decisiveness: int):
1360
+ nonlocal failure_reason, failure_decisiveness
1361
+ if decisiveness > failure_decisiveness:
1362
+ failure_reason = reason
1363
+ failure_decisiveness = decisiveness
1364
+
1365
+ cluster_events = global_user_state.get_cluster_events(
1366
+ cluster_name, None, global_user_state.ClusterEventType.DEBUG)
1367
+ for event in cluster_events:
1368
+ if event.startswith('[kubernetes pod'):
1369
+ event = event.split(']')[1].strip()
1370
+ elif event.startswith('[kubernetes node'):
1371
+ event = event.split(']')[1].strip()
1372
+
1373
+ if event.startswith('NodeNotReady '):
1374
+ _record_failure_reason(event[len('NodeNotReady '):], 1)
1375
+ elif event.startswith('TaintManagerEviction '):
1376
+ # usually the event message for TaintManagerEviction is not useful
1377
+ # so we record a more generic message.
1378
+ _record_failure_reason('pod was evicted by taint manager', 2)
1379
+ elif event.startswith('DeletingNode '):
1380
+ _record_failure_reason(event[len('DeletingNode '):], 3)
1381
+ return failure_reason
1382
+
1383
+
1251
1384
  def query_instances(
1385
+ cluster_name: str,
1252
1386
  cluster_name_on_cloud: str,
1253
1387
  provider_config: Optional[Dict[str, Any]] = None,
1254
1388
  non_terminated_only: bool = True
1255
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
1389
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
1256
1390
  status_map = {
1257
1391
  'Pending': status_lib.ClusterStatus.INIT,
1258
1392
  'Running': status_lib.ClusterStatus.UP,
1259
- 'Failed': None,
1393
+ 'Failed': status_lib.ClusterStatus.INIT,
1260
1394
  'Unknown': None,
1261
1395
  'Succeeded': None,
1262
1396
  'Terminating': None,
@@ -1298,12 +1432,41 @@ def query_instances(
1298
1432
  f'status: {common_utils.format_exception(e)}')
1299
1433
 
1300
1434
  # Check if the pods are running or pending
1301
- cluster_status = {}
1435
+ cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
1436
+ Optional[str]]] = {}
1302
1437
  for pod in pods:
1303
- pod_status = status_map[pod.status.phase]
1438
+ phase = pod.status.phase
1439
+ pod_status = status_map[phase]
1304
1440
  if non_terminated_only and pod_status is None:
1305
1441
  continue
1306
- cluster_status[pod.metadata.name] = pod_status
1442
+ reason = None
1443
+ if phase == 'Failed':
1444
+ reason = _get_pod_termination_reason(pod)
1445
+ logger.debug(f'Pod Status Reason(s): {reason}')
1446
+ pod_name = pod.metadata.name
1447
+ reason = f'{pod_name}: {reason}' if reason is not None else None
1448
+ cluster_status[pod_name] = (pod_status, reason)
1449
+
1450
+ # Find the list of pod names that should be there
1451
+ # from k8s services. Filter duplicates as -ssh service
1452
+ # creates a duplicate entry.
1453
+ target_pod_names = list(
1454
+ set([
1455
+ service['spec']['selector']['component']
1456
+ for service in provider_config.get('services', [])
1457
+ ]))
1458
+
1459
+ for target_pod_name in target_pod_names:
1460
+ if target_pod_name not in cluster_status:
1461
+ # If the pod is not in the cluster_status, it means it's not
1462
+ # running.
1463
+ # Analyze what happened to the pod based on events.
1464
+ reason = _get_pod_missing_reason(context, namespace, cluster_name,
1465
+ target_pod_name)
1466
+ reason = (f'{target_pod_name}: {reason}'
1467
+ if reason is not None else None)
1468
+ cluster_status[target_pod_name] = (None, reason)
1469
+
1307
1470
  return cluster_status
1308
1471
 
1309
1472
 
@@ -1,7 +1,7 @@
1
1
  """Lambda Cloud instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -226,11 +226,13 @@ def get_cluster_info(
226
226
 
227
227
 
228
228
  def query_instances(
229
+ cluster_name: str,
229
230
  cluster_name_on_cloud: str,
230
231
  provider_config: Optional[Dict[str, Any]] = None,
231
232
  non_terminated_only: bool = True,
232
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
233
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
233
234
  """See sky/provision/__init__.py"""
235
+ del cluster_name # unused
234
236
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
235
237
  instances = _filter_instances(cluster_name_on_cloud, None)
236
238
 
@@ -240,12 +242,13 @@ def query_instances(
240
242
  'unhealthy': status_lib.ClusterStatus.INIT,
241
243
  'terminating': None,
242
244
  }
243
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
245
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
246
+ Optional[str]]] = {}
244
247
  for instance_id, instance in instances.items():
245
248
  status = status_map.get(instance['status'])
246
249
  if non_terminated_only and status is None:
247
250
  continue
248
- statuses[instance_id] = status
251
+ statuses[instance_id] = (status, None)
249
252
  return statuses
250
253
 
251
254
 
@@ -1,6 +1,6 @@
1
1
  """Nebius instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -247,11 +247,13 @@ def get_cluster_info(
247
247
 
248
248
 
249
249
  def query_instances(
250
+ cluster_name: str,
250
251
  cluster_name_on_cloud: str,
251
252
  provider_config: Optional[Dict[str, Any]] = None,
252
253
  non_terminated_only: bool = True,
253
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
254
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
254
255
  """See sky/provision/__init__.py"""
256
+ del cluster_name # unused
255
257
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
256
258
  instances = _filter_instances(provider_config['region'],
257
259
  cluster_name_on_cloud, None)
@@ -263,12 +265,13 @@ def query_instances(
263
265
  'STOPPING': status_lib.ClusterStatus.STOPPED,
264
266
  'DELETING': status_lib.ClusterStatus.STOPPED,
265
267
  }
266
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
268
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
269
+ Optional[str]]] = {}
267
270
  for inst_id, inst in instances.items():
268
271
  status = status_map[inst['status']]
269
272
  if non_terminated_only and status is None:
270
273
  continue
271
- statuses[inst_id] = status
274
+ statuses[inst_id] = (status, None)
272
275
  return statuses
273
276
 
274
277
 
@@ -10,7 +10,7 @@ import copy
10
10
  from datetime import datetime
11
11
  import time
12
12
  import typing
13
- from typing import Any, Dict, List, Optional
13
+ from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  from sky import exceptions
16
16
  from sky import sky_logging
@@ -32,10 +32,11 @@ logger = sky_logging.init_logger(__name__)
32
32
  @query_utils.debug_enabled(logger)
33
33
  @common_utils.retry
34
34
  def query_instances(
35
+ cluster_name: str,
35
36
  cluster_name_on_cloud: str,
36
37
  provider_config: Optional[Dict[str, Any]] = None,
37
38
  non_terminated_only: bool = True,
38
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
39
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
39
40
  """Query instances.
40
41
 
41
42
  Returns a dictionary of instance IDs and status.
@@ -43,11 +44,13 @@ def query_instances(
43
44
  A None status means the instance is marked as "terminated"
44
45
  or "terminating".
45
46
  """
47
+ del cluster_name # unused
46
48
  assert provider_config is not None, cluster_name_on_cloud
47
49
  region = provider_config['region']
48
50
 
49
51
  status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
50
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
52
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
53
+ Optional[str]]] = {}
51
54
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
52
55
 
53
56
  instances = _get_filtered_nodes(region, filters)
@@ -56,7 +59,7 @@ def query_instances(
56
59
  sky_status = status_map[vm_status]
57
60
  if non_terminated_only and sky_status is None:
58
61
  continue
59
- statuses[node['inst_id']] = sky_status
62
+ statuses[node['inst_id']] = (sky_status, None)
60
63
 
61
64
  return statuses
62
65
 
@@ -1,7 +1,7 @@
1
1
  """Paperspace instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -277,12 +277,13 @@ def get_cluster_info(
277
277
 
278
278
 
279
279
  def query_instances(
280
+ cluster_name: str,
280
281
  cluster_name_on_cloud: str,
281
282
  provider_config: Optional[Dict[str, Any]] = None,
282
283
  non_terminated_only: bool = True,
283
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
284
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
284
285
  """See sky/provision/__init__.py"""
285
- del non_terminated_only
286
+ del cluster_name, non_terminated_only # unused
286
287
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
287
288
  instances = _filter_instances(cluster_name_on_cloud, None)
288
289
 
@@ -297,10 +298,11 @@ def query_instances(
297
298
  'ready': status_lib.ClusterStatus.UP,
298
299
  'off': status_lib.ClusterStatus.STOPPED,
299
300
  }
300
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
301
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
302
+ Optional[str]]] = {}
301
303
  for inst_id, inst in instances.items():
302
304
  status = status_map[inst['state']]
303
- statuses[inst_id] = status
305
+ statuses[inst_id] = (status, None)
304
306
  return statuses
305
307
 
306
308
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import common as adaptors_common
11
- import sky.provision.paperspace.constants as constants
11
+ from sky.provision.paperspace import constants
12
12
  from sky.utils import common_utils
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -100,6 +100,12 @@ def _bulk_provision(
100
100
  f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
101
101
  f'seconds.')
102
102
 
103
+ # Add cluster event for provisioning completion.
104
+ global_user_state.add_cluster_event(
105
+ str(cluster_name), status_lib.ClusterStatus.INIT,
106
+ f'Instances launched on {cloud.display_name()} in {region}',
107
+ global_user_state.ClusterEventType.STATUS_CHANGE)
108
+
103
109
  return provision_record
104
110
 
105
111
 
@@ -1,6 +1,6 @@
1
1
  """RunPod instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -201,11 +201,13 @@ def get_cluster_info(
201
201
 
202
202
 
203
203
  def query_instances(
204
+ cluster_name: str,
204
205
  cluster_name_on_cloud: str,
205
206
  provider_config: Optional[Dict[str, Any]] = None,
206
207
  non_terminated_only: bool = True,
207
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
208
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
208
209
  """See sky/provision/__init__.py"""
210
+ del cluster_name # unused
209
211
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
210
212
  instances = _filter_instances(cluster_name_on_cloud, None)
211
213
 
@@ -215,12 +217,13 @@ def query_instances(
215
217
  'PAUSED': status_lib.ClusterStatus.INIT,
216
218
  'RUNNING': status_lib.ClusterStatus.UP,
217
219
  }
218
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
220
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
221
+ Optional[str]]] = {}
219
222
  for inst_id, inst in instances.items():
220
223
  status = status_map[inst['status']]
221
224
  if non_terminated_only and status is None:
222
225
  continue
223
- statuses[inst_id] = status
226
+ statuses[inst_id] = (status, None)
224
227
  return statuses
225
228
 
226
229
 
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
7
  from sky import sky_logging
8
8
  from sky.adaptors import runpod
9
9
  from sky.provision import docker_utils
10
- import sky.provision.runpod.api.commands as runpod_commands
10
+ from sky.provision.runpod.api import commands as runpod_commands
11
11
  from sky.skylet import constants
12
12
  from sky.utils import common_utils
13
13
 
@@ -4,7 +4,7 @@ import logging
4
4
  import random
5
5
  import string
6
6
  import time
7
- from typing import Any, Dict, List, Optional
7
+ from typing import Any, Dict, List, Optional, Tuple
8
8
 
9
9
  from sky.clouds.utils import scp_utils
10
10
  from sky.provision import common
@@ -427,11 +427,12 @@ def terminate_instances(
427
427
 
428
428
 
429
429
  def query_instances(
430
+ cluster_name: str,
430
431
  cluster_name_on_cloud: str,
431
432
  provider_config: Optional[Dict[str, Any]] = None,
432
433
  non_terminated_only: bool = True,
433
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
434
-
434
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
435
+ del cluster_name # unused
435
436
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
436
437
  instances = _filter_instances(cluster_name_on_cloud, None)
437
438
 
@@ -447,12 +448,13 @@ def query_instances(
447
448
  'TERMINATED': None,
448
449
  }
449
450
 
450
- statuses = {}
451
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
452
+ Optional[str]]] = {}
451
453
  for instance in instances:
452
454
  status = status_map[instance['virtualServerState']]
453
455
  if non_terminated_only and status is None:
454
456
  continue
455
- statuses[instance['virtualServerId']] = status
457
+ statuses[instance['virtualServerId']] = (status, None)
456
458
  return statuses
457
459
 
458
460
 
@@ -1,6 +1,6 @@
1
1
  """Vast instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -216,12 +216,13 @@ def open_ports(
216
216
 
217
217
 
218
218
  def query_instances(
219
+ cluster_name: str,
219
220
  cluster_name_on_cloud: str,
220
221
  provider_config: Optional[Dict[str, Any]] = None,
221
222
  non_terminated_only: bool = True,
222
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
223
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
223
224
  """See sky/provision/__init__.py"""
224
-
225
+ del cluster_name # unused
225
226
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
226
227
  instances = _filter_instances(cluster_name_on_cloud, None)
227
228
  # "running", "frozen", "stopped", "unknown", "loading"
@@ -231,12 +232,13 @@ def query_instances(
231
232
  'STOPPED': status_lib.ClusterStatus.STOPPED,
232
233
  'RUNNING': status_lib.ClusterStatus.UP,
233
234
  }
234
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
235
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
236
+ Optional[str]]] = {}
235
237
  for inst_id, inst in instances.items():
236
238
  status = status_map[inst['status']]
237
239
  if non_terminated_only and status is None:
238
240
  continue
239
- statuses[inst_id] = status
241
+ statuses[inst_id] = (status, None)
240
242
  return statuses
241
243
 
242
244
 
@@ -1,7 +1,7 @@
1
1
  """Vsphere instance provisioning."""
2
2
  import json
3
3
  import typing
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.adaptors import common as adaptors_common
@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
393
393
 
394
394
 
395
395
  def query_instances(
396
+ cluster_name: str,
396
397
  cluster_name_on_cloud: str,
397
398
  provider_config: Optional[Dict[str, Any]] = None,
398
399
  non_terminated_only: bool = True,
399
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
400
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
400
401
  """See sky/provision/__init__.py"""
402
+ del cluster_name # unused
401
403
  logger.info('New provision of Vsphere: query_instances().')
402
404
  assert provider_config is not None, cluster_name_on_cloud
403
405
  region = provider_config['region']
@@ -413,12 +415,13 @@ def query_instances(
413
415
  'suspended': None,
414
416
  }
415
417
 
416
- status = {}
418
+ status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
419
+ Optional[str]]] = {}
417
420
  for inst in instances:
418
421
  stat = status_map[inst.runtime.powerState]
419
422
  if non_terminated_only and stat is None:
420
423
  continue
421
- status[inst.summary.config.instanceUuid] = stat
424
+ status[inst.summary.config.instanceUuid] = (stat, None)
422
425
  vc_object.disconnect()
423
426
  return status
424
427
 
sky/resources.py CHANGED
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
8
8
 
9
9
  import colorama
10
10
 
11
- import sky
12
11
  from sky import catalog
13
12
  from sky import check as sky_check
14
13
  from sky import clouds
@@ -288,7 +287,7 @@ class Resources:
288
287
  if infra is not None:
289
288
  infra_info = infra_utils.InfraInfo.from_str(infra)
290
289
  # Infra takes precedence over individually specified parameters
291
- cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
290
+ cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
292
291
  region = infra_info.region
293
292
  zone = infra_info.zone
294
293