skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (102) hide show
  1. sky/__init__.py +4 -2
  2. sky/backends/backend_utils.py +69 -6
  3. sky/backends/cloud_vm_ray_backend.py +156 -25
  4. sky/catalog/cudo_catalog.py +1 -1
  5. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  6. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  7. sky/client/cli/command.py +40 -77
  8. sky/client/common.py +1 -1
  9. sky/client/sdk.py +19 -19
  10. sky/client/sdk_async.py +5 -4
  11. sky/clouds/aws.py +52 -1
  12. sky/clouds/kubernetes.py +14 -0
  13. sky/dag.py +1 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  16. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/storage.py +11 -1
  36. sky/exceptions.py +5 -0
  37. sky/global_user_state.py +63 -7
  38. sky/jobs/constants.py +1 -1
  39. sky/jobs/controller.py +0 -1
  40. sky/jobs/recovery_strategy.py +3 -3
  41. sky/jobs/scheduler.py +23 -68
  42. sky/jobs/server/core.py +18 -12
  43. sky/jobs/state.py +6 -2
  44. sky/jobs/utils.py +8 -0
  45. sky/provision/__init__.py +1 -0
  46. sky/provision/aws/config.py +9 -0
  47. sky/provision/aws/instance.py +36 -13
  48. sky/provision/azure/instance.py +2 -0
  49. sky/provision/cudo/cudo_wrapper.py +1 -1
  50. sky/provision/cudo/instance.py +2 -0
  51. sky/provision/do/instance.py +2 -0
  52. sky/provision/fluidstack/instance.py +2 -0
  53. sky/provision/gcp/instance.py +2 -0
  54. sky/provision/hyperbolic/instance.py +2 -1
  55. sky/provision/kubernetes/instance.py +133 -0
  56. sky/provision/lambda_cloud/instance.py +2 -0
  57. sky/provision/nebius/instance.py +2 -0
  58. sky/provision/oci/instance.py +2 -0
  59. sky/provision/paperspace/instance.py +2 -1
  60. sky/provision/paperspace/utils.py +1 -1
  61. sky/provision/runpod/instance.py +2 -0
  62. sky/provision/runpod/utils.py +1 -1
  63. sky/provision/scp/instance.py +2 -0
  64. sky/provision/vast/instance.py +2 -0
  65. sky/provision/vsphere/instance.py +2 -0
  66. sky/resources.py +1 -2
  67. sky/schemas/__init__.py +0 -0
  68. sky/schemas/api/__init__.py +0 -0
  69. sky/schemas/api/responses.py +70 -0
  70. sky/schemas/generated/__init__.py +0 -0
  71. sky/schemas/generated/autostopv1_pb2.py +36 -0
  72. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  73. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  74. sky/serve/constants.py +3 -7
  75. sky/serve/replica_managers.py +15 -16
  76. sky/serve/serve_state.py +10 -0
  77. sky/serve/serve_utils.py +21 -20
  78. sky/serve/server/impl.py +15 -19
  79. sky/serve/service.py +31 -16
  80. sky/server/server.py +20 -14
  81. sky/setup_files/dependencies.py +11 -10
  82. sky/skylet/autostop_lib.py +38 -5
  83. sky/skylet/constants.py +3 -1
  84. sky/skylet/services.py +44 -0
  85. sky/skylet/skylet.py +49 -4
  86. sky/task.py +19 -16
  87. sky/templates/aws-ray.yml.j2 +2 -2
  88. sky/templates/jobs-controller.yaml.j2 +6 -0
  89. sky/utils/command_runner.py +1 -1
  90. sky/utils/config_utils.py +29 -5
  91. sky/utils/controller_utils.py +73 -0
  92. sky/utils/db/db_utils.py +17 -0
  93. sky/utils/schemas.py +3 -0
  94. sky/volumes/server/core.py +2 -2
  95. sky/volumes/server/server.py +2 -2
  96. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  97. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
  98. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  99. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  100. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  101. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  102. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
527
527
  to_start_count,
528
528
  associate_public_ip_address=(
529
529
  not config.provider_config['use_internal_ips']))
530
+
530
531
  created_instances.extend(created_remaining_instances)
531
532
  created_instances.sort(key=lambda x: x.id)
532
533
 
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
585
586
  # stop() and terminate() for example already implicitly assume non-terminated.
586
587
  @common_utils.retry
587
588
  def query_instances(
589
+ cluster_name: str,
588
590
  cluster_name_on_cloud: str,
589
591
  provider_config: Optional[Dict[str, Any]] = None,
590
592
  non_terminated_only: bool = True,
591
593
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
592
594
  """See sky/provision/__init__.py"""
595
+ del cluster_name # unused
593
596
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
594
597
  region = provider_config['region']
595
598
  ec2 = _default_ec2_resource(region)
@@ -682,19 +685,39 @@ def terminate_instances(
682
685
  filters,
683
686
  included_instances=None,
684
687
  excluded_instances=None)
685
- instances_list = list(instances)
686
- instances.terminate()
687
- if (sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME or
688
- not managed_by_skypilot):
689
- # Using default AWS SG or user specified security group. We don't need
690
- # to wait for the termination of the instances, as we do not need to
691
- # delete the SG.
692
- return
693
- # If ports are specified, we need to delete the newly created Security
694
- # Group. Here we wait for all instances to be terminated, since the
695
- # Security Group dependent on them.
696
- for instance in instances_list:
697
- instance.wait_until_terminated()
688
+ default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
689
+ if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
690
+ # Case 1: The default SG is used, we don't need to ensure instance are
691
+ # terminated.
692
+ instances.terminate()
693
+ elif not managed_by_skypilot:
694
+ # Case 2: We are not managing the non-default sg. We don't need to
695
+ # ensure instances are terminated.
696
+ instances.terminate()
697
+ elif (managed_by_skypilot and default_sg is not None):
698
+ # Case 3: We are managing the non-default sg. The default SG exists
699
+ # so we can move the instances to the default SG and terminate them
700
+ # without blocking.
701
+
702
+ # Make this multithreaded: modify all instances' SGs in parallel.
703
+ def modify_instance_sg(instance):
704
+ instance.modify_attribute(Groups=[default_sg.id])
705
+ logger.debug(f'Instance {instance.id} modified to use default SG:'
706
+ f'{default_sg.id} for quick deletion.')
707
+
708
+ with pool.ThreadPool() as thread_pool:
709
+ thread_pool.map(modify_instance_sg, instances)
710
+ thread_pool.close()
711
+ thread_pool.join()
712
+
713
+ instances.terminate()
714
+ else:
715
+ # Case 4: We are managing the non-default sg. The default SG does not
716
+ # exist. We must block on instance termination.
717
+ instances.terminate()
718
+ for instance in instances:
719
+ instance.wait_until_terminated()
720
+
698
721
  # TODO(suquark): Currently, the implementation of GCP and Azure will
699
722
  # wait util the cluster is fully terminated, while other clouds just
700
723
  # trigger the termination process (via http call) and then return.
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
952
952
 
953
953
  @common_utils.retry
954
954
  def query_instances(
955
+ cluster_name: str,
955
956
  cluster_name_on_cloud: str,
956
957
  provider_config: Optional[Dict[str, Any]] = None,
957
958
  non_terminated_only: bool = True,
958
959
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
959
960
  """See sky/provision/__init__.py"""
961
+ del cluster_name # unused
960
962
  assert provider_config is not None, cluster_name_on_cloud
961
963
 
962
964
  subscription_id = provider_config['subscription_id']
@@ -4,7 +4,7 @@ from typing import Dict
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.adaptors import cudo
7
- import sky.provision.cudo.cudo_utils as utils
7
+ from sky.provision.cudo import cudo_utils as utils
8
8
 
9
9
  logger = sky_logging.init_logger(__name__)
10
10
 
@@ -191,11 +191,13 @@ def get_cluster_info(
191
191
 
192
192
 
193
193
  def query_instances(
194
+ cluster_name: str,
194
195
  cluster_name_on_cloud: str,
195
196
  provider_config: Optional[Dict[str, Any]] = None,
196
197
  non_terminated_only: bool = True,
197
198
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
198
199
  """See sky/provision/__init__.py"""
200
+ del cluster_name # unused
199
201
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
200
202
  instances = _filter_instances(cluster_name_on_cloud, None)
201
203
 
@@ -242,11 +242,13 @@ def get_cluster_info(
242
242
 
243
243
 
244
244
  def query_instances(
245
+ cluster_name: str,
245
246
  cluster_name_on_cloud: str,
246
247
  provider_config: Optional[Dict[str, Any]] = None,
247
248
  non_terminated_only: bool = True,
248
249
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
249
250
  """See sky/provision/__init__.py"""
251
+ del cluster_name # unused
250
252
  # terminated instances are not retrieved by the
251
253
  # API making `non_terminated_only` argument moot.
252
254
  del non_terminated_only
@@ -287,11 +287,13 @@ def get_cluster_info(
287
287
 
288
288
 
289
289
  def query_instances(
290
+ cluster_name: str,
290
291
  cluster_name_on_cloud: str,
291
292
  provider_config: Optional[Dict[str, Any]] = None,
292
293
  non_terminated_only: bool = True,
293
294
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
294
295
  """See sky/provision/__init__.py"""
296
+ del cluster_name # unused
295
297
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
296
298
  instances = _filter_instances(cluster_name_on_cloud, None)
297
299
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -58,11 +58,13 @@ def _filter_instances(
58
58
  # for terminated instances, if they have already been fully deleted.
59
59
  @common_utils.retry
60
60
  def query_instances(
61
+ cluster_name: str,
61
62
  cluster_name_on_cloud: str,
62
63
  provider_config: Optional[Dict[str, Any]] = None,
63
64
  non_terminated_only: bool = True,
64
65
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
65
66
  """See sky/provision/__init__.py"""
67
+ del cluster_name # unused
66
68
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
67
69
  zone = provider_config['availability_zone']
68
70
  project_id = provider_config['project_id']
@@ -304,12 +304,13 @@ def get_cluster_info(
304
304
 
305
305
 
306
306
  def query_instances(
307
+ cluster_name: str,
307
308
  cluster_name_on_cloud: str,
308
309
  provider_config: Optional[dict] = None,
309
310
  non_terminated_only: bool = True,
310
311
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
311
312
  """Returns the status of the specified instances for Hyperbolic."""
312
- del provider_config # unused
313
+ del cluster_name, provider_config # unused
313
314
  # Fetch all instances for this cluster
314
315
  instances = utils.list_instances(
315
316
  metadata={'skypilot': {
@@ -1,10 +1,12 @@
1
1
  """Kubernetes instance provisioning."""
2
2
  import copy
3
3
  import json
4
+ import re
4
5
  import time
5
6
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
7
 
7
8
  from sky import exceptions
9
+ from sky import global_user_state
8
10
  from sky import sky_logging
9
11
  from sky import skypilot_config
10
12
  from sky.adaptors import kubernetes
@@ -24,6 +26,7 @@ from sky.utils import status_lib
24
26
  from sky.utils import subprocess_utils
25
27
  from sky.utils import timeline
26
28
  from sky.utils import ux_utils
29
+ from sky.utils.db import db_utils
27
30
 
28
31
  POLL_INTERVAL = 2
29
32
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
@@ -1270,7 +1273,116 @@ def _get_pod_termination_reason(pod: Any) -> str:
1270
1273
  return ' | '.join(reasons)
1271
1274
 
1272
1275
 
1276
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1277
+ cluster_name: str, pod_name: str) -> Optional[str]:
1278
+ logger.debug(f'Analyzing events for pod {pod_name}')
1279
+ pod_field_selector = (
1280
+ f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
1281
+ pod_events = kubernetes.core_api(context).list_namespaced_event(
1282
+ namespace,
1283
+ field_selector=pod_field_selector,
1284
+ _request_timeout=kubernetes.API_TIMEOUT).items
1285
+ pod_events = sorted(
1286
+ pod_events,
1287
+ key=lambda event: event.metadata.creation_timestamp,
1288
+ # latest event appears first
1289
+ reverse=True)
1290
+ last_scheduled_node = None
1291
+ insert_new_pod_event = True
1292
+ new_event_inserted = False
1293
+ for event in pod_events:
1294
+ if event.reason == 'Scheduled':
1295
+ pattern = r'Successfully assigned (\S+) to (\S+)'
1296
+ match = re.search(pattern, event.message)
1297
+ if match:
1298
+ scheduled_node = match.group(2)
1299
+ last_scheduled_node = scheduled_node
1300
+ if insert_new_pod_event:
1301
+ # Try inserting the latest events first. If the event is a
1302
+ # duplicate, it means the event (and any previous events) have
1303
+ # already been inserted - so do not insert further events.
1304
+ try:
1305
+ global_user_state.add_cluster_event(
1306
+ cluster_name,
1307
+ None, f'[kubernetes pod {pod_name}] '
1308
+ f'{event.reason} {event.message}',
1309
+ global_user_state.ClusterEventType.DEBUG,
1310
+ transitioned_at=int(
1311
+ event.metadata.creation_timestamp.timestamp()),
1312
+ expose_duplicate_error=True)
1313
+ except db_utils.UniqueConstraintViolationError:
1314
+ insert_new_pod_event = False
1315
+ else:
1316
+ new_event_inserted = True
1317
+
1318
+ if last_scheduled_node is not None:
1319
+ node_field_selector = ('involvedObject.kind=Node,'
1320
+ f'involvedObject.name={last_scheduled_node}')
1321
+ node_events = kubernetes.core_api(context).list_namespaced_event(
1322
+ namespace,
1323
+ field_selector=node_field_selector,
1324
+ _request_timeout=kubernetes.API_TIMEOUT).items
1325
+ node_events = sorted(
1326
+ node_events,
1327
+ key=lambda event: event.metadata.creation_timestamp,
1328
+ # latest event appears first
1329
+ reverse=True)
1330
+ insert_new_node_event = True
1331
+ for event in node_events:
1332
+ if insert_new_node_event:
1333
+ # Try inserting the latest events first. If the event is a
1334
+ # duplicate, it means the event (and any previous events) have
1335
+ # already been inserted - so do not insert further events.
1336
+ try:
1337
+ global_user_state.add_cluster_event(
1338
+ cluster_name,
1339
+ None, f'[kubernetes node {last_scheduled_node}] '
1340
+ f'{event.reason} {event.message}',
1341
+ global_user_state.ClusterEventType.DEBUG,
1342
+ transitioned_at=int(
1343
+ event.metadata.creation_timestamp.timestamp()),
1344
+ expose_duplicate_error=True)
1345
+ except db_utils.UniqueConstraintViolationError:
1346
+ insert_new_node_event = False
1347
+ else:
1348
+ new_event_inserted = True
1349
+
1350
+ if not new_event_inserted:
1351
+ # If new event is not inserted, there is no useful information to
1352
+ # return. Return None.
1353
+ return None
1354
+
1355
+ # Analyze the events for failure
1356
+ failure_reason = None
1357
+ failure_decisiveness = 0
1358
+
1359
+ def _record_failure_reason(reason: str, decisiveness: int):
1360
+ nonlocal failure_reason, failure_decisiveness
1361
+ if decisiveness > failure_decisiveness:
1362
+ failure_reason = reason
1363
+ failure_decisiveness = decisiveness
1364
+
1365
+ cluster_events = global_user_state.get_cluster_events(
1366
+ cluster_name, None, global_user_state.ClusterEventType.DEBUG)
1367
+ for event in cluster_events:
1368
+ if event.startswith('[kubernetes pod'):
1369
+ event = event.split(']')[1].strip()
1370
+ elif event.startswith('[kubernetes node'):
1371
+ event = event.split(']')[1].strip()
1372
+
1373
+ if event.startswith('NodeNotReady '):
1374
+ _record_failure_reason(event[len('NodeNotReady '):], 1)
1375
+ elif event.startswith('TaintManagerEviction '):
1376
+ # usually the event message for TaintManagerEviction is not useful
1377
+ # so we record a more generic message.
1378
+ _record_failure_reason('pod was evicted by taint manager', 2)
1379
+ elif event.startswith('DeletingNode '):
1380
+ _record_failure_reason(event[len('DeletingNode '):], 3)
1381
+ return failure_reason
1382
+
1383
+
1273
1384
  def query_instances(
1385
+ cluster_name: str,
1274
1386
  cluster_name_on_cloud: str,
1275
1387
  provider_config: Optional[Dict[str, Any]] = None,
1276
1388
  non_terminated_only: bool = True
@@ -1334,6 +1446,27 @@ def query_instances(
1334
1446
  pod_name = pod.metadata.name
1335
1447
  reason = f'{pod_name}: {reason}' if reason is not None else None
1336
1448
  cluster_status[pod_name] = (pod_status, reason)
1449
+
1450
+ # Find the list of pod names that should be there
1451
+ # from k8s services. Filter duplicates as -ssh service
1452
+ # creates a duplicate entry.
1453
+ target_pod_names = list(
1454
+ set([
1455
+ service['spec']['selector']['component']
1456
+ for service in provider_config.get('services', [])
1457
+ ]))
1458
+
1459
+ for target_pod_name in target_pod_names:
1460
+ if target_pod_name not in cluster_status:
1461
+ # If the pod is not in the cluster_status, it means it's not
1462
+ # running.
1463
+ # Analyze what happened to the pod based on events.
1464
+ reason = _get_pod_missing_reason(context, namespace, cluster_name,
1465
+ target_pod_name)
1466
+ reason = (f'{target_pod_name}: {reason}'
1467
+ if reason is not None else None)
1468
+ cluster_status[target_pod_name] = (None, reason)
1469
+
1337
1470
  return cluster_status
1338
1471
 
1339
1472
 
@@ -226,11 +226,13 @@ def get_cluster_info(
226
226
 
227
227
 
228
228
  def query_instances(
229
+ cluster_name: str,
229
230
  cluster_name_on_cloud: str,
230
231
  provider_config: Optional[Dict[str, Any]] = None,
231
232
  non_terminated_only: bool = True,
232
233
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
233
234
  """See sky/provision/__init__.py"""
235
+ del cluster_name # unused
234
236
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
235
237
  instances = _filter_instances(cluster_name_on_cloud, None)
236
238
 
@@ -247,11 +247,13 @@ def get_cluster_info(
247
247
 
248
248
 
249
249
  def query_instances(
250
+ cluster_name: str,
250
251
  cluster_name_on_cloud: str,
251
252
  provider_config: Optional[Dict[str, Any]] = None,
252
253
  non_terminated_only: bool = True,
253
254
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
254
255
  """See sky/provision/__init__.py"""
256
+ del cluster_name # unused
255
257
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
256
258
  instances = _filter_instances(provider_config['region'],
257
259
  cluster_name_on_cloud, None)
@@ -32,6 +32,7 @@ logger = sky_logging.init_logger(__name__)
32
32
  @query_utils.debug_enabled(logger)
33
33
  @common_utils.retry
34
34
  def query_instances(
35
+ cluster_name: str,
35
36
  cluster_name_on_cloud: str,
36
37
  provider_config: Optional[Dict[str, Any]] = None,
37
38
  non_terminated_only: bool = True,
@@ -43,6 +44,7 @@ def query_instances(
43
44
  A None status means the instance is marked as "terminated"
44
45
  or "terminating".
45
46
  """
47
+ del cluster_name # unused
46
48
  assert provider_config is not None, cluster_name_on_cloud
47
49
  region = provider_config['region']
48
50
 
@@ -277,12 +277,13 @@ def get_cluster_info(
277
277
 
278
278
 
279
279
  def query_instances(
280
+ cluster_name: str,
280
281
  cluster_name_on_cloud: str,
281
282
  provider_config: Optional[Dict[str, Any]] = None,
282
283
  non_terminated_only: bool = True,
283
284
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
284
285
  """See sky/provision/__init__.py"""
285
- del non_terminated_only
286
+ del cluster_name, non_terminated_only #unused
286
287
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
287
288
  instances = _filter_instances(cluster_name_on_cloud, None)
288
289
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import common as adaptors_common
11
- import sky.provision.paperspace.constants as constants
11
+ from sky.provision.paperspace import constants
12
12
  from sky.utils import common_utils
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -201,11 +201,13 @@ def get_cluster_info(
201
201
 
202
202
 
203
203
  def query_instances(
204
+ cluster_name: str,
204
205
  cluster_name_on_cloud: str,
205
206
  provider_config: Optional[Dict[str, Any]] = None,
206
207
  non_terminated_only: bool = True,
207
208
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
208
209
  """See sky/provision/__init__.py"""
210
+ del cluster_name # unused
209
211
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
210
212
  instances = _filter_instances(cluster_name_on_cloud, None)
211
213
 
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
7
  from sky import sky_logging
8
8
  from sky.adaptors import runpod
9
9
  from sky.provision import docker_utils
10
- import sky.provision.runpod.api.commands as runpod_commands
10
+ from sky.provision.runpod.api import commands as runpod_commands
11
11
  from sky.skylet import constants
12
12
  from sky.utils import common_utils
13
13
 
@@ -427,10 +427,12 @@ def terminate_instances(
427
427
 
428
428
 
429
429
  def query_instances(
430
+ cluster_name: str,
430
431
  cluster_name_on_cloud: str,
431
432
  provider_config: Optional[Dict[str, Any]] = None,
432
433
  non_terminated_only: bool = True,
433
434
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
435
+ del cluster_name # unused
434
436
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
435
437
  instances = _filter_instances(cluster_name_on_cloud, None)
436
438
 
@@ -216,11 +216,13 @@ def open_ports(
216
216
 
217
217
 
218
218
  def query_instances(
219
+ cluster_name: str,
219
220
  cluster_name_on_cloud: str,
220
221
  provider_config: Optional[Dict[str, Any]] = None,
221
222
  non_terminated_only: bool = True,
222
223
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
223
224
  """See sky/provision/__init__.py"""
225
+ del cluster_name # unused
224
226
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
225
227
  instances = _filter_instances(cluster_name_on_cloud, None)
226
228
  # "running", "frozen", "stopped", "unknown", "loading"
@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
393
393
 
394
394
 
395
395
  def query_instances(
396
+ cluster_name: str,
396
397
  cluster_name_on_cloud: str,
397
398
  provider_config: Optional[Dict[str, Any]] = None,
398
399
  non_terminated_only: bool = True,
399
400
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
400
401
  """See sky/provision/__init__.py"""
402
+ del cluster_name # unused
401
403
  logger.info('New provision of Vsphere: query_instances().')
402
404
  assert provider_config is not None, cluster_name_on_cloud
403
405
  region = provider_config['region']
sky/resources.py CHANGED
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
8
8
 
9
9
  import colorama
10
10
 
11
- import sky
12
11
  from sky import catalog
13
12
  from sky import check as sky_check
14
13
  from sky import clouds
@@ -288,7 +287,7 @@ class Resources:
288
287
  if infra is not None:
289
288
  infra_info = infra_utils.InfraInfo.from_str(infra)
290
289
  # Infra takes precedence over individually specified parameters
291
- cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
290
+ cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
292
291
  region = infra_info.region
293
292
  zone = infra_info.zone
294
293
 
File without changes
File without changes
@@ -0,0 +1,70 @@
1
+ """Responses for the API server."""
2
+
3
+ from typing import Optional
4
+
5
+ import pydantic
6
+
7
+ from sky import models
8
+ from sky.server import common
9
+
10
+
11
+ class ResponseBaseModel(pydantic.BaseModel):
12
+ """A pydantic model that acts like a dict.
13
+
14
+ Supports the following syntax:
15
+ class SampleResponse(DictLikePayload):
16
+ field: str
17
+
18
+ response = SampleResponse(field='value')
19
+ print(response['field']) # prints 'value'
20
+ response['field'] = 'value2'
21
+ print(response['field']) # prints 'value2'
22
+ print('field' in response) # prints True
23
+
24
+ This model exists for backwards compatibility with the
25
+ old SDK that used to return a dict.
26
+
27
+ The backward compatibility may be removed
28
+ in the future.
29
+ """
30
+ # Ignore extra fields in the request body, which is useful for backward
31
+ # compatibility. The difference with `allow` is that `ignore` will not
32
+ # include the unknown fields when dump the model, i.e., we can add new
33
+ # fields to the request body without breaking the existing old API server
34
+ # where the handler function does not accept the new field in function
35
+ # signature.
36
+ model_config = pydantic.ConfigDict(extra='ignore')
37
+
38
+ # backward compatibility with dict
39
+ # TODO(syang): remove this in v0.13.0
40
+ def __getitem__(self, key):
41
+ try:
42
+ return getattr(self, key)
43
+ except AttributeError as e:
44
+ raise KeyError(key) from e
45
+
46
+ def __setitem__(self, key, value):
47
+ setattr(self, key, value)
48
+
49
+ def __contains__(self, key):
50
+ return hasattr(self, key)
51
+
52
+ def keys(self):
53
+ return self.model_dump().keys()
54
+
55
+ def values(self):
56
+ return self.model_dump().values()
57
+
58
+ def items(self):
59
+ return self.model_dump().items()
60
+
61
+
62
+ class APIHealthResponse(ResponseBaseModel):
63
+ """Response for the API health endpoint."""
64
+ status: common.ApiServerStatus
65
+ api_version: str = ''
66
+ version: str = ''
67
+ version_on_disk: str = ''
68
+ commit: str = ''
69
+ basic_auth_enabled: bool = False
70
+ user: Optional[models.User] = None
File without changes
@@ -0,0 +1,36 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: sky/schemas/generated/autostopv1.proto
4
+ # Protobuf Python Version: 5.26.1
5
+ """Generated protocol buffer code."""
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ from google.protobuf.internal import builder as _builder
10
+ # @@protoc_insertion_point(imports)
11
+
12
+ _sym_db = _symbol_database.Default()
13
+
14
+
15
+
16
+
17
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n&sky/schemas/generated/autostopv1.proto\x12\x0b\x61utostop.v1\"y\n\x12SetAutostopRequest\x12\x14\n\x0cidle_minutes\x18\x01 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x02 \x01(\t\x12.\n\x08wait_for\x18\x03 \x01(\x0e\x32\x1c.autostop.v1.AutostopWaitFor\x12\x0c\n\x04\x64own\x18\x04 \x01(\x08\"\x15\n\x13SetAutostopResponse\"\x17\n\x15IsAutostoppingRequest\"1\n\x16IsAutostoppingResponse\x12\x17\n\x0fis_autostopping\x18\x01 \x01(\x08*\x90\x01\n\x0f\x41utostopWaitFor\x12!\n\x1d\x41UTOSTOP_WAIT_FOR_UNSPECIFIED\x10\x00\x12\"\n\x1e\x41UTOSTOP_WAIT_FOR_JOBS_AND_SSH\x10\x01\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_JOBS\x10\x02\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_NONE\x10\x03\x32\xbe\x01\n\x0f\x41utostopService\x12P\n\x0bSetAutostop\x12\x1f.autostop.v1.SetAutostopRequest\x1a .autostop.v1.SetAutostopResponse\x12Y\n\x0eIsAutostopping\x12\".autostop.v1.IsAutostoppingRequest\x1a#.autostop.v1.IsAutostoppingResponseb\x06proto3')
18
+
19
+ _globals = globals()
20
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
21
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sky.schemas.generated.autostopv1_pb2', _globals)
22
+ if not _descriptor._USE_C_DESCRIPTORS:
23
+ DESCRIPTOR._loaded_options = None
24
+ _globals['_AUTOSTOPWAITFOR']._serialized_start=278
25
+ _globals['_AUTOSTOPWAITFOR']._serialized_end=422
26
+ _globals['_SETAUTOSTOPREQUEST']._serialized_start=55
27
+ _globals['_SETAUTOSTOPREQUEST']._serialized_end=176
28
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_start=178
29
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_end=199
30
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_start=201
31
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_end=224
32
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_start=226
33
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_end=275
34
+ _globals['_AUTOSTOPSERVICE']._serialized_start=425
35
+ _globals['_AUTOSTOPSERVICE']._serialized_end=615
36
+ # @@protoc_insertion_point(module_scope)
@@ -0,0 +1,43 @@
1
+ from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
2
+ from google.protobuf import descriptor as _descriptor
3
+ from google.protobuf import message as _message
4
+ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
5
+
6
+ DESCRIPTOR: _descriptor.FileDescriptor
7
+
8
+ class AutostopWaitFor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
9
+ __slots__ = ()
10
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: _ClassVar[AutostopWaitFor]
11
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: _ClassVar[AutostopWaitFor]
12
+ AUTOSTOP_WAIT_FOR_JOBS: _ClassVar[AutostopWaitFor]
13
+ AUTOSTOP_WAIT_FOR_NONE: _ClassVar[AutostopWaitFor]
14
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: AutostopWaitFor
15
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: AutostopWaitFor
16
+ AUTOSTOP_WAIT_FOR_JOBS: AutostopWaitFor
17
+ AUTOSTOP_WAIT_FOR_NONE: AutostopWaitFor
18
+
19
+ class SetAutostopRequest(_message.Message):
20
+ __slots__ = ("idle_minutes", "backend", "wait_for", "down")
21
+ IDLE_MINUTES_FIELD_NUMBER: _ClassVar[int]
22
+ BACKEND_FIELD_NUMBER: _ClassVar[int]
23
+ WAIT_FOR_FIELD_NUMBER: _ClassVar[int]
24
+ DOWN_FIELD_NUMBER: _ClassVar[int]
25
+ idle_minutes: int
26
+ backend: str
27
+ wait_for: AutostopWaitFor
28
+ down: bool
29
+ def __init__(self, idle_minutes: _Optional[int] = ..., backend: _Optional[str] = ..., wait_for: _Optional[_Union[AutostopWaitFor, str]] = ..., down: bool = ...) -> None: ...
30
+
31
+ class SetAutostopResponse(_message.Message):
32
+ __slots__ = ()
33
+ def __init__(self) -> None: ...
34
+
35
+ class IsAutostoppingRequest(_message.Message):
36
+ __slots__ = ()
37
+ def __init__(self) -> None: ...
38
+
39
+ class IsAutostoppingResponse(_message.Message):
40
+ __slots__ = ("is_autostopping",)
41
+ IS_AUTOSTOPPING_FIELD_NUMBER: _ClassVar[int]
42
+ is_autostopping: bool
43
+ def __init__(self, is_autostopping: bool = ...) -> None: ...