skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (136)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend.py +5 -3
  4. sky/backends/backend_utils.py +22 -7
  5. sky/backends/cloud_vm_ray_backend.py +50 -18
  6. sky/backends/local_docker_backend.py +8 -3
  7. sky/client/cli/command.py +25 -10
  8. sky/client/sdk.py +51 -1
  9. sky/clouds/kubernetes.py +2 -6
  10. sky/clouds/nebius.py +3 -1
  11. sky/core.py +9 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  19. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  20. sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  23. sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  27. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
  28. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  29. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  32. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
  47. sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
  48. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  49. sky/dashboard/out/clusters/[cluster].html +1 -1
  50. sky/dashboard/out/clusters.html +1 -1
  51. sky/dashboard/out/config.html +1 -1
  52. sky/dashboard/out/index.html +1 -1
  53. sky/dashboard/out/infra/[context].html +1 -1
  54. sky/dashboard/out/infra.html +1 -1
  55. sky/dashboard/out/jobs/[job].html +1 -1
  56. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/volumes.html +1 -1
  60. sky/dashboard/out/workspace/new.html +1 -1
  61. sky/dashboard/out/workspaces/[name].html +1 -1
  62. sky/dashboard/out/workspaces.html +1 -1
  63. sky/data/storage_utils.py +29 -9
  64. sky/execution.py +13 -10
  65. sky/global_user_state.py +131 -2
  66. sky/jobs/constants.py +1 -1
  67. sky/jobs/recovery_strategy.py +0 -3
  68. sky/jobs/scheduler.py +14 -21
  69. sky/jobs/server/core.py +64 -10
  70. sky/jobs/server/utils.py +1 -1
  71. sky/jobs/state.py +1 -3
  72. sky/jobs/utils.py +159 -11
  73. sky/provision/aws/config.py +19 -3
  74. sky/provision/aws/instance.py +2 -1
  75. sky/provision/kubernetes/instance.py +2 -1
  76. sky/provision/nebius/utils.py +101 -86
  77. sky/provision/provisioner.py +13 -8
  78. sky/resources.py +5 -5
  79. sky/schemas/api/responses.py +50 -1
  80. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  81. sky/serve/replica_managers.py +123 -101
  82. sky/serve/serve_state.py +32 -0
  83. sky/serve/serve_utils.py +37 -16
  84. sky/serve/service.py +51 -17
  85. sky/server/common.py +2 -3
  86. sky/server/constants.py +1 -1
  87. sky/server/requests/payloads.py +6 -0
  88. sky/server/requests/serializers/decoders.py +20 -5
  89. sky/server/requests/serializers/encoders.py +21 -8
  90. sky/server/server.py +57 -11
  91. sky/templates/kubernetes-ray.yml.j2 +1 -0
  92. sky/utils/cli_utils/status_utils.py +2 -1
  93. sky/utils/common_utils.py +20 -0
  94. sky/utils/controller_utils.py +17 -4
  95. sky/utils/db/migration_utils.py +1 -1
  96. sky/utils/log_utils.py +14 -5
  97. sky/utils/resources_utils.py +25 -1
  98. sky/utils/schemas.py +3 -0
  99. sky/utils/ux_utils.py +36 -5
  100. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
  102. sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  104. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  105. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  106. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  109. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  110. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  111. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  114. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  117. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  119. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
  121. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
  131. /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
  132. /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
  133. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -85,6 +85,12 @@ _JOB_CANCELLED_MESSAGE = (
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
+
+
 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
     CANCEL = 'CANCEL'
@@ -337,9 +343,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if handle is not None:
             try:
                 if pool is None:
-                    global_user_state.add_cluster_event(
-                        cluster_name, None, 'Cluster was cleaned up.',
-                        global_user_state.ClusterEventType.STATUS_CHANGE)
                     terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
@@ -1120,7 +1123,17 @@ def stream_logs(job_id: Optional[int],
     return stream_logs_by_id(job_id, follow, tail)
 
 
-def dump_managed_job_queue() -> str:
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()
@@ -1147,6 +1160,31 @@ def dump_managed_job_queue() -> str:
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    if user_hashes:
+        jobs = [
+            job for job in jobs if job.get('user_hash', None) in user_hashes
+        ]
+    if accessible_workspaces:
+        jobs = [
+            job for job in jobs
+            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces
+        ]
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(
+                lambda job: not managed_job_state.ManagedJobStatus(job[
+                    'status']).is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
+                              page, limit)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1220,12 +1258,96 @@ def dump_managed_job_queue() -> str:
         else:
             job['details'] = None
 
-    return message_utils.encode_payload(jobs)
+    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+
+    Returns:
+        List of filtered jobs and total number of jobs.
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]
+
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total
 
 
-def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
     """Load job queue from json string."""
-    jobs = message_utils.decode_payload(payload)
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    if isinstance(result, dict):
+        jobs = result['jobs']
+        total = result['total']
+    else:
+        jobs = result
+        total = len(jobs)
+        result_type = ManagedJobQueueResultType.LIST
+
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
@@ -1233,7 +1355,7 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs
+    return jobs, total, result_type
 
 
 def _get_job_status_from_tasks(
@@ -1580,9 +1702,35 @@ class ManagedJobCodeGen:
         """)
 
     @classmethod
-    def get_job_table(cls) -> str:
-        code = textwrap.dedent("""\
-        job_table = utils.dump_managed_job_queue()
+    def get_job_table(
+        cls,
+        skip_finished: bool = False,
+        accessible_workspaces: Optional[List[str]] = None,
+        job_ids: Optional[List[int]] = None,
+        workspace_match: Optional[str] = None,
+        name_match: Optional[str] = None,
+        pool_match: Optional[str] = None,
+        page: Optional[int] = None,
+        limit: Optional[int] = None,
+        user_hashes: Optional[List[Optional[str]]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
        print(job_table, flush=True)
        """)
        return cls._build(code)
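
Note on the queue payload change above: dump_managed_job_queue() now encodes a dict with 'jobs' and 'total', while controllers older than managed_job_version 9 still return a bare list, and load_managed_job_queue() reports which shape it saw via ManagedJobQueueResultType. Below is a minimal consumer sketch, not part of this release; only the decoding call and the page-starts-at-1 arithmetic are taken from the diff, and show_page itself is a hypothetical helper.

    from sky.jobs import utils as managed_job_utils

    def show_page(payload: str, page: int, limit: int) -> None:
        # Decode the queue payload; handles both the new dict payload and the
        # legacy bare-list payload.
        jobs, total, result_type = managed_job_utils.load_managed_job_queue(
            payload)
        if result_type == managed_job_utils.ManagedJobQueueResultType.LIST:
            # Legacy controller: no server-side paging, so slice locally.
            # Page numbering starts at 1, mirroring _handle_page_and_limit.
            start = (page - 1) * limit
            jobs = jobs[start:start + limit]
        print(f'Showing {len(jobs)} of {total} managed jobs')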
sky/provision/aws/config.py CHANGED
@@ -105,13 +105,29 @@ def bootstrap_instances(
                                                expected_sg_name,
                                                extended_ip_rules)
     if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
-        # Ensure the default security group is created. This is needed
+        logger.debug('Attempting to create the default security group.')
+        # Attempt to create the default security group. This is needed
         # to enable us to use the default security group to quickly
         # delete the cluster. If the default security group is not created,
         # we will need to block on instance termination to delete the
         # security group.
-        _configure_security_group(ec2, vpc_id,
-                                  aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
+        try:
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
+                                      [])
+            logger.debug('Default security group created.')
+        except exceptions.NoClusterLaunchedError as e:
+            if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
+                    e):
+                # User does not have permission to create the default
+                # security group.
+                logger.debug('User does not have permission to create '
+                             'the default security group. '
+                             f'{e}')
+                pass
+            else:
+                raise e
+
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
sky/provision/aws/instance.py CHANGED
@@ -713,7 +713,8 @@ def terminate_instances(
         instances.terminate()
     else:
         # Case 4: We are managing the non-default sg. The default SG does not
-        # exist. We must block on instance termination.
+        # exist. We must block on instance termination so that we can
+        # delete the security group.
         instances.terminate()
         for instance in instances:
             instance.wait_until_terminated()
sky/provision/kubernetes/instance.py CHANGED
@@ -1465,7 +1465,8 @@ def query_instances(
                                              target_pod_name)
             reason = (f'{target_pod_name}: {reason}'
                       if reason is not None else None)
-            cluster_status[target_pod_name] = (None, reason)
+            if not non_terminated_only:
+                cluster_status[target_pod_name] = (None, reason)
 
     return cluster_status
 
sky/provision/nebius/utils.py CHANGED
@@ -36,8 +36,10 @@ def retry(func):
 
 def get_project_by_region(region: str) -> str:
     service = nebius.iam().ProjectServiceClient(nebius.sdk())
-    projects = service.list(nebius.iam().ListProjectsRequest(
-        parent_id=nebius.get_tenant_id())).wait()
+    projects = nebius.sync_call(
+        service.list(
+            nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
+            timeout=nebius.READ_TIMEOUT))
 
     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(
@@ -56,19 +58,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
     """
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=name,
-        )).wait()
-        cluster_id = cluster.metadata.id
-    except nebius.request_error():
-        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
                 parent_id=project_id,
                 name=name,
-            ),
-            spec=nebius.compute().GpuClusterSpec(
-                infiniband_fabric=fabric))).wait()
+            )))
+        cluster_id = cluster.metadata.id
+    except nebius.request_error():
+        cluster = nebius.sync_call(
+            service.create(nebius.compute().CreateGpuClusterRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=name,
+                ),
+                spec=nebius.compute().GpuClusterSpec(
+                    infiniband_fabric=fabric))))
         cluster_id = cluster.resource_id
     return cluster_id
 
@@ -78,14 +82,16 @@ def delete_cluster(name: str, region: str) -> None:
     project_id = get_project_by_region(region)
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=name,
-        )).wait()
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=name,
+            )))
         cluster_id = cluster.metadata.id
         logger.debug(f'Found GPU Cluster : {cluster_id}.')
-        service.delete(
-            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        nebius.sync_call(
+            service.delete(
+                nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
         logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
     except nebius.request_error():
         logger.debug('GPU Cluster does not exist.')
@@ -94,8 +100,10 @@ def delete_cluster(name: str, region: str) -> None:
 def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result = service.list(
-        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+    result = nebius.sync_call(
+        service.list(
+            nebius.compute().ListInstancesRequest(parent_id=project_id),
+            timeout=nebius.READ_TIMEOUT))
 
     instances = result
 
@@ -116,12 +124,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
 
 def stop(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get(nebius.compute().GetInstanceRequest(
-            id=instance_id,)).wait()
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'STOPPED':
             break
         time.sleep(POLL_INTERVAL)
@@ -138,12 +147,13 @@ def stop(instance_id: str) -> None:
 
 def start(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get(nebius.compute().GetInstanceRequest(
-            id=instance_id,)).wait()
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'RUNNING':
             break
         time.sleep(POLL_INTERVAL)
@@ -212,24 +222,26 @@ def launch(cluster_name_on_cloud: str,
                                               project_id, fabric)
 
     service = nebius.compute().DiskServiceClient(nebius.sdk())
-    disk = service.create(nebius.compute().CreateDiskRequest(
-        metadata=nebius.nebius_common().ResourceMetadata(
-            parent_id=project_id,
-            name=disk_name,
-        ),
-        spec=nebius.compute().DiskSpec(
-            source_image_family=nebius.compute().SourceImageFamily(
-                image_family=image_family),
-            size_gibibytes=disk_size,
-            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
-        ))).wait()
+    disk = nebius.sync_call(
+        service.create(nebius.compute().CreateDiskRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=disk_name,
+            ),
+            spec=nebius.compute().DiskSpec(
+                source_image_family=nebius.compute().SourceImageFamily(
+                    image_family=image_family),
+                size_gibibytes=disk_size,
+                type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+            ))))
     disk_id = disk.resource_id
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
-        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=disk_name,
-        )).wait()
+        disk = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=disk_name,
+            )))
         if disk.status.state.name == 'READY':
             break
         logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -254,50 +266,53 @@ def launch(cluster_name_on_cloud: str,
                         id=fs['filesystem_id'])))
 
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
-    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
-        parent_id=project_id,)).wait()
+    sub_net = nebius.sync_call(
+        service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
 
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.create(nebius.compute().CreateInstanceRequest(
-        metadata=nebius.nebius_common().ResourceMetadata(
-            parent_id=project_id,
-            name=instance_name,
-        ),
-        spec=nebius.compute().InstanceSpec(
-            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
-            if cluster_id is not None else None,
-            boot_disk=nebius.compute().AttachedDiskSpec(
-                attach_mode=nebius.compute(
-                ).AttachedDiskSpec.AttachMode.READ_WRITE,
-                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
-            cloud_init_user_data=user_data,
-            resources=nebius.compute().ResourcesSpec(platform=platform,
-                                                     preset=preset),
-            filesystems=filesystems_spec if filesystems_spec else None,
-            network_interfaces=[
-                nebius.compute().NetworkInterfaceSpec(
-                    subnet_id=sub_net.items[0].metadata.id,
-                    ip_address=nebius.compute().IPAddress(),
-                    name='network-interface-0',
-                    public_ip_address=nebius.compute().PublicIPAddress()
-                    if associate_public_ip_address else None,
-                )
-            ],
-            recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
-            if use_spot else None,
-            preemptible=nebius.compute().PreemptibleSpec(
-                priority=1,
-                on_preemption=nebius.compute(
-                ).PreemptibleSpec.PreemptionPolicy.STOP) if use_spot else None,
-        ))).wait()
+    logger.debug(f'Creating instance {instance_name} in project {project_id}.')
+    nebius.sync_call(
+        service.create(nebius.compute().CreateInstanceRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=instance_name,
+            ),
+            spec=nebius.compute().InstanceSpec(
+                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                    id=cluster_id,) if cluster_id is not None else None,
+                boot_disk=nebius.compute().AttachedDiskSpec(
+                    attach_mode=nebius.compute(
+                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+                cloud_init_user_data=user_data,
+                resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                         preset=preset),
+                filesystems=filesystems_spec if filesystems_spec else None,
+                network_interfaces=[
+                    nebius.compute().NetworkInterfaceSpec(
+                        subnet_id=sub_net.items[0].metadata.id,
+                        ip_address=nebius.compute().IPAddress(),
+                        name='network-interface-0',
+                        public_ip_address=nebius.compute().PublicIPAddress()
+                        if associate_public_ip_address else None,
+                    )
+                ],
+                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                if use_spot else None,
+                preemptible=nebius.compute().PreemptibleSpec(
+                    priority=1,
+                    on_preemption=nebius.compute().PreemptibleSpec.
+                    PreemptionPolicy.STOP) if use_spot else None,
+            ))))
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=instance_name,
-        )).wait()
+        instance = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=instance_name,
+            )))
         if instance.status.state.name == 'STARTING':
             instance_id = instance.metadata.id
             break
@@ -317,19 +332,19 @@ def launch(cluster_name_on_cloud: str,
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result = service.get(
-        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    result = nebius.sync_call(
+        service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
     disk_id = result.spec.boot_disk.existing_disk.id
-    service.delete(
-        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
     retry_count = 0
     # The instance begins deleting and attempts to delete the disk.
     # Must wait until the disk is unlocked and becomes deletable.
     while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
         try:
             service = nebius.compute().DiskServiceClient(nebius.sdk())
-            service.delete(
-                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            nebius.sync_call(
+                service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
             break
         except nebius.request_error():
             logger.debug('Waiting for disk deletion.')
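
Note on the pattern above: every blocking operation.wait() at the Nebius call sites is replaced by nebius.sync_call(...), and list calls additionally pass timeout=nebius.READ_TIMEOUT to the SDK's list() method. The helper itself lives in sky/adaptors/nebius.py (changed +43 -1 in this release) and is not shown in this excerpt; the sketch below is only an assumption about its shape, inferred from the call sites.

    # Hypothetical sketch of the adaptor-side helper; the real implementation
    # in sky/adaptors/nebius.py is not reproduced in this diff.
    def sync_call(operation):
        # Block until the SDK operation completes and return its result.
        # Previously each call site invoked operation.wait() inline.
        return operation.wait()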
sky/provision/provisioner.py CHANGED
@@ -76,7 +76,8 @@ def _bulk_provision(
     logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
     rich_utils.force_update_status(
         ux_utils.spinner_message('Launching - Checking instance status',
-                                 str(provision_logging.config.log_path)))
+                                 str(provision_logging.config.log_path),
+                                 cluster_name=str(cluster_name)))
     # AWS would take a very short time (<<1s) updating the state of the
     # instance.
     time.sleep(1)
@@ -462,9 +463,9 @@ def _post_provision_setup(
     docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            ux_utils.spinner_message(
-                'Launching - Waiting for SSH access',
-                provision_logging.config.log_path)) as status:
+            ux_utils.spinner_message('Launching - Waiting for SSH access',
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name))) as status:
         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
         # ready by the provisioner, and we use kubectl instead of SSH to run the
         # commands and rsync on the pods. SSH will still be ready after a while
@@ -493,7 +494,8 @@ def _post_provision_setup(
             status.update(
                 ux_utils.spinner_message(
                     'Launching - Initializing docker container',
-                    provision_logging.config.log_path))
+                    provision_logging.config.log_path,
+                    cluster_name=str(cluster_name)))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -541,7 +543,8 @@ def _post_provision_setup(
 
     runtime_preparation_str = (ux_utils.spinner_message(
         'Preparing SkyPilot runtime ({step}/3 - {step_name})',
-        provision_logging.config.log_path))
+        provision_logging.config.log_path,
+        cluster_name=str(cluster_name)))
     status.update(
         runtime_preparation_str.format(step=1, step_name='initializing'))
     instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -679,7 +682,8 @@ def _post_provision_setup(
     if logging_agent:
         status.update(
             ux_utils.spinner_message('Setting up logging agent',
-                                     provision_logging.config.log_path))
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name)))
         instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
                                                 cluster_info,
                                                 ssh_credentials)
@@ -689,7 +693,8 @@ def _post_provision_setup(
 
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
-                                   provision_logging.config.log_path))
+                                   provision_logging.config.log_path,
+                                   cluster_name=str(cluster_name)))
     return cluster_info
 
sky/resources.py CHANGED
@@ -37,7 +37,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
-_DEFAULT_DISK_SIZE_GB = 256
+DEFAULT_DISK_SIZE_GB = 256
 
 RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
@@ -319,7 +319,7 @@ class Resources:
             self._disk_size = int(
                 resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
-            self._disk_size = _DEFAULT_DISK_SIZE_GB
+            self._disk_size = DEFAULT_DISK_SIZE_GB
 
         self._image_id: Optional[Dict[Optional[str], str]] = None
         if isinstance(image_id, str):
@@ -482,7 +482,7 @@ class Resources:
             network_tier = f', network_tier={self.network_tier.value}'
 
         disk_size = ''
-        if self.disk_size != _DEFAULT_DISK_SIZE_GB:
+        if self.disk_size != DEFAULT_DISK_SIZE_GB:
             disk_size = f', disk_size={self.disk_size}'
 
         ports = ''
@@ -1766,7 +1766,7 @@ class Resources:
             self._accelerators is None,
             self._accelerator_args is None,
             not self._use_spot_specified,
-            self._disk_size == _DEFAULT_DISK_SIZE_GB,
+            self._disk_size == DEFAULT_DISK_SIZE_GB,
             self._disk_tier is None,
             self._network_tier is None,
             self._image_id is None,
@@ -2255,7 +2255,7 @@ class Resources:
         accelerator_args = state.pop('accelerator_args', None)
         state['_accelerator_args'] = accelerator_args
 
-        disk_size = state.pop('disk_size', _DEFAULT_DISK_SIZE_GB)
+        disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
         state['_disk_size'] = disk_size
 
         if version < 2:
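
The sky/resources.py change above renames the module-level default disk size constant from _DEFAULT_DISK_SIZE_GB to DEFAULT_DISK_SIZE_GB, making it public so other modules can reference it. A minimal, hypothetical usage sketch (is_default_disk_size is not part of the release):

    import sky.resources as resources_lib

    def is_default_disk_size(resources: resources_lib.Resources) -> bool:
        # Compare against the now-public module-level constant.
        return resources.disk_size == resources_lib.DEFAULT_DISK_SIZE_GB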