skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -85,6 +85,12 @@ _JOB_CANCELLED_MESSAGE = (
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
+
+
 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
     CANCEL = 'CANCEL'
@@ -1120,7 +1126,17 @@ def stream_logs(job_id: Optional[int],
     return stream_logs_by_id(job_id, follow, tail)
 
 
-def dump_managed_job_queue() -> str:
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()
@@ -1147,6 +1163,31 @@ def dump_managed_job_queue() -> str:
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    if user_hashes:
+        jobs = [
+            job for job in jobs if job.get('user_hash', None) in user_hashes
+        ]
+    if accessible_workspaces:
+        jobs = [
+            job for job in jobs
+            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces
+        ]
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(
+                lambda job: not managed_job_state.ManagedJobStatus(job[
+                    'status']).is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
+                              page, limit)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1220,12 +1261,96 @@ def dump_managed_job_queue() -> str:
         else:
             job['details'] = None
 
-    return message_utils.encode_payload(jobs)
+    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+
+    Returns:
+        List of filtered jobs and total number of jobs.
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]
+
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total
 
 
-def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
     """Load job queue from json string."""
-    jobs = message_utils.decode_payload(payload)
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    if isinstance(result, dict):
+        jobs = result['jobs']
+        total = result['total']
+    else:
+        jobs = result
+        total = len(jobs)
+        result_type = ManagedJobQueueResultType.LIST
+
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
@@ -1233,7 +1358,7 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs
+    return jobs, total, result_type
 
 
 def _get_job_status_from_tasks(
@@ -1580,9 +1705,35 @@ class ManagedJobCodeGen:
         """)
 
     @classmethod
-    def get_job_table(cls) -> str:
-        code = textwrap.dedent("""\
-        job_table = utils.dump_managed_job_queue()
+    def get_job_table(
+        cls,
+        skip_finished: bool = False,
+        accessible_workspaces: Optional[List[str]] = None,
+        job_ids: Optional[List[int]] = None,
+        workspace_match: Optional[str] = None,
+        name_match: Optional[str] = None,
+        pool_match: Optional[str] = None,
+        page: Optional[int] = None,
+        limit: Optional[int] = None,
+        user_hashes: Optional[List[Optional[str]]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
@@ -1690,6 +1841,7 @@ class ManagedJobCodeGen:
     def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
                     workspace: str, entrypoint: str) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
             set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1697,6 +1849,13 @@
                 set_job_info_kwargs = {{}}
             if managed_job_version >= 5:
                 set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+            if managed_job_version >= 8:
+                from sky.serve import serve_state
+                pool_hash = None
+                if {pool!r} != None:
+                    pool_hash = serve_state.get_service_hash({pool!r})
+                set_job_info_kwargs['pool'] = {pool!r}
+                set_job_info_kwargs['pool_hash'] = pool_hash
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
sky/provision/__init__.py CHANGED
@@ -73,6 +73,7 @@ def _route_to_cloud_impl(func):
 @_route_to_cloud_impl
 def query_instances(
     provider_name: str,
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
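
Note: the dispatcher above now threads the user-facing cluster_name through to every per-cloud query_instances implementation; the hunks that follow update each cloud accordingly. Below is a minimal sketch of the convention those hunks follow (accept cluster_name first, delete it when unused); the simplified return type is a stand-in for sky.utils.status_lib, and this is not any one cloud's actual module.

from typing import Any, Dict, Optional, Tuple

def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
    """See sky/provision/__init__.py"""
    # Most clouds do not need the display name; Kubernetes uses it to attach
    # cluster events (see the kubernetes/instance.py hunks below).
    del cluster_name  # unused in this sketch
    assert provider_config is not None, cluster_name_on_cloud
    return {}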
sky/provision/aws/config.py CHANGED
@@ -19,6 +19,7 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
+from sky.clouds import aws as aws_cloud
 from sky.provision import common
 from sky.provision.aws import utils
 from sky.utils import annotations
@@ -103,6 +104,30 @@ def bootstrap_instances(
         security_group_ids = _configure_security_group(ec2, vpc_id,
                                                         expected_sg_name,
                                                         extended_ip_rules)
+        if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+            logger.debug('Attempting to create the default security group.')
+            # Attempt to create the default security group. This is needed
+            # to enable us to use the default security group to quickly
+            # delete the cluster. If the default security group is not created,
+            # we will need to block on instance termination to delete the
+            # security group.
+            try:
+                _configure_security_group(ec2, vpc_id,
+                                          aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
+                                          [])
+                logger.debug('Default security group created.')
+            except exceptions.NoClusterLaunchedError as e:
+                if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
+                        e):
+                    # User does not have permission to create the default
+                    # security group.
+                    logger.debug('User does not have permission to create '
+                                 'the default security group. '
+                                 f'{e}')
+                    pass
+                else:
+                    raise e
+
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
sky/provision/aws/instance.py CHANGED
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 to_start_count,
                 associate_public_ip_address=(
                     not config.provider_config['use_internal_ips']))
+
             created_instances.extend(created_remaining_instances)
     created_instances.sort(key=lambda x: x.id)
 
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
 # stop() and terminate() for example already implicitly assume non-terminated.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     region = provider_config['region']
     ec2 = _default_ec2_resource(region)
@@ -682,19 +685,40 @@ def terminate_instances(
                                       filters,
                                       included_instances=None,
                                       excluded_instances=None)
-    instances_list = list(instances)
-    instances.terminate()
-    if (sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME or
-            not managed_by_skypilot):
-        # Using default AWS SG or user specified security group. We don't need
-        # to wait for the termination of the instances, as we do not need to
-        # delete the SG.
-        return
-    # If ports are specified, we need to delete the newly created Security
-    # Group. Here we wait for all instances to be terminated, since the
-    # Security Group dependent on them.
-    for instance in instances_list:
-        instance.wait_until_terminated()
+    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        # Case 1: The default SG is used, we don't need to ensure instance are
+        # terminated.
+        instances.terminate()
+    elif not managed_by_skypilot:
+        # Case 2: We are not managing the non-default sg. We don't need to
+        # ensure instances are terminated.
+        instances.terminate()
+    elif (managed_by_skypilot and default_sg is not None):
+        # Case 3: We are managing the non-default sg. The default SG exists
+        # so we can move the instances to the default SG and terminate them
+        # without blocking.
+
+        # Make this multithreaded: modify all instances' SGs in parallel.
+        def modify_instance_sg(instance):
+            instance.modify_attribute(Groups=[default_sg.id])
+            logger.debug(f'Instance {instance.id} modified to use default SG:'
+                         f'{default_sg.id} for quick deletion.')
+
+        with pool.ThreadPool() as thread_pool:
+            thread_pool.map(modify_instance_sg, instances)
+            thread_pool.close()
+            thread_pool.join()
+
+        instances.terminate()
+    else:
+        # Case 4: We are managing the non-default sg. The default SG does not
+        # exist. We must block on instance termination so that we can
+        # delete the security group.
+        instances.terminate()
+        for instance in instances:
+            instance.wait_until_terminated()
+
     # TODO(suquark): Currently, the implementation of GCP and Azure will
     # wait util the cluster is fully terminated, while other clouds just
     # trigger the termination process (via http call) and then return.
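
Note: Case 3 above avoids blocking on instance termination by first re-homing every instance onto the default security group, so the SkyPilot-managed SG has no dependents and can be deleted immediately. A minimal, self-contained sketch of that parallel re-homing pattern follows; FakeInstance and the SG id are hypothetical stand-ins for boto3 EC2 resources.

from multiprocessing import pool

class FakeInstance:
    """Hypothetical stand-in for a boto3 ec2.Instance resource."""

    def __init__(self, instance_id: str):
        self.id = instance_id

    def modify_attribute(self, Groups):
        print(f'{self.id}: now in security groups {Groups}')

default_sg_id = 'sg-0123456789abcdef0'  # hypothetical default SG id
instances = [FakeInstance(f'i-{n:017d}') for n in range(3)]

with pool.ThreadPool() as thread_pool:
    # Re-home every instance to the default SG in parallel so the managed SG
    # has no dependents left and can be deleted without waiting.
    thread_pool.map(lambda inst: inst.modify_attribute(Groups=[default_sg_id]),
                    instances)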
sky/provision/azure/instance.py CHANGED
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
 
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
 
     subscription_id = provider_config['subscription_id']
sky/provision/cudo/cudo_wrapper.py CHANGED
@@ -4,7 +4,7 @@ from typing import Dict
 
 from sky import sky_logging
 from sky.adaptors import cudo
-import sky.provision.cudo.cudo_utils as utils
+from sky.provision.cudo import cudo_utils as utils
 
 logger = sky_logging.init_logger(__name__)
 
sky/provision/cudo/instance.py CHANGED
@@ -191,11 +191,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/do/instance.py CHANGED
@@ -242,11 +242,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     # terminated instances are not retrieved by the
     # API making `non_terminated_only` argument moot.
     del non_terminated_only
sky/provision/fluidstack/instance.py CHANGED
@@ -287,11 +287,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     instances = _filter_instances(cluster_name_on_cloud, None)
sky/provision/gcp/instance.py CHANGED
@@ -58,11 +58,13 @@ def _filter_instances(
 # for terminated instances, if they have already been fully deleted.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     zone = provider_config['availability_zone']
     project_id = provider_config['project_id']
sky/provision/hyperbolic/instance.py CHANGED
@@ -304,12 +304,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[dict] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Returns the status of the specified instances for Hyperbolic."""
-    del provider_config  # unused
+    del cluster_name, provider_config  # unused
     # Fetch all instances for this cluster
     instances = utils.list_instances(
         metadata={'skypilot': {
sky/provision/kubernetes/instance.py CHANGED
@@ -1,10 +1,12 @@
 """Kubernetes instance provisioning."""
 import copy
 import json
+import re
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
@@ -24,6 +26,7 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils.db import db_utils
 
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
@@ -1270,7 +1273,116 @@ def _get_pod_termination_reason(pod: Any) -> str:
     return ' | '.join(reasons)
 
 
+def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                            cluster_name: str, pod_name: str) -> Optional[str]:
+    logger.debug(f'Analyzing events for pod {pod_name}')
+    pod_field_selector = (
+        f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
+    pod_events = kubernetes.core_api(context).list_namespaced_event(
+        namespace,
+        field_selector=pod_field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT).items
+    pod_events = sorted(
+        pod_events,
+        key=lambda event: event.metadata.creation_timestamp,
+        # latest event appears first
+        reverse=True)
+    last_scheduled_node = None
+    insert_new_pod_event = True
+    new_event_inserted = False
+    for event in pod_events:
+        if event.reason == 'Scheduled':
+            pattern = r'Successfully assigned (\S+) to (\S+)'
+            match = re.search(pattern, event.message)
+            if match:
+                scheduled_node = match.group(2)
+                last_scheduled_node = scheduled_node
+        if insert_new_pod_event:
+            # Try inserting the latest events first. If the event is a
+            # duplicate, it means the event (and any previous events) have
+            # already been inserted - so do not insert further events.
+            try:
+                global_user_state.add_cluster_event(
+                    cluster_name,
+                    None, f'[kubernetes pod {pod_name}] '
+                    f'{event.reason} {event.message}',
+                    global_user_state.ClusterEventType.DEBUG,
+                    transitioned_at=int(
+                        event.metadata.creation_timestamp.timestamp()),
+                    expose_duplicate_error=True)
+            except db_utils.UniqueConstraintViolationError:
+                insert_new_pod_event = False
+            else:
+                new_event_inserted = True
+
+    if last_scheduled_node is not None:
+        node_field_selector = ('involvedObject.kind=Node,'
+                               f'involvedObject.name={last_scheduled_node}')
+        node_events = kubernetes.core_api(context).list_namespaced_event(
+            namespace,
+            field_selector=node_field_selector,
+            _request_timeout=kubernetes.API_TIMEOUT).items
+        node_events = sorted(
+            node_events,
+            key=lambda event: event.metadata.creation_timestamp,
+            # latest event appears first
+            reverse=True)
+        insert_new_node_event = True
+        for event in node_events:
+            if insert_new_node_event:
+                # Try inserting the latest events first. If the event is a
+                # duplicate, it means the event (and any previous events) have
+                # already been inserted - so do not insert further events.
+                try:
+                    global_user_state.add_cluster_event(
+                        cluster_name,
+                        None, f'[kubernetes node {last_scheduled_node}] '
+                        f'{event.reason} {event.message}',
+                        global_user_state.ClusterEventType.DEBUG,
+                        transitioned_at=int(
+                            event.metadata.creation_timestamp.timestamp()),
+                        expose_duplicate_error=True)
+                except db_utils.UniqueConstraintViolationError:
+                    insert_new_node_event = False
+                else:
+                    new_event_inserted = True
+
+    if not new_event_inserted:
+        # If new event is not inserted, there is no useful information to
+        # return. Return None.
+        return None
+
+    # Analyze the events for failure
+    failure_reason = None
+    failure_decisiveness = 0
+
+    def _record_failure_reason(reason: str, decisiveness: int):
+        nonlocal failure_reason, failure_decisiveness
+        if decisiveness > failure_decisiveness:
+            failure_reason = reason
+            failure_decisiveness = decisiveness
+
+    cluster_events = global_user_state.get_cluster_events(
+        cluster_name, None, global_user_state.ClusterEventType.DEBUG)
+    for event in cluster_events:
+        if event.startswith('[kubernetes pod'):
+            event = event.split(']')[1].strip()
+        elif event.startswith('[kubernetes node'):
+            event = event.split(']')[1].strip()
+
+        if event.startswith('NodeNotReady '):
+            _record_failure_reason(event[len('NodeNotReady '):], 1)
+        elif event.startswith('TaintManagerEviction '):
+            # usually the event message for TaintManagerEviction is not useful
+            # so we record a more generic message.
+            _record_failure_reason('pod was evicted by taint manager', 2)
+        elif event.startswith('DeletingNode '):
+            _record_failure_reason(event[len('DeletingNode '):], 3)
+    return failure_reason
+
+
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
@@ -1334,6 +1446,27 @@ def query_instances(
         pod_name = pod.metadata.name
         reason = f'{pod_name}: {reason}' if reason is not None else None
         cluster_status[pod_name] = (pod_status, reason)
+
+    # Find the list of pod names that should be there
+    # from k8s services. Filter duplicates as -ssh service
+    # creates a duplicate entry.
+    target_pod_names = list(
+        set([
+            service['spec']['selector']['component']
+            for service in provider_config.get('services', [])
+        ]))
+
+    for target_pod_name in target_pod_names:
+        if target_pod_name not in cluster_status:
+            # If the pod is not in the cluster_status, it means it's not
+            # running.
+            # Analyze what happened to the pod based on events.
+            reason = _get_pod_missing_reason(context, namespace, cluster_name,
+                                             target_pod_name)
+            reason = (f'{target_pod_name}: {reason}'
+                      if reason is not None else None)
+            cluster_status[target_pod_name] = (None, reason)
+
     return cluster_status
 
 
sky/provision/lambda_cloud/instance.py CHANGED
@@ -226,11 +226,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/nebius/instance.py CHANGED
@@ -247,11 +247,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
                                   cluster_name_on_cloud, None)