skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend.py +5 -3
- sky/backends/backend_utils.py +22 -7
- sky/backends/cloud_vm_ray_backend.py +50 -18
- sky/backends/local_docker_backend.py +8 -3
- sky/client/cli/command.py +25 -10
- sky/client/sdk.py +51 -1
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/core.py +9 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
- sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +29 -9
- sky/execution.py +13 -10
- sky/global_user_state.py +131 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/recovery_strategy.py +0 -3
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -11
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/api/responses.py +50 -1
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/common.py +2 -3
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +20 -5
- sky/server/requests/serializers/encoders.py +21 -8
- sky/server/server.py +57 -11
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +2 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py  CHANGED

@@ -85,6 +85,12 @@ _JOB_CANCELLED_MESSAGE = (
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40


+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
+
+
 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
     CANCEL = 'CANCEL'
@@ -337,9 +343,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if handle is not None:
             try:
                 if pool is None:
-                    global_user_state.add_cluster_event(
-                        cluster_name, None, 'Cluster was cleaned up.',
-                        global_user_state.ClusterEventType.STATUS_CHANGE)
                     terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
@@ -1120,7 +1123,17 @@ def stream_logs(job_id: Optional[int],
     return stream_logs_by_id(job_id, follow, tail)


-def dump_managed_job_queue() -> str:
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()
@@ -1147,6 +1160,31 @@ def dump_managed_job_queue() -> str:
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority

+    if user_hashes:
+        jobs = [
+            job for job in jobs if job.get('user_hash', None) in user_hashes
+        ]
+    if accessible_workspaces:
+        jobs = [
+            job for job in jobs
+            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces
+        ]
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(
+                lambda job: not managed_job_state.ManagedJobStatus(job[
+                    'status']).is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
+                              page, limit)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1220,12 +1258,96 @@ def dump_managed_job_queue() -> str:
         else:
             job['details'] = None

-    return message_utils.encode_payload(jobs)
+    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+
+    Returns:
+        List of filtered jobs and total number of jobs.
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]
+
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total


-def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
     """Load job queue from json string."""
-
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    if isinstance(result, dict):
+        jobs = result['jobs']
+        total = result['total']
+    else:
+        jobs = result
+        total = len(jobs)
+        result_type = ManagedJobQueueResultType.LIST
+
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
@@ -1233,7 +1355,7 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs
+    return jobs, total, result_type


 def _get_job_status_from_tasks(
@@ -1580,9 +1702,35 @@ class ManagedJobCodeGen:
         """)

     @classmethod
-    def get_job_table(
-
-
+    def get_job_table(
+            cls,
+            skip_finished: bool = False,
+            accessible_workspaces: Optional[List[str]] = None,
+            job_ids: Optional[List[int]] = None,
+            workspace_match: Optional[str] = None,
+            name_match: Optional[str] = None,
+            pool_match: Optional[str] = None,
+            page: Optional[int] = None,
+            limit: Optional[int] = None,
+            user_hashes: Optional[List[Optional[str]]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
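The queue payload now carries pagination metadata: dump_managed_job_queue encodes {'jobs': ..., 'total': ...} after substring filtering and 1-based page/limit slicing, and load_managed_job_queue reports whether it received the new dict shape or the legacy list. Below is a minimal standalone sketch of the same filter-then-page semantics, using toy job dicts rather than the SkyPilot functions themselves; the helper name is illustrative only.

```python
from typing import Any, Dict, List, Optional, Tuple


def filter_and_page(jobs: List[Dict[str, Any]],
                    name_match: Optional[str] = None,
                    page: Optional[int] = None,
                    limit: Optional[int] = None
                    ) -> Tuple[List[Dict[str, Any]], int]:
    """Substring-filter on the job name, then slice by 1-based page/limit."""
    kept = [
        job for job in jobs
        if name_match is None or name_match in str(job.get('job_name', ''))
    ]
    total = len(kept)  # Total is counted before pagination, as in the diff.
    if page is None and limit is None:
        return kept, total
    start = (page - 1) * limit  # Page numbering starts from 1.
    return kept[start:start + limit], total


jobs = [{'job_id': i, 'job_name': f'train-{i}'} for i in range(1, 8)]
subset, total = filter_and_page(jobs, name_match='train', page=2, limit=3)
print(total, [job['job_id'] for job in subset])  # 7 [4, 5, 6]
```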
sky/provision/aws/config.py  CHANGED

@@ -105,13 +105,29 @@ def bootstrap_instances(
                                       expected_sg_name,
                                       extended_ip_rules)
     if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
-
+        logger.debug('Attempting to create the default security group.')
+        # Attempt to create the default security group. This is needed
         # to enable us to use the default security group to quickly
         # delete the cluster. If the default security group is not created,
        # we will need to block on instance termination to delete the
         # security group.
-
-
+        try:
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
+                                      [])
+            logger.debug('Default security group created.')
+        except exceptions.NoClusterLaunchedError as e:
+            if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
+                    e):
+                # User does not have permission to create the default
+                # security group.
+                logger.debug('User does not have permission to create '
+                             'the default security group. '
+                             f'{e}')
+                pass
+            else:
+                raise e
+
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
sky/provision/aws/instance.py  CHANGED

@@ -713,7 +713,8 @@ def terminate_instances(
         instances.terminate()
     else:
         # Case 4: We are managing the non-default sg. The default SG does not
-        # exist. We must block on instance termination
+        # exist. We must block on instance termination so that we can
+        # delete the security group.
         instances.terminate()
         for instance in instances:
             instance.wait_until_terminated()
sky/provision/kubernetes/instance.py  CHANGED

@@ -1465,7 +1465,8 @@ def query_instances(
                 target_pod_name)
             reason = (f'{target_pod_name}: {reason}'
                       if reason is not None else None)
-
+            if not non_terminated_only:
+                cluster_status[target_pod_name] = (None, reason)

     return cluster_status

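The Kubernetes fix records a terminated pod's (None, reason) entry only when the caller asked for all instances. A toy illustration of that non_terminated_only contract follows; it is not the real provisioner signature.

```python
from typing import Dict, Optional, Tuple

StatusMap = Dict[str, Tuple[Optional[str], Optional[str]]]


def summarize_pods(phases: Dict[str, Optional[str]],
                   non_terminated_only: bool = True) -> StatusMap:
    """Map pod name -> (status, reason); a phase of None means the pod is gone."""
    out: StatusMap = {}
    for name, phase in phases.items():
        if phase == 'Running':
            out[name] = ('UP', None)
        elif not non_terminated_only:
            # Terminated or missing pods are only reported on full queries.
            out[name] = (None, f'{name}: {phase or "missing"}')
    return out


print(summarize_pods({'pod-a': 'Running', 'pod-b': None}))
print(summarize_pods({'pod-a': 'Running', 'pod-b': None},
                     non_terminated_only=False))
```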
sky/provision/nebius/utils.py  CHANGED

@@ -36,8 +36,10 @@ def retry(func):

 def get_project_by_region(region: str) -> str:
     service = nebius.iam().ProjectServiceClient(nebius.sdk())
-    projects =
-
+    projects = nebius.sync_call(
+        service.list(
+            nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
+            timeout=nebius.READ_TIMEOUT))

     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(
@@ -56,19 +58,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
     """
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-            name=name,
-        )).wait()
-        cluster_id = cluster.metadata.id
-    except nebius.request_error():
-        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
                 parent_id=project_id,
                 name=name,
-            )
-
-
+            )))
+        cluster_id = cluster.metadata.id
+    except nebius.request_error():
+        cluster = nebius.sync_call(
+            service.create(nebius.compute().CreateGpuClusterRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=name,
+                ),
+                spec=nebius.compute().GpuClusterSpec(
+                    infiniband_fabric=fabric))))
         cluster_id = cluster.resource_id
     return cluster_id

@@ -78,14 +82,16 @@ def delete_cluster(name: str, region: str) -> None:
     project_id = get_project_by_region(region)
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-
-
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=name,
+            )))
         cluster_id = cluster.metadata.id
         logger.debug(f'Found GPU Cluster : {cluster_id}.')
-
-
+        nebius.sync_call(
+            service.delete(
+                nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
         logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
     except nebius.request_error():
         logger.debug('GPU Cluster does not exist.')
@@ -94,8 +100,10 @@
 def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-
+    result = nebius.sync_call(
+        service.list(
+            nebius.compute().ListInstancesRequest(parent_id=project_id),
+            timeout=nebius.READ_TIMEOUT))

     instances = result

@@ -116,12 +124,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:

 def stop(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'STOPPED':
             break
         time.sleep(POLL_INTERVAL)
@@ -138,12 +147,13 @@ def stop(instance_id: str) -> None:

 def start(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'RUNNING':
             break
         time.sleep(POLL_INTERVAL)
@@ -212,24 +222,26 @@ def launch(cluster_name_on_cloud: str,
                                          project_id, fabric)

     service = nebius.compute().DiskServiceClient(nebius.sdk())
-    disk =
-
-
-
-
-
-
-
-
-
-
+    disk = nebius.sync_call(
+        service.create(nebius.compute().CreateDiskRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=disk_name,
+            ),
+            spec=nebius.compute().DiskSpec(
+                source_image_family=nebius.compute().SourceImageFamily(
+                    image_family=image_family),
+                size_gibibytes=disk_size,
+                type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+            ))))
     disk_id = disk.resource_id
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
-        disk =
-
-
-
+        disk = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=disk_name,
+            )))
         if disk.status.state.name == 'READY':
             break
         logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -254,50 +266,53 @@ def launch(cluster_name_on_cloud: str,
                     id=fs['filesystem_id'])))

     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
-    sub_net =
-        parent_id=project_id,))
+    sub_net = nebius.sync_call(
+        service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))

     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
-
-
-
-
-
-
-
-
-
-            ).AttachedDiskSpec
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    logger.debug(f'Creating instance {instance_name} in project {project_id}.')
+    nebius.sync_call(
+        service.create(nebius.compute().CreateInstanceRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=instance_name,
+            ),
+            spec=nebius.compute().InstanceSpec(
+                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                    id=cluster_id,) if cluster_id is not None else None,
+                boot_disk=nebius.compute().AttachedDiskSpec(
+                    attach_mode=nebius.compute(
+                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+                cloud_init_user_data=user_data,
+                resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                         preset=preset),
+                filesystems=filesystems_spec if filesystems_spec else None,
+                network_interfaces=[
+                    nebius.compute().NetworkInterfaceSpec(
+                        subnet_id=sub_net.items[0].metadata.id,
+                        ip_address=nebius.compute().IPAddress(),
+                        name='network-interface-0',
+                        public_ip_address=nebius.compute().PublicIPAddress()
+                        if associate_public_ip_address else None,
+                    )
+                ],
+                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                if use_spot else None,
+                preemptible=nebius.compute().PreemptibleSpec(
+                    priority=1,
+                    on_preemption=nebius.compute().PreemptibleSpec.
+                    PreemptionPolicy.STOP) if use_spot else None,
+            ))))
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-
-
-
+        instance = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=instance_name,
+            )))
         if instance.status.state.name == 'STARTING':
             instance_id = instance.metadata.id
             break
@@ -317,19 +332,19 @@ def launch(cluster_name_on_cloud: str,
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-        nebius.compute().GetInstanceRequest(id=instance_id))
+    result = nebius.sync_call(
+        service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
     disk_id = result.spec.boot_disk.existing_disk.id
-
-        nebius.compute().DeleteInstanceRequest(id=instance_id))
+    nebius.sync_call(
+        service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
     retry_count = 0
     # The instance begins deleting and attempts to delete the disk.
     # Must wait until the disk is unlocked and becomes deletable.
     while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
         try:
             service = nebius.compute().DiskServiceClient(nebius.sdk())
-
-                nebius.compute().DeleteDiskRequest(id=disk_id))
+            nebius.sync_call(
+                service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
             break
         except nebius.request_error():
             logger.debug('Waiting for disk deletion.')
sky/provision/provisioner.py  CHANGED

@@ -76,7 +76,8 @@ def _bulk_provision(
     logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
     rich_utils.force_update_status(
         ux_utils.spinner_message('Launching - Checking instance status',
-                                 str(provision_logging.config.log_path)
+                                 str(provision_logging.config.log_path),
+                                 cluster_name=str(cluster_name)))
     # AWS would take a very short time (<<1s) updating the state of the
     # instance.
     time.sleep(1)
@@ -462,9 +463,9 @@ def _post_provision_setup(
     docker_config = config_from_yaml.get('docker', {})

     with rich_utils.safe_status(
-            ux_utils.spinner_message(
-
-
+            ux_utils.spinner_message('Launching - Waiting for SSH access',
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name))) as status:
         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
         # ready by the provisioner, and we use kubectl instead of SSH to run the
         # commands and rsync on the pods. SSH will still be ready after a while
@@ -493,7 +494,8 @@ def _post_provision_setup(
             status.update(
                 ux_utils.spinner_message(
                     'Launching - Initializing docker container',
-                    provision_logging.config.log_path
+                    provision_logging.config.log_path,
+                    cluster_name=str(cluster_name)))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -541,7 +543,8 @@ def _post_provision_setup(

         runtime_preparation_str = (ux_utils.spinner_message(
             'Preparing SkyPilot runtime ({step}/3 - {step_name})',
-            provision_logging.config.log_path
+            provision_logging.config.log_path,
+            cluster_name=str(cluster_name)))
         status.update(
             runtime_preparation_str.format(step=1, step_name='initializing'))
         instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -679,7 +682,8 @@ def _post_provision_setup(
         if logging_agent:
             status.update(
                 ux_utils.spinner_message('Setting up logging agent',
-                                         provision_logging.config.log_path
+                                         provision_logging.config.log_path,
+                                         cluster_name=str(cluster_name)))
             instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
                                                     cluster_info,
                                                     ssh_credentials)
@@ -689,7 +693,8 @@ def _post_provision_setup(

     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
-                                   provision_logging.config.log_path
+                                   provision_logging.config.log_path,
+                                   cluster_name=str(cluster_name)))
     return cluster_info

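Every spinner and finishing message in provisioner.py now also passes cluster_name=str(cluster_name). The matching ux_utils change (+36 -5 in sky/utils/ux_utils.py) is not part of the excerpt above, so this sketch only mirrors the call-site shape; the real signature and formatting may differ.

```python
from typing import Optional


def spinner_message(message: str,
                    log_path: Optional[str] = None,
                    cluster_name: Optional[str] = None) -> str:
    """Hypothetical status-line builder mirroring the call sites above."""
    parts = [message]
    if cluster_name:
        parts.append(f'({cluster_name})')
    if log_path:
        parts.append(f'[log: {log_path}]')
    return ' '.join(parts)


print(spinner_message('Launching - Waiting for SSH access',
                      'provision.log', cluster_name='my-cluster'))
```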
sky/resources.py  CHANGED

@@ -37,7 +37,7 @@ if typing.TYPE_CHECKING:

 logger = sky_logging.init_logger(__name__)

-
+DEFAULT_DISK_SIZE_GB = 256

 RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
@@ -319,7 +319,7 @@ class Resources:
             self._disk_size = int(
                 resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
-            self._disk_size =
+            self._disk_size = DEFAULT_DISK_SIZE_GB

         self._image_id: Optional[Dict[Optional[str], str]] = None
         if isinstance(image_id, str):
@@ -482,7 +482,7 @@ class Resources:
             network_tier = f', network_tier={self.network_tier.value}'

         disk_size = ''
-        if self.disk_size !=
+        if self.disk_size != DEFAULT_DISK_SIZE_GB:
             disk_size = f', disk_size={self.disk_size}'

         ports = ''
@@ -1766,7 +1766,7 @@ class Resources:
             self._accelerators is None,
             self._accelerator_args is None,
             not self._use_spot_specified,
-            self._disk_size ==
+            self._disk_size == DEFAULT_DISK_SIZE_GB,
             self._disk_tier is None,
             self._network_tier is None,
             self._image_id is None,
@@ -2255,7 +2255,7 @@ class Resources:
         accelerator_args = state.pop('accelerator_args', None)
         state['_accelerator_args'] = accelerator_args

-        disk_size = state.pop('disk_size',
+        disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
         state['_disk_size'] = disk_size

         if version < 2:
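The resources change replaces the hard-coded default disk size with a module-level DEFAULT_DISK_SIZE_GB = 256 used in the constructor default, the repr check, and the state-upgrade path. A tiny standalone sketch of that upgrade behavior (not the Resources class itself):

```python
DEFAULT_DISK_SIZE_GB = 256


def upgrade_state(state: dict) -> dict:
    """Mirror of the state.pop('disk_size', DEFAULT_DISK_SIZE_GB) upgrade."""
    state = dict(state)
    state['_disk_size'] = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
    return state


print(upgrade_state({'cpus': '4+'}))      # {'cpus': '4+', '_disk_size': 256}
print(upgrade_state({'disk_size': 512}))  # {'_disk_size': 512}
```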