skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +6 -2
- sky/backends/cloud_vm_ray_backend.py +13 -4
- sky/client/cli/command.py +22 -8
- sky/client/sdk.py +50 -0
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +13 -10
- sky/global_user_state.py +128 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -8
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +44 -2
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +99 -98
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED

@@ -85,6 +85,12 @@ _JOB_CANCELLED_MESSAGE = (
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
+
+
 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
     CANCEL = 'CANCEL'

@@ -1120,7 +1126,17 @@ def stream_logs(job_id: Optional[int],
     return stream_logs_by_id(job_id, follow, tail)
 
 
-def dump_managed_job_queue(
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()

@@ -1147,6 +1163,31 @@ def dump_managed_job_queue() -> str:
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    if user_hashes:
+        jobs = [
+            job for job in jobs if job.get('user_hash', None) in user_hashes
+        ]
+    if accessible_workspaces:
+        jobs = [
+            job for job in jobs
+            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces
+        ]
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(
+                lambda job: not managed_job_state.ManagedJobStatus(job[
+                    'status']).is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
+                              page, limit)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:

@@ -1220,12 +1261,96 @@ def dump_managed_job_queue() -> str:
         else:
             job['details'] = None
 
-    return message_utils.encode_payload(jobs)
+    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+
+    Returns:
+        List of filtered jobs and total number of jobs.
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]
+
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total
 
 
-def load_managed_job_queue(
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
     """Load job queue from json string."""
-
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    if isinstance(result, dict):
+        jobs = result['jobs']
+        total = result['total']
+    else:
+        jobs = result
+        total = len(jobs)
+        result_type = ManagedJobQueueResultType.LIST
+
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:

@@ -1233,7 +1358,7 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs
+    return jobs, total, result_type
 
 
 def _get_job_status_from_tasks(

@@ -1580,9 +1705,35 @@ class ManagedJobCodeGen:
         """)
 
     @classmethod
-    def get_job_table(
-
-
+    def get_job_table(
+        cls,
+        skip_finished: bool = False,
+        accessible_workspaces: Optional[List[str]] = None,
+        job_ids: Optional[List[int]] = None,
+        workspace_match: Optional[str] = None,
+        name_match: Optional[str] = None,
+        pool_match: Optional[str] = None,
+        page: Optional[int] = None,
+        limit: Optional[int] = None,
+        user_hashes: Optional[List[Optional[str]]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
        print(job_table, flush=True)
        """)
        return cls._build(code)
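For reference, the new `filter_jobs` helper does plain substring matching on the job fields and then slices the matches for pagination, returning the pre-pagination match count so callers can render page controls (the TODO notes that DB-side filtering is planned). A minimal sketch of the call pattern, assuming the wheel above is installed; the job dicts are made up for illustration rather than real output of `managed_job_state.get_managed_jobs()`:

```python
from sky.jobs import utils as managed_job_utils

# Hypothetical job rows; real ones carry many more fields (status, end_at,
# user_hash, ...) and come from the managed-jobs state table.
jobs = [
    {'job_id': 1, 'job_name': 'train-llm', 'workspace': 'default', 'pool': None},
    {'job_id': 2, 'job_name': 'train-vit', 'workspace': 'research', 'pool': 'a100s'},
    {'job_id': 3, 'job_name': 'eval', 'workspace': 'research', 'pool': 'a100s'},
]

# Substring match on job_name, then return page 1 with at most 2 rows.
# `total` is the number of matches before pagination.
rows, total = managed_job_utils.filter_jobs(jobs,
                                            workspace_match=None,
                                            name_match='train',
                                            pool_match=None,
                                            page=1,
                                            limit=2)
assert total == 2 and [r['job_id'] for r in rows] == [1, 2]
```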
sky/provision/aws/config.py
CHANGED

@@ -105,13 +105,29 @@ def bootstrap_instances(
                                      expected_sg_name,
                                      extended_ip_rules)
     if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
-
+        logger.debug('Attempting to create the default security group.')
+        # Attempt to create the default security group. This is needed
         # to enable us to use the default security group to quickly
         # delete the cluster. If the default security group is not created,
        # we will need to block on instance termination to delete the
        # security group.
-
-
+        try:
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
+                                      [])
+            logger.debug('Default security group created.')
+        except exceptions.NoClusterLaunchedError as e:
+            if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
+                    e):
+                # User does not have permission to create the default
+                # security group.
+                logger.debug('User does not have permission to create '
+                             'the default security group. '
+                             f'{e}')
+                pass
+            else:
+                raise e
+
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
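The config.py hunk above makes creation of the default security group best-effort: if the caller lacks `ec2:CreateSecurityGroup`, provisioning continues and cleanup falls back to blocking on instance termination (the instance.py change below). A standalone sketch of the same tolerate-missing-permission pattern in plain boto3; the function and group names here are illustrative, not SkyPilot's `_configure_security_group`:

```python
import boto3
from botocore.exceptions import ClientError


def try_create_default_sg(vpc_id: str, region: str,
                          group_name: str = 'default-sg') -> None:
    """Best-effort creation of a security group; missing permission is OK."""
    ec2 = boto3.client('ec2', region_name=region)
    try:
        ec2.create_security_group(GroupName=group_name,
                                  Description='Default security group',
                                  VpcId=vpc_id)
    except ClientError as e:
        code = e.response['Error']['Code']
        if code in ('InvalidGroup.Duplicate', 'UnauthorizedOperation'):
            # Already exists, or the caller cannot ec2:CreateSecurityGroup;
            # continue and fall back to blocking on instance termination.
            return
        raise
```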
sky/provision/aws/instance.py
CHANGED

@@ -713,7 +713,8 @@ def terminate_instances(
         instances.terminate()
     else:
         # Case 4: We are managing the non-default sg. The default SG does not
-        # exist. We must block on instance termination
+        # exist. We must block on instance termination so that we can
+        # delete the security group.
         instances.terminate()
         for instance in instances:
             instance.wait_until_terminated()
sky/provision/nebius/utils.py
CHANGED

@@ -36,8 +36,10 @@ def retry(func):
 
 def get_project_by_region(region: str) -> str:
     service = nebius.iam().ProjectServiceClient(nebius.sdk())
-    projects =
-
+    projects = nebius.sync_call(
+        service.list(
+            nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
+            timeout=nebius.READ_TIMEOUT))
 
     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(

@@ -56,19 +58,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
     """
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-            name=name,
-        )).wait()
-        cluster_id = cluster.metadata.id
-    except nebius.request_error():
-        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
                 parent_id=project_id,
                 name=name,
-            )
-
-
+            )))
+        cluster_id = cluster.metadata.id
+    except nebius.request_error():
+        cluster = nebius.sync_call(
+            service.create(nebius.compute().CreateGpuClusterRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=name,
+                ),
+                spec=nebius.compute().GpuClusterSpec(
+                    infiniband_fabric=fabric))))
         cluster_id = cluster.resource_id
     return cluster_id
 

@@ -78,14 +82,16 @@ def delete_cluster(name: str, region: str) -> None:
     project_id = get_project_by_region(region)
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-
-
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=name,
+            )))
         cluster_id = cluster.metadata.id
         logger.debug(f'Found GPU Cluster : {cluster_id}.')
-
-
+        nebius.sync_call(
+            service.delete(
+                nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
         logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
     except nebius.request_error():
         logger.debug('GPU Cluster does not exist.')

@@ -94,8 +100,10 @@ def delete_cluster(name: str, region: str) -> None:
 def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-
+    result = nebius.sync_call(
+        service.list(
+            nebius.compute().ListInstancesRequest(parent_id=project_id),
+            timeout=nebius.READ_TIMEOUT))
 
     instances = result
 

@@ -116,12 +124,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
 
 def stop(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'STOPPED':
             break
         time.sleep(POLL_INTERVAL)

@@ -138,12 +147,13 @@ def stop(instance_id: str) -> None:
 
 def start(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'RUNNING':
             break
         time.sleep(POLL_INTERVAL)

@@ -212,24 +222,26 @@ def launch(cluster_name_on_cloud: str,
                                              project_id, fabric)
 
     service = nebius.compute().DiskServiceClient(nebius.sdk())
-    disk =
-
-
-
-
-
-
-
-
-
-
+    disk = nebius.sync_call(
+        service.create(nebius.compute().CreateDiskRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=disk_name,
+            ),
+            spec=nebius.compute().DiskSpec(
+                source_image_family=nebius.compute().SourceImageFamily(
+                    image_family=image_family),
+                size_gibibytes=disk_size,
+                type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+            ))))
     disk_id = disk.resource_id
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
-        disk =
-
-
-
+        disk = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=disk_name,
+            )))
         if disk.status.state.name == 'READY':
             break
         logger.debug(f'Waiting for disk {disk_name} to be ready.')

@@ -254,50 +266,53 @@ def launch(cluster_name_on_cloud: str,
                     id=fs['filesystem_id'])))
 
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
-    sub_net =
-        parent_id=project_id,))
+    sub_net = nebius.sync_call(
+        service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
 
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
-
-
-
-
-
-
-
-
-
-        ).AttachedDiskSpec
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    logger.debug(f'Creating instance {instance_name} in project {project_id}.')
+    nebius.sync_call(
+        service.create(nebius.compute().CreateInstanceRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=instance_name,
+            ),
+            spec=nebius.compute().InstanceSpec(
+                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                    id=cluster_id,) if cluster_id is not None else None,
+                boot_disk=nebius.compute().AttachedDiskSpec(
+                    attach_mode=nebius.compute(
+                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+                cloud_init_user_data=user_data,
+                resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                         preset=preset),
+                filesystems=filesystems_spec if filesystems_spec else None,
+                network_interfaces=[
+                    nebius.compute().NetworkInterfaceSpec(
+                        subnet_id=sub_net.items[0].metadata.id,
+                        ip_address=nebius.compute().IPAddress(),
+                        name='network-interface-0',
+                        public_ip_address=nebius.compute().PublicIPAddress()
+                        if associate_public_ip_address else None,
+                    )
+                ],
+                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                if use_spot else None,
+                preemptible=nebius.compute().PreemptibleSpec(
+                    priority=1,
+                    on_preemption=nebius.compute().PreemptibleSpec.
+                    PreemptionPolicy.STOP) if use_spot else None,
+            ))))
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-
-
-
+        instance = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=instance_name,
+            )))
         if instance.status.state.name == 'STARTING':
             instance_id = instance.metadata.id
             break

@@ -317,19 +332,19 @@ def launch(cluster_name_on_cloud: str,
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-        nebius.compute().GetInstanceRequest(id=instance_id))
+    result = nebius.sync_call(
+        service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
     disk_id = result.spec.boot_disk.existing_disk.id
-
-        nebius.compute().DeleteInstanceRequest(id=instance_id))
+    nebius.sync_call(
+        service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
     retry_count = 0
     # The instance begins deleting and attempts to delete the disk.
     # Must wait until the disk is unlocked and becomes deletable.
     while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
         try:
             service = nebius.compute().DiskServiceClient(nebius.sdk())
-
-                nebius.compute().DeleteDiskRequest(id=disk_id))
+            nebius.sync_call(
+                service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
             break
         except nebius.request_error():
             logger.debug('Waiting for disk deletion.')
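Every Nebius SDK call above is now routed through `nebius.sync_call(...)` instead of the earlier inline `.wait()` style. The helper itself lives in sky/adaptors/nebius.py (+43 -1 in the file list), which this diff view does not expand; the following is only a minimal sketch of what such a wrapper could look like, assuming the wrapped SDK methods return awaitables and that no event loop is already running in the calling thread:

```python
import asyncio
from typing import Awaitable, TypeVar

T = TypeVar('T')


def sync_call(awaitable: Awaitable[T]) -> T:
    """Drive an async SDK call to completion from synchronous code (sketch)."""

    async def _run() -> T:
        return await awaitable

    # asyncio.run() spins up a fresh event loop per call; the real adaptor may
    # instead reuse a long-lived loop or the SDK's own synchronous facilities.
    return asyncio.run(_run())
```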
sky/provision/provisioner.py
CHANGED

@@ -76,7 +76,8 @@ def _bulk_provision(
     logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
     rich_utils.force_update_status(
         ux_utils.spinner_message('Launching - Checking instance status',
-                                 str(provision_logging.config.log_path)
+                                 str(provision_logging.config.log_path),
+                                 cluster_name=str(cluster_name)))
     # AWS would take a very short time (<<1s) updating the state of the
     # instance.
     time.sleep(1)

@@ -462,9 +463,9 @@ def _post_provision_setup(
     docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            ux_utils.spinner_message(
-
-
+            ux_utils.spinner_message('Launching - Waiting for SSH access',
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name))) as status:
         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
         # ready by the provisioner, and we use kubectl instead of SSH to run the
         # commands and rsync on the pods. SSH will still be ready after a while

@@ -493,7 +494,8 @@ def _post_provision_setup(
         status.update(
             ux_utils.spinner_message(
                 'Launching - Initializing docker container',
-                provision_logging.config.log_path
+                provision_logging.config.log_path,
+                cluster_name=str(cluster_name)))
         docker_user = instance_setup.initialize_docker(
             cluster_name.name_on_cloud,
             docker_config=docker_config,

@@ -541,7 +543,8 @@ def _post_provision_setup(
 
     runtime_preparation_str = (ux_utils.spinner_message(
         'Preparing SkyPilot runtime ({step}/3 - {step_name})',
-        provision_logging.config.log_path
+        provision_logging.config.log_path,
+        cluster_name=str(cluster_name)))
     status.update(
         runtime_preparation_str.format(step=1, step_name='initializing'))
     instance_setup.internal_file_mounts(cluster_name.name_on_cloud,

@@ -679,7 +682,8 @@ def _post_provision_setup(
     if logging_agent:
         status.update(
             ux_utils.spinner_message('Setting up logging agent',
-                                     provision_logging.config.log_path
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name)))
         instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
                                                 cluster_info,
                                                 ssh_credentials)

@@ -689,7 +693,8 @@ def _post_provision_setup(
 
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
-                                   provision_logging.config.log_path
+                                   provision_logging.config.log_path,
+                                   cluster_name=str(cluster_name)))
     return cluster_info
 
 
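All of the provisioner call sites above now pass `cluster_name=str(cluster_name)` into `ux_utils.spinner_message` and `ux_utils.finishing_message`; the matching signature change is in sky/utils/ux_utils.py (+36 -5), which is not shown in this view. A rough, purely illustrative sketch of a message helper with such an optional parameter (not the actual ux_utils implementation):

```python
from typing import Optional


def spinner_message(message: str,
                    log_path: Optional[str] = None,
                    cluster_name: Optional[str] = None) -> str:
    """Format a status message, optionally tagging the log path and cluster."""
    hints = []
    if log_path is not None:
        hints.append(f'logs: {log_path}')
    if cluster_name is not None:
        hints.append(f'cluster: {cluster_name}')
    return message + (f" ({', '.join(hints)})" if hints else '')
```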
sky/resources.py
CHANGED

@@ -37,7 +37,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
-
+DEFAULT_DISK_SIZE_GB = 256
 
 RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',

@@ -319,7 +319,7 @@ class Resources:
             self._disk_size = int(
                 resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
-            self._disk_size =
+            self._disk_size = DEFAULT_DISK_SIZE_GB
 
         self._image_id: Optional[Dict[Optional[str], str]] = None
         if isinstance(image_id, str):

@@ -482,7 +482,7 @@ class Resources:
             network_tier = f', network_tier={self.network_tier.value}'
 
         disk_size = ''
-        if self.disk_size !=
+        if self.disk_size != DEFAULT_DISK_SIZE_GB:
             disk_size = f', disk_size={self.disk_size}'
 
         ports = ''

@@ -1766,7 +1766,7 @@ class Resources:
             self._accelerators is None,
             self._accelerator_args is None,
             not self._use_spot_specified,
-            self._disk_size ==
+            self._disk_size == DEFAULT_DISK_SIZE_GB,
             self._disk_tier is None,
             self._network_tier is None,
             self._image_id is None,

@@ -2255,7 +2255,7 @@ class Resources:
         accelerator_args = state.pop('accelerator_args', None)
         state['_accelerator_args'] = accelerator_args
 
-        disk_size = state.pop('disk_size',
+        disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
         state['_disk_size'] = disk_size
 
         if version < 2: