skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -85,6 +85,12 @@ _JOB_CANCELLED_MESSAGE = (
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
+
+
 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
     CANCEL = 'CANCEL'
@@ -1120,7 +1126,17 @@ def stream_logs(job_id: Optional[int],
         return stream_logs_by_id(job_id, follow, tail)
 
 
-def dump_managed_job_queue() -> str:
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()
@@ -1147,6 +1163,31 @@ def dump_managed_job_queue() -> str:
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    if user_hashes:
+        jobs = [
+            job for job in jobs if job.get('user_hash', None) in user_hashes
+        ]
+    if accessible_workspaces:
+        jobs = [
+            job for job in jobs
+            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces
+        ]
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(
+                lambda job: not managed_job_state.ManagedJobStatus(job[
+                    'status']).is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    jobs, total = filter_jobs(jobs, workspace_match, name_match, pool_match,
+                              page, limit)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1220,12 +1261,96 @@ def dump_managed_job_queue() -> str:
         else:
             job['details'] = None
 
-    return message_utils.encode_payload(jobs)
+    return message_utils.encode_payload({'jobs': jobs, 'total': total})
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+
+    Returns:
+        List of filtered jobs and total number of jobs.
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]
+
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total
 
 
-def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
     """Load job queue from json string."""
-    jobs = message_utils.decode_payload(payload)
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    if isinstance(result, dict):
+        jobs = result['jobs']
+        total = result['total']
+    else:
+        jobs = result
+        total = len(jobs)
+        result_type = ManagedJobQueueResultType.LIST
+
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
@@ -1233,7 +1358,7 @@ def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs
+    return jobs, total, result_type
 
 
 def _get_job_status_from_tasks(
@@ -1580,9 +1705,35 @@ class ManagedJobCodeGen:
         """)
 
     @classmethod
-    def get_job_table(cls) -> str:
-        code = textwrap.dedent("""\
-        job_table = utils.dump_managed_job_queue()
+    def get_job_table(
+        cls,
+        skip_finished: bool = False,
+        accessible_workspaces: Optional[List[str]] = None,
+        job_ids: Optional[List[int]] = None,
+        workspace_match: Optional[str] = None,
+        name_match: Optional[str] = None,
+        pool_match: Optional[str] = None,
+        page: Optional[int] = None,
+        limit: Optional[int] = None,
+        user_hashes: Optional[List[Optional[str]]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
@@ -1690,6 +1841,7 @@ class ManagedJobCodeGen:
     def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
                     workspace: str, entrypoint: str) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
         set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1697,6 +1849,13 @@ class ManagedJobCodeGen:
             set_job_info_kwargs = {{}}
         if managed_job_version >= 5:
             set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+        if managed_job_version >= 8:
+            from sky.serve import serve_state
+            pool_hash = None
+            if {pool!r} != None:
+                pool_hash = serve_state.get_service_hash({pool!r})
+            set_job_info_kwargs['pool'] = {pool!r}
+            set_job_info_kwargs['pool_hash'] = pool_hash
         managed_job_state.set_job_info(
             {job_id}, {dag_name!r}, **set_job_info_kwargs)
         """)
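The new `filter_jobs` helper does substring matching on the `workspace`, `job_name`, and `pool` fields (plus `user_name` when enabled), then slices the surviving list into pages, returning both the page and the pre-pagination total. A minimal standalone sketch of the same filter-then-paginate semantics, using made-up job records rather than SkyPilot's real queue entries:

```python
from typing import Any, Dict, List, Optional, Tuple


def filter_and_paginate(
    jobs: List[Dict[str, Any]],
    name_match: Optional[str] = None,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], int]:
    """Substring-filter jobs by name, then paginate (page starts at 1)."""
    matched = [
        job for job in jobs
        if name_match is None or name_match in str(job.get('job_name', ''))
    ]
    total = len(matched)  # total is counted BEFORE pagination, as in the diff
    if page is None and limit is None:
        return matched, total
    assert page is not None and limit is not None, (page, limit)
    start = (page - 1) * limit
    return matched[start:start + limit], total


if __name__ == '__main__':
    sample = [{'job_id': i, 'job_name': f'train-{i}'} for i in range(5)]
    page_jobs, total = filter_and_paginate(sample, name_match='train',
                                           page=2, limit=2)
    print(page_jobs, total)  # -> jobs 2 and 3, total == 5
```

Reporting the total before slicing is what lets callers render page controls: with `page=2, limit=2` the slice starts at index `(page - 1) * limit == 2` while `total` still reflects all matching jobs.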
sky/provision/__init__.py
CHANGED
sky/provision/aws/config.py
CHANGED
@@ -19,6 +19,7 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
+from sky.clouds import aws as aws_cloud
 from sky.provision import common
 from sky.provision.aws import utils
 from sky.utils import annotations
@@ -103,6 +104,30 @@ def bootstrap_instances(
     security_group_ids = _configure_security_group(ec2, vpc_id,
                                                     expected_sg_name,
                                                     extended_ip_rules)
+    if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        logger.debug('Attempting to create the default security group.')
+        # Attempt to create the default security group. This is needed
+        # to enable us to use the default security group to quickly
+        # delete the cluster. If the default security group is not created,
+        # we will need to block on instance termination to delete the
+        # security group.
+        try:
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
+                                      [])
+            logger.debug('Default security group created.')
+        except exceptions.NoClusterLaunchedError as e:
+            if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
+                    e):
+                # User does not have permission to create the default
+                # security group.
+                logger.debug('User does not have permission to create '
+                             'the default security group. '
+                             f'{e}')
+                pass
+            else:
+                raise e
+
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
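The block added to `bootstrap_instances` is best-effort: when a custom security group is in use, SkyPilot also tries to create its default group so teardown can later detach instances onto it, and a missing `ec2:CreateSecurityGroup` permission is logged and ignored. A rough equivalent written directly against boto3 (illustrative names only, not SkyPilot's `_configure_security_group`):

```python
import boto3
from botocore.exceptions import ClientError


def ensure_security_group(region: str, vpc_id: str, group_name: str) -> bool:
    """Best-effort creation of a security group; returns True if it exists.

    Missing ec2:CreateSecurityGroup permission is tolerated, mirroring the
    behavior of the hunk above.
    """
    ec2 = boto3.client('ec2', region_name=region)
    try:
        ec2.create_security_group(GroupName=group_name,
                                  Description='Shared default security group',
                                  VpcId=vpc_id)
        return True
    except ClientError as e:
        code = e.response['Error']['Code']
        if code == 'InvalidGroup.Duplicate':
            return True  # already exists; nothing to do
        if code == 'UnauthorizedOperation':
            # No permission to create it; callers fall back to the slow path.
            return False
        raise
```

Treating `InvalidGroup.Duplicate` as success keeps the call idempotent across repeated launches.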
sky/provision/aws/instance.py
CHANGED
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             to_start_count,
             associate_public_ip_address=(
                 not config.provider_config['use_internal_ips']))
+
         created_instances.extend(created_remaining_instances)
     created_instances.sort(key=lambda x: x.id)
 
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
 # stop() and terminate() for example already implicitly assume non-terminated.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     region = provider_config['region']
     ec2 = _default_ec2_resource(region)
@@ -682,19 +685,40 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-
-
-
-
-
-
-    #
-
-
-
-
-
-
+    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        # Case 1: The default SG is used, we don't need to ensure instance are
+        # terminated.
+        instances.terminate()
+    elif not managed_by_skypilot:
+        # Case 2: We are not managing the non-default sg. We don't need to
+        # ensure instances are terminated.
+        instances.terminate()
+    elif (managed_by_skypilot and default_sg is not None):
+        # Case 3: We are managing the non-default sg. The default SG exists
+        # so we can move the instances to the default SG and terminate them
+        # without blocking.
+
+        # Make this multithreaded: modify all instances' SGs in parallel.
+        def modify_instance_sg(instance):
+            instance.modify_attribute(Groups=[default_sg.id])
+            logger.debug(f'Instance {instance.id} modified to use default SG:'
+                         f'{default_sg.id} for quick deletion.')
+
+        with pool.ThreadPool() as thread_pool:
+            thread_pool.map(modify_instance_sg, instances)
+            thread_pool.close()
+            thread_pool.join()
+
+        instances.terminate()
+    else:
+        # Case 4: We are managing the non-default sg. The default SG does not
+        # exist. We must block on instance termination so that we can
+        # delete the security group.
+        instances.terminate()
+        for instance in instances:
+            instance.wait_until_terminated()
+
     # TODO(suquark): Currently, the implementation of GCP and Azure will
     # wait util the cluster is fully terminated, while other clouds just
     # trigger the termination process (via http call) and then return.
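Case 3 above works because a running instance's security groups can be swapped in place: once every instance references only the pre-created default group, the SkyPilot-managed group has no dependents and can be deleted without waiting for termination to finish. A hedged boto3 sketch of that fast path (the helper name and arguments here are hypothetical, not SkyPilot's internals):

```python
from multiprocessing import pool
from typing import List

import boto3


def terminate_via_default_sg(region: str, instance_ids: List[str],
                             default_sg_id: str) -> None:
    """Move instances onto a pre-existing SG, then terminate without waiting."""
    ec2 = boto3.resource('ec2', region_name=region)
    instances = [ec2.Instance(i) for i in instance_ids]

    def _swap_sg(instance):
        # Re-pointing the instance at the default SG frees the managed SG
        # for deletion even while the instance is still shutting down.
        instance.modify_attribute(Groups=[default_sg_id])

    with pool.ThreadPool() as tp:
        tp.map(_swap_sg, instances)

    for instance in instances:
        instance.terminate()  # no wait_until_terminated() needed on this path
```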
sky/provision/azure/instance.py
CHANGED
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
 
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
 
     subscription_id = provider_config['subscription_id']
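The AWS, Azure, and following provider hunks all make the same interface change: `query_instances` now receives the user-facing `cluster_name` ahead of `cluster_name_on_cloud`, and providers with no use for it delete it immediately. A minimal sketch of a provider module implementing the updated signature (a hypothetical provider, with a plain string standing in for SkyPilot's `status_lib.ClusterStatus`):

```python
from typing import Any, Dict, Optional, Tuple


def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
    """Map instance name -> (status, optional reason).

    Most providers ignore cluster_name; Kubernetes uses it to attach pod and
    node events to the cluster's event log (see the hunks further below).
    """
    del cluster_name  # unused by this hypothetical provider
    assert provider_config is not None, cluster_name_on_cloud
    # ... call the cloud API here, translate instance states, and drop
    # terminated instances when non_terminated_only is True ...
    del non_terminated_only
    return {}
```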
sky/provision/cudo/instance.py
CHANGED
@@ -191,11 +191,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/do/instance.py
CHANGED
@@ -242,11 +242,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     # terminated instances are not retrieved by the
     # API making `non_terminated_only` argument moot.
     del non_terminated_only
sky/provision/fluidstack/instance.py
CHANGED
@@ -287,11 +287,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/gcp/instance.py
CHANGED
@@ -58,11 +58,13 @@ def _filter_instances(
 # for terminated instances, if they have already been fully deleted.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     zone = provider_config['availability_zone']
     project_id = provider_config['project_id']
sky/provision/hyperbolic/instance.py
CHANGED
@@ -304,12 +304,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Returns the status of the specified instances for Hyperbolic."""
-    del provider_config  # unused
+    del cluster_name, provider_config  # unused
     # Fetch all instances for this cluster
     instances = utils.list_instances(
         metadata={'skypilot': {
sky/provision/kubernetes/instance.py
CHANGED
@@ -1,10 +1,12 @@
 """Kubernetes instance provisioning."""
 import copy
 import json
+import re
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
@@ -24,6 +26,7 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils.db import db_utils
 
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
@@ -1270,7 +1273,116 @@ def _get_pod_termination_reason(pod: Any) -> str:
     return ' | '.join(reasons)
 
 
+def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                            cluster_name: str, pod_name: str) -> Optional[str]:
+    logger.debug(f'Analyzing events for pod {pod_name}')
+    pod_field_selector = (
+        f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
+    pod_events = kubernetes.core_api(context).list_namespaced_event(
+        namespace,
+        field_selector=pod_field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT).items
+    pod_events = sorted(
+        pod_events,
+        key=lambda event: event.metadata.creation_timestamp,
+        # latest event appears first
+        reverse=True)
+    last_scheduled_node = None
+    insert_new_pod_event = True
+    new_event_inserted = False
+    for event in pod_events:
+        if event.reason == 'Scheduled':
+            pattern = r'Successfully assigned (\S+) to (\S+)'
+            match = re.search(pattern, event.message)
+            if match:
+                scheduled_node = match.group(2)
+                last_scheduled_node = scheduled_node
+        if insert_new_pod_event:
+            # Try inserting the latest events first. If the event is a
+            # duplicate, it means the event (and any previous events) have
+            # already been inserted - so do not insert further events.
+            try:
+                global_user_state.add_cluster_event(
+                    cluster_name,
+                    None, f'[kubernetes pod {pod_name}] '
+                    f'{event.reason} {event.message}',
+                    global_user_state.ClusterEventType.DEBUG,
+                    transitioned_at=int(
+                        event.metadata.creation_timestamp.timestamp()),
+                    expose_duplicate_error=True)
+            except db_utils.UniqueConstraintViolationError:
+                insert_new_pod_event = False
+            else:
+                new_event_inserted = True
+
+    if last_scheduled_node is not None:
+        node_field_selector = ('involvedObject.kind=Node,'
+                               f'involvedObject.name={last_scheduled_node}')
+        node_events = kubernetes.core_api(context).list_namespaced_event(
+            namespace,
+            field_selector=node_field_selector,
+            _request_timeout=kubernetes.API_TIMEOUT).items
+        node_events = sorted(
+            node_events,
+            key=lambda event: event.metadata.creation_timestamp,
+            # latest event appears first
+            reverse=True)
+        insert_new_node_event = True
+        for event in node_events:
+            if insert_new_node_event:
+                # Try inserting the latest events first. If the event is a
+                # duplicate, it means the event (and any previous events) have
+                # already been inserted - so do not insert further events.
+                try:
+                    global_user_state.add_cluster_event(
+                        cluster_name,
+                        None, f'[kubernetes node {last_scheduled_node}] '
+                        f'{event.reason} {event.message}',
+                        global_user_state.ClusterEventType.DEBUG,
+                        transitioned_at=int(
+                            event.metadata.creation_timestamp.timestamp()),
+                        expose_duplicate_error=True)
+                except db_utils.UniqueConstraintViolationError:
+                    insert_new_node_event = False
+                else:
+                    new_event_inserted = True
+
+    if not new_event_inserted:
+        # If new event is not inserted, there is no useful information to
+        # return. Return None.
+        return None
+
+    # Analyze the events for failure
+    failure_reason = None
+    failure_decisiveness = 0
+
+    def _record_failure_reason(reason: str, decisiveness: int):
+        nonlocal failure_reason, failure_decisiveness
+        if decisiveness > failure_decisiveness:
+            failure_reason = reason
+            failure_decisiveness = decisiveness
+
+    cluster_events = global_user_state.get_cluster_events(
+        cluster_name, None, global_user_state.ClusterEventType.DEBUG)
+    for event in cluster_events:
+        if event.startswith('[kubernetes pod'):
+            event = event.split(']')[1].strip()
+        elif event.startswith('[kubernetes node'):
+            event = event.split(']')[1].strip()
+
+        if event.startswith('NodeNotReady '):
+            _record_failure_reason(event[len('NodeNotReady '):], 1)
+        elif event.startswith('TaintManagerEviction '):
+            # usually the event message for TaintManagerEviction is not useful
+            # so we record a more generic message.
+            _record_failure_reason('pod was evicted by taint manager', 2)
+        elif event.startswith('DeletingNode '):
+            _record_failure_reason(event[len('DeletingNode '):], 3)
+    return failure_reason
+
+
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True
@@ -1334,6 +1446,27 @@ def query_instances(
         pod_name = pod.metadata.name
         reason = f'{pod_name}: {reason}' if reason is not None else None
         cluster_status[pod_name] = (pod_status, reason)
+
+    # Find the list of pod names that should be there
+    # from k8s services. Filter duplicates as -ssh service
+    # creates a duplicate entry.
+    target_pod_names = list(
+        set([
+            service['spec']['selector']['component']
+            for service in provider_config.get('services', [])
+        ]))
+
+    for target_pod_name in target_pod_names:
+        if target_pod_name not in cluster_status:
+            # If the pod is not in the cluster_status, it means it's not
+            # running.
+            # Analyze what happened to the pod based on events.
+            reason = _get_pod_missing_reason(context, namespace, cluster_name,
+                                             target_pod_name)
+            reason = (f'{target_pod_name}: {reason}'
+                      if reason is not None else None)
+            cluster_status[target_pod_name] = (None, reason)
+
     return cluster_status
 
 
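`_get_pod_missing_reason` reconstructs why a pod disappeared from the Kubernetes event log: pod events are fetched with a field selector on the involved object, sorted newest-first, mined for the node the pod was scheduled on, and then that node's events are scanned for reasons such as `NodeNotReady` or `DeletingNode`. A standalone sketch of that event query using the official `kubernetes` Python client directly (SkyPilot itself goes through its `sky.adaptors.kubernetes` wrapper and persists events via `global_user_state`):

```python
from typing import List, Optional

from kubernetes import client, config


def recent_pod_events(namespace: str, pod_name: str,
                      context: Optional[str] = None) -> List[str]:
    """Return '<reason> <message>' strings for a pod, newest event first."""
    config.load_kube_config(context=context)
    core = client.CoreV1Api()
    selector = f'involvedObject.kind=Pod,involvedObject.name={pod_name}'
    events = core.list_namespaced_event(namespace,
                                        field_selector=selector).items
    # Kubernetes does not guarantee ordering, so sort by creation time.
    events.sort(key=lambda e: e.metadata.creation_timestamp, reverse=True)
    return [f'{e.reason} {e.message}' for e in events]


if __name__ == '__main__':
    for line in recent_pod_events('default', 'my-cluster-head'):
        print(line)
```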
sky/provision/lambda_cloud/instance.py
CHANGED
@@ -226,11 +226,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/nebius/instance.py
CHANGED
@@ -247,11 +247,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
                                   cluster_name_on_cloud, None)