skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -275,65 +275,6 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
     return list(env_dict.items())
 
 
-def _format_job_ids_str(job_ids: List[int], max_length: int = 30) -> str:
-    """Format job IDs string with ellipsis if too long.
-
-    Args:
-        job_ids: List of job IDs to format.
-        max_length: Maximum length of the output string.
-
-    Returns:
-        Formatted string like "11,12,...,2017,2018" if truncated,
-        or the full string if it fits within max_length.
-    """
-    if not job_ids:
-        return ''
-
-    # Convert all to strings
-    job_strs = [str(job_id) for job_id in job_ids]
-    full_str = ','.join(job_strs)
-
-    # If it fits, return as is
-    if len(full_str) <= max_length:
-        return full_str
-
-    if len(job_strs) <= 2:
-        return full_str  # Can't truncate further
-
-    # Need to truncate with ellipsis
-    ellipsis = '...'
-
-    # Start with minimum: first and last
-    start_count = 1
-    end_count = 1
-
-    while start_count + end_count < len(job_strs):
-        # Try adding one more to start
-        if start_count + 1 + end_count < len(job_strs):
-            start_part = ','.join(job_strs[:start_count + 1])
-            end_part = ','.join(job_strs[-end_count:])
-            candidate = f'{start_part},{ellipsis},{end_part}'
-            if len(candidate) <= max_length:
-                start_count += 1
-                continue
-
-        # Try adding one more to end
-        if start_count + end_count + 1 < len(job_strs):
-            start_part = ','.join(job_strs[:start_count])
-            end_part = ','.join(job_strs[-(end_count + 1):])
-            candidate = f'{start_part},{ellipsis},{end_part}'
-            if len(candidate) <= max_length:
-                end_count += 1
-                continue
-
-        # Can't add more
-        break
-
-    start_part = ','.join(job_strs[:start_count])
-    end_part = ','.join(job_strs[-end_count:])
-    return f'{start_part},{ellipsis},{end_part}'
-
-
 def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
                            incomplete: str) -> List[str]:
     """Handle shell completion for cluster names."""
@@ -1187,11 +1128,15 @@ def launch(
         raise ValueError(f'{backend_name} backend is not supported.')
 
     if task.service is not None:
+        noun = 'pool' if task.service.pool else 'service'
+        capnoun = noun.capitalize()
+        sysname = 'Jobs Worker Pool' if task.service.pool else 'SkyServe'
+        cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
         logger.info(
-            f'{colorama.Fore.YELLOW}
-            f'`sky launch`. {colorama.Style.RESET_ALL}\n
-            'To spin up a
-            f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}
+            f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
+            f'using `sky launch`. {colorama.Style.RESET_ALL}\n'
+            f'{colorama.Fore.YELLOW}To spin up a {noun}, use {sysname} CLI: '
+            f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}{cmd}'
             f'{colorama.Style.RESET_ALL}')
 
     request_id = sdk.launch(
@@ -2226,6 +2171,10 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
 
 @cli.command()
 @flags.config_option(expose_value=False)
+@click.option('--provision',
+              is_flag=True,
+              default=False,
+              help='Stream the cluster provisioning logs (provision.log).')
 @click.option(
     '--sync-down',
     '-s',
@@ -2262,6 +2211,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
 def logs(
     cluster: str,
     job_ids: Tuple[str, ...],
+    provision: bool,
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
@@ -2291,6 +2241,11 @@ def logs(
     4. If the job fails or fetching the logs fails, the command will exit with
     a non-zero return code.
     """
+    if provision and (sync_down or status or job_ids):
+        raise click.UsageError(
+            '--provision cannot be combined with job log options '
+            '(--sync-down/--status/job IDs).')
+
     if sync_down and status:
         raise click.UsageError(
             'Both --sync_down and --status are specified '
@@ -2303,6 +2258,10 @@ def logs(
 
     job_ids = None if not job_ids else job_ids
 
+    if provision:
+        # Stream provision logs
+        sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
+
     if sync_down:
         with rich_utils.client_status(
                 ux_utils.spinner_message('Downloading logs')):
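Net effect of the `sky logs` hunks above: a new `--provision` mode that is mutually exclusive with the job-log options and short-circuits into the new `sdk.tail_provision_logs` call, reusing the command's existing follow/tail flags. So, for a hypothetical cluster, `sky logs --provision my-cluster` streams provision.log instead of job output.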
@@ -2981,15 +2940,15 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name
 
-    # TODO(tian): We also need to check pools after we allow running pools on
-    # jobs controller.
     with rich_utils.client_status(
-            '[bold cyan]Checking for in-progress managed jobs[/]'):
+            '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
         try:
             request_id = managed_jobs.queue(refresh=False,
                                             skip_finished=True,
                                             all_users=True)
             managed_jobs_ = sdk.stream_and_get(request_id)
+            request_id_pools = managed_jobs.pool_status(pool_names=None)
+            pools_ = sdk.stream_and_get(request_id_pools)
         except exceptions.ClusterNotUpError as e:
             if controller.value.connection_error_hint in str(e):
                 with ux_utils.print_exception_no_traceback():
@@ -3004,6 +2963,7 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
             # the controller being STOPPED or being firstly launched, i.e.,
             # there is no in-prgress managed jobs.
             managed_jobs_ = []
+            pools_ = []
         except exceptions.InconsistentConsolidationModeError:
             # If this error is raised, it means the user switched to the
             # consolidation mode but the previous controller cluster is still
@@ -3021,6 +2981,8 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
                                             skip_finished=True,
                                             all_users=True)
             managed_jobs_ = sdk.stream_and_get(request_id)
+            request_id_pools = managed_jobs.pool_status(pool_names=None)
+            pools_ = sdk.stream_and_get(request_id_pools)
 
     msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
            'jobs controller. Please be aware of the following:'
@@ -3042,9 +3004,23 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
         else:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.NotSupportedError(msg)
+    elif pools_:
+        pool_names = ', '.join([pool['name'] for pool in pools_])
+        if purge:
+            logger.warning('--purge is set, ignoring the in-progress pools. '
+                           'This could cause leaked clusters!')
+        else:
+            msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
+                   'jobs controller is not supported, as it is currently '
+                   f'hosting the following pools: {pool_names}. Please '
+                   'terminate the pools first with '
+                   f'{colorama.Style.BRIGHT}sky jobs pool down -a'
+                   f'{colorama.Style.RESET_ALL}.')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.NotSupportedError(msg)
     else:
-        click.echo(' * No in-progress managed jobs found. It
-                   'terminate (see caveats above).')
+        click.echo(' * No in-progress managed jobs or running pools found. It '
+                   'should be safe to terminate (see caveats above).')
 
 
 def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
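In short, tearing down the jobs controller now also checks for live pools: `sky down` on the controller is refused while any pool is running (unless `--purge` is passed, at the risk of leaked clusters), and the error message points at `sky jobs pool down -a`.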
@@ -4509,8 +4485,8 @@ def jobs_launch(
     if print_setup_fm_warning:
         click.secho(
             f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
-            ' will be ignored
-            f'use `sky pool apply {pool} pool.yaml`. '
+            ' will be ignored when submit jobs to pool. To update a pool, '
+            f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
             f'{colorama.Style.RESET_ALL}')
 
     # Optimize info is only show if _need_confirmation.
@@ -4537,7 +4513,9 @@ def jobs_launch(
                                      controller=False)
         sys.exit(returncode)
     else:
-
+        # TODO(tian): This can be very long. Considering have a "group id"
+        # and query all job ids with the same group id.
+        job_ids_str = ','.join(map(str, job_ids))
         click.secho(
             f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
             f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4822,7 +4800,7 @@ def pool():
               type=str,
               nargs=-1,
               **_get_shell_complete_args(_complete_file_name))
-@click.option('--pool
+@click.option('--pool',
               '-p',
               default=None,
               type=str,
@@ -4844,7 +4822,7 @@ def pool():
 @usage_lib.entrypoint
 def jobs_pool_apply(
     pool_yaml: Tuple[str, ...],
-
+    pool: Optional[str],  # pylint: disable=redefined-outer-name
     workdir: Optional[str],
     infra: Optional[str],
     cloud: Optional[str],
@@ -4877,11 +4855,11 @@ def jobs_pool_apply(
     """
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
-    if
-
+    if pool is None:
+        pool = serve_lib.generate_service_name(pool=True)
 
     task = _generate_task_with_service(
-        service_name=
+        service_name=pool,
         service_yaml_args=pool_yaml,
         workdir=workdir,
         cloud=cloud,
@@ -4918,7 +4896,7 @@ def jobs_pool_apply(
     dag.add(task)
 
     request_id = managed_jobs.pool_apply(task,
-
+                                         pool,
                                          mode=serve_lib.UpdateMode(mode),
                                          _need_confirmation=not yes)
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -5156,7 +5134,7 @@ def _handle_serve_logs(
 @usage_lib.entrypoint
 # TODO(tian): Add default argument for this CLI if none of the flags are
 # specified.
-def
+def jobs_pool_logs(
     pool_name: str,
     follow: bool,
     controller: bool,
@@ -6073,7 +6051,7 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     # server accepts log_path-only streaming.
     req_id = (server_common.RequestId[None](request_id)
               if request_id is not None else None)
-    sdk.stream_and_get(req_id, log_path, tail, follow
+    sdk.stream_and_get(req_id, log_path, tail, follow)
 
 
 @api.command('cancel', cls=_DocumentedCodeCommand)
@@ -6212,16 +6190,15 @@ def api_info():
     """Shows the SkyPilot API server URL."""
     url = server_common.get_server_url()
     api_server_info = sdk.api_info()
-    api_server_user = api_server_info.
+    api_server_user = api_server_info.user
     if api_server_user is not None:
-        user =
-                           name=api_server_user['name'])
+        user = api_server_user
     else:
         user = models.User.get_current_user()
     click.echo(f'Using SkyPilot API server and dashboard: {url}\n'
-               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info
-               f'commit: {api_server_info
-               f'version: {api_server_info
+               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info.status}, '
+               f'commit: {api_server_info.commit}, '
+               f'version: {api_server_info.version}\n'
               f'{ux_utils.INDENT_LAST_SYMBOL}User: {user.name} ({user.id})')
sky/client/common.py
CHANGED
sky/client/sdk.py
CHANGED
@@ -30,6 +30,7 @@ from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.client import oauth as oauth_lib
+from sky.schemas.api import responses
 from sky.server import common as server_common
 from sky.server import rest
 from sky.server import versions
@@ -66,8 +67,8 @@ if typing.TYPE_CHECKING:
 
     import sky
     from sky import backends
+    from sky import catalog
     from sky import models
-    import sky.catalog
     from sky.provision.kubernetes import utils as kubernetes_utils
     from sky.skylet import job_lib
 else:
@@ -234,7 +235,7 @@ def list_accelerators(
     require_price: bool = True,
     case_sensitive: bool = True
 ) -> server_common.RequestId[Dict[str,
-                                  List['
+                                  List['catalog.common.InstanceTypeInfo']]]:
     """Lists the names of all accelerators offered by Sky.
 
     This will include all accelerators offered by Sky, including those
@@ -479,11 +480,11 @@ def launch(
             This option works in conjunction with ``idle_minutes_to_autostop``.
             Choices:
 
-            1. "jobs_and_ssh" (default) - Wait for
-
-            2. "jobs" -
-            3. "none" -
-
+            1. "jobs_and_ssh" (default) - Wait for in-progress jobs and SSH
+               connections to finish.
+            2. "jobs" - Only wait for in-progress jobs.
+            3. "none" - Wait for nothing; autostop right after
+               ``idle_minutes_to_autostop``.
         dryrun: if True, do not actually launch the cluster.
         down: Tear down the cluster after all jobs finish (successfully or
             abnormally). If --idle-minutes-to-autostop is also set, the
@@ -854,6 +855,56 @@ def tail_logs(cluster_name: str,
                     resumable=(tail == 0))
 
 
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(17)
+@annotations.client_api
+@rest.retry_transient_errors()
+def tail_provision_logs(cluster_name: str,
+                        follow: bool = True,
+                        tail: int = 0,
+                        output_stream: Optional['io.TextIOBase'] = None) -> int:
+    """Tails the provisioning logs (provision.log) for a cluster.
+
+    Args:
+        cluster_name: name of the cluster.
+        follow: follow the logs.
+        tail: lines from end to tail.
+        output_stream: optional stream to write logs.
+    Returns:
+        Exit code 0 on streaming success; raises on HTTP error.
+    """
+    body = payloads.ClusterNameBody(cluster_name=cluster_name)
+    params = {
+        'follow': str(follow).lower(),
+        'tail': tail,
+    }
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/provision_logs',
+        json=json.loads(body.model_dump_json()),
+        params=params,
+        stream=True,
+        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
+                 None))
+    # Log request is idempotent when tail is 0, thus can resume previous
+    # streaming point on retry.
+    # request_id=None here because /provision_logs does not create an async
+    # request. Instead, it streams a plain file from the server. This does NOT
+    # violate the stream_response doc warning about None in multi-user
+    # environments: we are not asking stream_response to select “the latest
+    # request”. We already have the HTTP response to stream; request_id=None
+    # merely disables the follow-up GET. It is also necessary for --no-follow
+    # to return cleanly after printing the tailed lines. If we provided a
+    # non-None request_id here, the get(request_id) in stream_response(
+    # would fail since /provision_logs does not create a request record.
+    stream_response(request_id=None,
+                    response=response,
+                    output_stream=output_stream,
+                    resumable=(tail == 0))
+    return 0
+
+
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
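The new entrypoint can also be driven straight from Python; a minimal sketch (the cluster name is hypothetical, and this is roughly what the CLI's `--provision` path runs):

    from sky.client import sdk

    # Print the last 100 lines of provision.log for the cluster and return
    # without following; with follow=True (the default) this streams until
    # provisioning finishes.
    exit_code = sdk.tail_provision_logs('my-cluster', follow=False, tail=100)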
@@ -935,11 +986,11 @@ def start(
             This option works in conjunction with ``idle_minutes_to_autostop``.
             Choices:
 
-            1. "jobs_and_ssh" (default) - Wait for
-
-            2. "jobs" -
-            3. "none" -
-
+            1. "jobs_and_ssh" (default) - Wait for in-progress jobs and SSH
+               connections to finish.
+            2. "jobs" - Only wait for in-progress jobs.
+            3. "none" - Wait for nothing; autostop right after
+               ``idle_minutes_to_autostop``.
         retry_until_up: whether to retry launching the cluster until it is
             up.
         down: Autodown the cluster: tear down the cluster after specified
@@ -1118,11 +1169,10 @@ def autostop(
             This option works in conjunction with ``idle_minutes``.
             Choices:
 
-            1. "jobs_and_ssh" (default) - Wait for
-
-            2. "jobs" -
-            3. "none" -
-            regardless of running jobs or SSH connections.
+            1. "jobs_and_ssh" (default) - Wait for in-progress jobs and SSH
+               connections to finish.
+            2. "jobs" - Only wait for in-progress jobs.
+            3. "none" - Wait for nothing; autostop right after ``idle_minutes``.
         down: if true, use autodown (tear down the cluster; non-restartable),
             rather than autostop (restartable).
 
@@ -2059,7 +2109,7 @@ def api_status(
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def api_info() ->
+def api_info() -> responses.APIHealthResponse:
     """Gets the server's status, commit and version.
 
     Returns:
@@ -2084,7 +2134,7 @@ def api_info() -> Dict[str, Any]:
     """
     response = server_common.make_authenticated_request('GET', '/api/health')
     response.raise_for_status()
-    return response.json()
+    return responses.APIHealthResponse(**response.json())
 
 
 @usage_lib.entrypoint
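With `api_info()` now returning a typed `responses.APIHealthResponse` instead of a raw dict, callers move from key lookups to attribute access, as the `sky api info` hunk in command.py above does. A sketch:

    from sky.client import sdk

    info = sdk.api_info()
    # Previously: info['status'], info['commit'], info['version'].
    print(info.status, info.commit, info.version)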
sky/client/sdk_async.py
CHANGED
@@ -20,13 +20,14 @@ import colorama
 
 from sky import admin_policy
 from sky import backends
+from sky import catalog
 from sky import exceptions
 from sky import models
 from sky import sky_logging
-import sky.catalog
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.server import common as server_common
 from sky.server import rest
 from sky.server.requests import payloads
@@ -301,7 +302,7 @@ async def list_accelerators(
     require_price: bool = True,
     case_sensitive: bool = True,
     stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
-) -> Dict[str, List[
+) -> Dict[str, List[catalog.common.InstanceTypeInfo]]:
     """Async version of list_accelerators() that lists the names of all
     accelerators offered by Sky."""
     request_id = await context_utils.to_thread(sdk.list_accelerators, gpus_only,
@@ -345,7 +346,7 @@ async def optimize(
     admin_policy_request_options: Optional[
         admin_policy.RequestOptions] = None,
     stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
-) -> sky.
+) -> 'sky.Dag':
     """Async version of optimize() that finds the best execution plan for the
     given DAG."""
     request_id = await context_utils.to_thread(sdk.optimize, dag, minimize,
@@ -786,7 +787,7 @@ async def dashboard(starting_page: Optional[str] = None) -> None:
 
 @usage_lib.entrypoint
 @annotations.client_api
-async def api_info() ->
+async def api_info() -> responses.APIHealthResponse:
     """Async version of api_info() that gets the server's status, commit and
     version."""
     return await context_utils.to_thread(sdk.api_info)
sky/clouds/aws.py
CHANGED
@@ -65,6 +65,8 @@ _CREDENTIAL_FILES = [
 ]
 
 DEFAULT_AMI_GB = 45
+DEFAULT_SSH_USER = 'ubuntu'
+DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
 
 # Temporary measure, as deleting per-cluster SGs is too slow.
 # See https://github.com/skypilot-org/skypilot/pull/742.
@@ -343,6 +345,44 @@ class AWS(clouds.Cloud):
                 raise ValueError(image_not_found_message) from None
         return image_size
 
+    @classmethod
+    @annotations.lru_cache(scope='request', maxsize=1)
+    def get_image_root_device_name(cls, image_id: str,
+                                   region: Optional[str]) -> str:
+        if image_id.startswith('skypilot:'):
+            return DEFAULT_ROOT_DEVICE_NAME
+        assert region is not None, (image_id, region)
+        image_not_found_message = (
+            f'Image {image_id!r} not found in AWS region {region}.\n'
+            f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n'  # pylint: disable=line-too-long
+            'Example: ami-0729d913a335efca7')
+        try:
+            client = aws.client('ec2', region_name=region)
+            image_info = client.describe_images(ImageIds=[image_id]).get(
+                'Images', [])
+            if not image_info:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(image_not_found_message)
+            image = image_info[0]
+            if 'RootDeviceName' not in image:
+                logger.warning(f'Image {image_id!r} does not have a root '
+                               f'device name. '
+                               f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
+                return DEFAULT_ROOT_DEVICE_NAME
+            return image['RootDeviceName']
+        except (aws.botocore_exceptions().NoCredentialsError,
+                aws.botocore_exceptions().ProfileNotFound):
+            # Fallback to default root device name if no credentials are
+            # available.
+            # The credentials issue will be caught when actually provisioning
+            # the instance and appropriate errors will be raised there.
+            logger.warning(f'No credentials available for region {region}. '
+                           f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
+            return DEFAULT_ROOT_DEVICE_NAME
+        except aws.botocore_exceptions().ClientError:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(image_not_found_message) from None
+
     @classmethod
     def get_zone_shell_cmd(cls) -> Optional[str]:
         # The command for getting the current zone is from:
@@ -466,6 +506,15 @@ class AWS(clouds.Cloud):
         image_id = self._get_image_id(image_id_to_use, region_name,
                                       resources.instance_type)
 
+        root_device_name = self.get_image_root_device_name(
+            image_id, region_name)
+
+        ssh_user = skypilot_config.get_effective_region_config(
+            cloud='aws',
+            region=region_name,
+            keys=('ssh_user',),
+            default_value=DEFAULT_SSH_USER)
+
         disk_encrypted = skypilot_config.get_effective_region_config(
             cloud='aws',
             region=region_name,
@@ -509,6 +558,8 @@ class AWS(clouds.Cloud):
             'region': region_name,
             'zones': ','.join(zone_names),
             'image_id': image_id,
+            'root_device_name': root_device_name,
+            'ssh_user': ssh_user,
             'security_group': security_group,
             'security_group_managed_by_skypilot':
                 str(security_group != user_security_group).lower(),
@@ -1080,7 +1131,7 @@ class AWS(clouds.Cloud):
 
         image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
 
-        status = provision_lib.query_instances('AWS',
+        status = provision_lib.query_instances('AWS', cluster_name.display_name,
                                                cluster_name.name_on_cloud,
                                                {'region': region})
         instance_ids = list(status.keys())
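Two knobs fall out of these hunks: the AMI's root device name is now resolved (with `/dev/sda1` as the fallback), and the SSH user becomes a per-region override with `DEFAULT_SSH_USER = 'ubuntu'` as the default; the lookup keys suggest an `aws`-scoped `ssh_user` entry in the SkyPilot config. A hedged sketch of calling the new classmethod directly (the region is hypothetical, the AMI is the example from the error message above, and AWS credentials are assumed):

    from sky.clouds.aws import AWS

    # 'skypilot:' tagged images short-circuit to '/dev/sda1'; anything else
    # goes through EC2 describe_images, falling back to the default when the
    # AMI has no RootDeviceName or no credentials are available.
    root = AWS.get_image_root_device_name('ami-0729d913a335efca7', 'us-east-1')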
sky/clouds/kubernetes.py
CHANGED
@@ -3,7 +3,6 @@ import os
 import re
 import subprocess
 import tempfile
-import typing
 from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
 
 import colorama
@@ -11,6 +10,7 @@ import colorama
 from sky import catalog
 from sky import clouds
 from sky import exceptions
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
@@ -31,10 +31,6 @@ from sky.utils import resources_utils
 from sky.utils import schemas
 from sky.utils import volume as volume_lib
 
-if typing.TYPE_CHECKING:
-    # Renaming to avoid shadowing variables.
-    from sky import resources as resources_lib
-
 logger = sky_logging.init_logger(__name__)
 
 # Namespace for SkyPilot resources shared across multiple tenants on the
@@ -771,11 +767,25 @@ class Kubernetes(clouds.Cloud):
 
         return deploy_vars
 
+    @staticmethod
+    def _warn_on_disk_size(resources: 'resources_lib.Resources'):
+        if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
+            logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
+                        'is not supported by Kubernetes. '
+                        'To add additional disk, use volumes.'
+                        f'{colorama.Style.RESET_ALL}')
+        if resources.disk_tier is not None:
+            logger.info(f'{colorama.Style.DIM}Disk tier {resources.disk_tier} '
+                        'is not supported by Kubernetes. '
+                        'To add additional disk, use volumes.'
+                        f'{colorama.Style.RESET_ALL}')
+
     def _get_feasible_launchable_resources(
             self, resources: 'resources_lib.Resources'
     ) -> 'resources_utils.FeasibleResources':
         # TODO(zhwu): This needs to be updated to return the correct region
         # (context) that has enough resources.
+        self._warn_on_disk_size(resources)
         fuzzy_candidate_list: List[str] = []
         if resources.instance_type is not None:
             assert resources.is_launchable(), resources
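Practically, a Kubernetes launch that sets a non-default `disk_size` or any `disk_tier` now gets a dim informational hint at resource-selection time pointing to volumes, rather than having those fields silently ignored.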
sky/clouds/nebius.py
CHANGED
@@ -442,7 +442,9 @@ class Nebius(clouds.Cloud):
         del workspace_config  # Unused
         sdk = nebius.sdk()
         profile_client = nebius.iam().ProfileServiceClient(sdk)
-        profile =
+        profile = nebius.sync_call(
+            profile_client.get(nebius.iam().GetProfileRequest(),
+                               timeout=nebius.READ_TIMEOUT))
         if profile.user_profile is not None:
             if profile.user_profile.attributes is None:
                 raise exceptions.CloudUserIdentityError(
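The Nebius identity lookup now goes through `nebius.sync_call` with an explicit `READ_TIMEOUT`, so a hung IAM endpoint fails fast instead of blocking the identity check indefinitely.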
sky/dag.py
CHANGED
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>