skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/client/cli.py
CHANGED
@@ -25,7 +25,6 @@ each other.
|
|
25
25
|
"""
|
26
26
|
import collections
|
27
27
|
import copy
|
28
|
-
import datetime
|
29
28
|
import fnmatch
|
30
29
|
import functools
|
31
30
|
import os
|
@@ -34,7 +33,6 @@ import shlex
|
|
34
33
|
import shutil
|
35
34
|
import subprocess
|
36
35
|
import sys
|
37
|
-
import textwrap
|
38
36
|
import traceback
|
39
37
|
import typing
|
40
38
|
from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
|
@@ -52,15 +50,12 @@ from sky import backends
|
|
52
50
|
from sky import catalog
|
53
51
|
from sky import clouds
|
54
52
|
from sky import exceptions
|
55
|
-
from sky import global_user_state
|
56
53
|
from sky import jobs as managed_jobs
|
57
54
|
from sky import models
|
58
55
|
from sky import serve as serve_lib
|
59
56
|
from sky import sky_logging
|
60
57
|
from sky import skypilot_config
|
61
58
|
from sky.adaptors import common as adaptors_common
|
62
|
-
from sky.benchmark import benchmark_state
|
63
|
-
from sky.benchmark import benchmark_utils
|
64
59
|
from sky.client import sdk
|
65
60
|
from sky.data import storage_utils
|
66
61
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
@@ -236,6 +231,22 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
|
|
236
231
|
return ret[0], ret[1]
|
237
232
|
|
238
233
|
|
234
|
+
def _parse_secret_var(secret_var: str) -> Tuple[str, str]:
|
235
|
+
"""Parse secret vars into a (KEY, VAL) pair."""
|
236
|
+
if '=' not in secret_var:
|
237
|
+
value = os.environ.get(secret_var)
|
238
|
+
if value is None:
|
239
|
+
raise click.UsageError(
|
240
|
+
f'{secret_var} is not set in local environment.')
|
241
|
+
return (secret_var, value)
|
242
|
+
ret = tuple(secret_var.split('=', 1))
|
243
|
+
if len(ret) != 2:
|
244
|
+
raise click.UsageError(
|
245
|
+
f'Invalid secret var: {secret_var}. Must be in the form of KEY=VAL '
|
246
|
+
'or KEY.')
|
247
|
+
return ret[0], ret[1]
|
248
|
+
|
249
|
+
|
239
250
|
def _async_call_or_wait(request_id: str, async_call: bool,
|
240
251
|
request_name: str) -> Any:
|
241
252
|
short_request_id = request_id[:8]
|
@@ -461,6 +472,23 @@ _TASK_OPTIONS = [
|
|
461
472
|
|
462
473
|
3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
|
463
474
|
same value of ``$MY_ENV3`` in the local environment.""",
|
475
|
+
),
|
476
|
+
click.option(
|
477
|
+
'--secret',
|
478
|
+
required=False,
|
479
|
+
type=_parse_secret_var,
|
480
|
+
multiple=True,
|
481
|
+
help="""\
|
482
|
+
Secret variable to set on the remote node. These variables will be
|
483
|
+
redacted in logs and YAML outputs for security. It can be specified
|
484
|
+
multiple times. Examples:
|
485
|
+
|
486
|
+
\b
|
487
|
+
1. ``--secret API_KEY=secret123``: set ``$API_KEY`` on the cluster to
|
488
|
+
be secret123.
|
489
|
+
|
490
|
+
2. ``--secret JWT_SECRET``: set ``$JWT_SECRET`` on the cluster to be
|
491
|
+
the same value of ``$JWT_SECRET`` in the local environment.""",
|
464
492
|
)
|
465
493
|
]
|
466
494
|
_TASK_OPTIONS_WITH_NAME = [
|
@@ -873,6 +901,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
873
901
|
network_tier: Optional[str] = None,
|
874
902
|
ports: Optional[Tuple[str, ...]] = None,
|
875
903
|
env: Optional[List[Tuple[str, str]]] = None,
|
904
|
+
secret: Optional[List[Tuple[str, str]]] = None,
|
876
905
|
field_to_ignore: Optional[List[str]] = None,
|
877
906
|
# job launch specific
|
878
907
|
job_recovery: Optional[str] = None,
|
@@ -921,7 +950,9 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
921
950
|
if is_yaml:
|
922
951
|
assert entrypoint is not None
|
923
952
|
usage_lib.messages.usage.update_user_task_yaml(entrypoint)
|
924
|
-
dag = dag_utils.load_chain_dag_from_yaml(entrypoint,
|
953
|
+
dag = dag_utils.load_chain_dag_from_yaml(entrypoint,
|
954
|
+
env_overrides=env,
|
955
|
+
secret_overrides=secret)
|
925
956
|
if len(dag.tasks) > 1:
|
926
957
|
# When the dag has more than 1 task. It is unclear how to
|
927
958
|
# override the params for the dag. So we just ignore the
|
@@ -940,6 +971,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
940
971
|
task.set_resources({sky.Resources()})
|
941
972
|
# env update has been done for DAG in load_chain_dag_from_yaml for YAML.
|
942
973
|
task.update_envs(env)
|
974
|
+
task.update_secrets(secret)
|
943
975
|
|
944
976
|
# Override.
|
945
977
|
if workdir is not None:
|
@@ -1248,6 +1280,7 @@ def launch(
|
|
1248
1280
|
image_id: Optional[str],
|
1249
1281
|
env_file: Optional[Dict[str, str]],
|
1250
1282
|
env: List[Tuple[str, str]],
|
1283
|
+
secret: List[Tuple[str, str]],
|
1251
1284
|
disk_size: Optional[int],
|
1252
1285
|
disk_tier: Optional[str],
|
1253
1286
|
network_tier: Optional[str],
|
@@ -1302,6 +1335,7 @@ def launch(
|
|
1302
1335
|
use_spot=use_spot,
|
1303
1336
|
image_id=image_id,
|
1304
1337
|
env=env,
|
1338
|
+
secret=secret,
|
1305
1339
|
disk_size=disk_size,
|
1306
1340
|
disk_tier=disk_tier,
|
1307
1341
|
network_tier=network_tier,
|
@@ -1418,6 +1452,7 @@ def exec(cluster: Optional[str],
|
|
1418
1452
|
image_id: Optional[str],
|
1419
1453
|
env_file: Optional[Dict[str, str]],
|
1420
1454
|
env: List[Tuple[str, str]],
|
1455
|
+
secret: List[Tuple[str, str]],
|
1421
1456
|
cpus: Optional[str],
|
1422
1457
|
memory: Optional[str],
|
1423
1458
|
disk_size: Optional[int],
|
@@ -1516,6 +1551,7 @@ def exec(cluster: Optional[str],
|
|
1516
1551
|
image_id=image_id,
|
1517
1552
|
num_nodes=num_nodes,
|
1518
1553
|
env=env,
|
1554
|
+
secret=secret,
|
1519
1555
|
disk_size=disk_size,
|
1520
1556
|
disk_tier=disk_tier,
|
1521
1557
|
network_tier=network_tier,
|
@@ -4163,12 +4199,6 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
|
|
4163
4199
|
f'{colorama.Style.RESET_ALL}')
|
4164
4200
|
|
4165
4201
|
|
4166
|
-
@cli.group(cls=_NaturalOrderGroup, hidden=True)
|
4167
|
-
def bench():
|
4168
|
-
"""SkyPilot Benchmark CLI."""
|
4169
|
-
raise click.UsageError('The benchmark CLI is currently disabled.')
|
4170
|
-
|
4171
|
-
|
4172
4202
|
@cli.group(cls=_NaturalOrderGroup)
|
4173
4203
|
def jobs():
|
4174
4204
|
"""Managed Jobs CLI (jobs with auto-recovery)."""
|
@@ -4235,6 +4265,7 @@ def jobs_launch(
|
|
4235
4265
|
job_recovery: Optional[str],
|
4236
4266
|
env_file: Optional[Dict[str, str]],
|
4237
4267
|
env: List[Tuple[str, str]],
|
4268
|
+
secret: List[Tuple[str, str]],
|
4238
4269
|
disk_size: Optional[int],
|
4239
4270
|
disk_tier: Optional[str],
|
4240
4271
|
network_tier: Optional[str],
|
@@ -4282,6 +4313,7 @@ def jobs_launch(
|
|
4282
4313
|
use_spot=use_spot,
|
4283
4314
|
image_id=image_id,
|
4284
4315
|
env=env,
|
4316
|
+
secret=secret,
|
4285
4317
|
disk_size=disk_size,
|
4286
4318
|
disk_tier=disk_tier,
|
4287
4319
|
network_tier=network_tier,
|
@@ -4615,6 +4647,7 @@ def _generate_task_with_service(
|
|
4615
4647
|
image_id: Optional[str],
|
4616
4648
|
env_file: Optional[Dict[str, str]],
|
4617
4649
|
env: List[Tuple[str, str]],
|
4650
|
+
secret: Optional[List[Tuple[str, str]]],
|
4618
4651
|
gpus: Optional[str],
|
4619
4652
|
instance_type: Optional[str],
|
4620
4653
|
ports: Optional[Tuple[str]],
|
@@ -4647,6 +4680,7 @@ def _generate_task_with_service(
|
|
4647
4680
|
use_spot=use_spot,
|
4648
4681
|
image_id=image_id,
|
4649
4682
|
env=env,
|
4683
|
+
secret=secret,
|
4650
4684
|
disk_size=disk_size,
|
4651
4685
|
disk_tier=disk_tier,
|
4652
4686
|
network_tier=network_tier,
|
@@ -4756,6 +4790,7 @@ def serve_up(
|
|
4756
4790
|
image_id: Optional[str],
|
4757
4791
|
env_file: Optional[Dict[str, str]],
|
4758
4792
|
env: List[Tuple[str, str]],
|
4793
|
+
secret: List[Tuple[str, str]],
|
4759
4794
|
gpus: Optional[str],
|
4760
4795
|
instance_type: Optional[str],
|
4761
4796
|
ports: Tuple[str],
|
@@ -4816,6 +4851,7 @@ def serve_up(
|
|
4816
4851
|
image_id=image_id,
|
4817
4852
|
env_file=env_file,
|
4818
4853
|
env=env,
|
4854
|
+
secret=secret,
|
4819
4855
|
disk_size=disk_size,
|
4820
4856
|
disk_tier=disk_tier,
|
4821
4857
|
network_tier=network_tier,
|
@@ -4864,11 +4900,12 @@ def serve_up(
|
|
4864
4900
|
@timeline.event
|
4865
4901
|
@usage_lib.entrypoint
|
4866
4902
|
def serve_update(
|
4867
|
-
service_name: str, service_yaml: Tuple[str,
|
4868
|
-
|
4869
|
-
|
4870
|
-
|
4871
|
-
|
4903
|
+
service_name: str, service_yaml: Tuple[str,
|
4904
|
+
...], workdir: Optional[str],
|
4905
|
+
infra: Optional[str], cloud: Optional[str], region: Optional[str],
|
4906
|
+
zone: Optional[str], num_nodes: Optional[int], use_spot: Optional[bool],
|
4907
|
+
image_id: Optional[str], env_file: Optional[Dict[str, str]],
|
4908
|
+
env: List[Tuple[str, str]], secret: List[Tuple[str, str]],
|
4872
4909
|
gpus: Optional[str], instance_type: Optional[str], ports: Tuple[str],
|
4873
4910
|
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
|
4874
4911
|
disk_tier: Optional[str], network_tier: Optional[str], mode: str,
|
@@ -4920,6 +4957,7 @@ def serve_update(
|
|
4920
4957
|
image_id=image_id,
|
4921
4958
|
env_file=env_file,
|
4922
4959
|
env=env,
|
4960
|
+
secret=secret,
|
4923
4961
|
disk_size=disk_size,
|
4924
4962
|
disk_tier=disk_tier,
|
4925
4963
|
network_tier=network_tier,
|
@@ -5325,626 +5363,6 @@ def serve_logs(
|
|
5325
5363
|
raise
|
5326
5364
|
|
5327
5365
|
|
5328
|
-
# ==============================
|
5329
|
-
# Sky Benchmark CLIs
|
5330
|
-
# ==============================
|
5331
|
-
|
5332
|
-
|
5333
|
-
@ux_utils.print_exception_no_traceback()
|
5334
|
-
def _get_candidate_configs(
|
5335
|
-
entrypoint_yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
5336
|
-
"""Gets benchmark candidate configs from a YAML file.
|
5337
|
-
|
5338
|
-
Benchmark candidates are configured in the YAML file as a list of
|
5339
|
-
dictionaries. Each dictionary defines a candidate config
|
5340
|
-
by overriding resources. For example:
|
5341
|
-
|
5342
|
-
resources:
|
5343
|
-
cloud: aws
|
5344
|
-
candidates:
|
5345
|
-
- {accelerators: K80}
|
5346
|
-
- {instance_type: g4dn.2xlarge}
|
5347
|
-
- {cloud: gcp, accelerators: V100} # overrides cloud
|
5348
|
-
"""
|
5349
|
-
config = common_utils.read_yaml(os.path.expanduser(entrypoint_yaml_path))
|
5350
|
-
if not isinstance(config, dict):
|
5351
|
-
raise ValueError(f'Invalid YAML file: {entrypoint_yaml_path}. '
|
5352
|
-
'The YAML file should be parsed into a dictionary.')
|
5353
|
-
if config.get('resources') is None:
|
5354
|
-
return None
|
5355
|
-
|
5356
|
-
resources = config['resources']
|
5357
|
-
if not isinstance(resources, dict):
|
5358
|
-
raise ValueError(
|
5359
|
-
f'Invalid resources configuration in {entrypoint_yaml_path}. '
|
5360
|
-
'Resources must be a dictionary.')
|
5361
|
-
if resources.get('candidates') is None:
|
5362
|
-
return None
|
5363
|
-
|
5364
|
-
candidates = resources['candidates']
|
5365
|
-
if not isinstance(candidates, list):
|
5366
|
-
raise ValueError('Resource candidates must be a list of dictionaries.')
|
5367
|
-
for candidate in candidates:
|
5368
|
-
if not isinstance(candidate, dict):
|
5369
|
-
raise ValueError('Each resource candidate must be a dictionary.')
|
5370
|
-
return candidates
|
5371
|
-
|
5372
|
-
|
5373
|
-
@bench.command('launch', cls=_DocumentedCodeCommand)
|
5374
|
-
@config_option(expose_value=True)
|
5375
|
-
@click.argument('entrypoint',
|
5376
|
-
required=True,
|
5377
|
-
type=str,
|
5378
|
-
nargs=-1,
|
5379
|
-
**_get_shell_complete_args(_complete_file_name))
|
5380
|
-
@click.option('--benchmark',
|
5381
|
-
'-b',
|
5382
|
-
required=True,
|
5383
|
-
type=str,
|
5384
|
-
help='Benchmark name.')
|
5385
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
|
5386
|
-
@click.option('--gpus',
|
5387
|
-
required=False,
|
5388
|
-
type=str,
|
5389
|
-
help=('Comma-separated list of GPUs to run benchmark on. '
|
5390
|
-
'Example values: "T4:4,V100:8" (without blank spaces).'))
|
5391
|
-
@click.option(
|
5392
|
-
'--ports',
|
5393
|
-
required=False,
|
5394
|
-
type=str,
|
5395
|
-
multiple=True,
|
5396
|
-
help=('Ports to open on the cluster. '
|
5397
|
-
'If specified, overrides the "ports" config in the YAML. '),
|
5398
|
-
)
|
5399
|
-
@click.option(
|
5400
|
-
'--idle-minutes-to-autostop',
|
5401
|
-
'-i',
|
5402
|
-
default=None,
|
5403
|
-
type=int,
|
5404
|
-
required=False,
|
5405
|
-
help=('Automatically stop the cluster after this many minutes '
|
5406
|
-
'of idleness after setup/file_mounts. This is equivalent to '
|
5407
|
-
'running `sky launch -d ...` and then `sky autostop -i <minutes>`. '
|
5408
|
-
'If not set, the cluster will not be autostopped.'))
|
5409
|
-
# Disabling quote check here, as there seems to be a bug in pylint,
|
5410
|
-
# which incorrectly recognizes the help string as a docstring.
|
5411
|
-
# pylint: disable=bad-docstring-quotes
|
5412
|
-
@click.option('--yes',
|
5413
|
-
'-y',
|
5414
|
-
is_flag=True,
|
5415
|
-
default=False,
|
5416
|
-
required=False,
|
5417
|
-
help='Skip confirmation prompt.')
|
5418
|
-
@usage_lib.entrypoint
|
5419
|
-
def benchmark_launch(
|
5420
|
-
entrypoint: str,
|
5421
|
-
benchmark: str,
|
5422
|
-
name: Optional[str],
|
5423
|
-
workdir: Optional[str],
|
5424
|
-
infra: Optional[str],
|
5425
|
-
cloud: Optional[str],
|
5426
|
-
region: Optional[str],
|
5427
|
-
zone: Optional[str],
|
5428
|
-
gpus: Optional[str],
|
5429
|
-
num_nodes: Optional[int],
|
5430
|
-
use_spot: Optional[bool],
|
5431
|
-
image_id: Optional[str],
|
5432
|
-
env_file: Optional[Dict[str, str]],
|
5433
|
-
env: List[Tuple[str, str]],
|
5434
|
-
cpus: Optional[str],
|
5435
|
-
memory: Optional[str],
|
5436
|
-
disk_size: Optional[int],
|
5437
|
-
disk_tier: Optional[str],
|
5438
|
-
ports: Tuple[str],
|
5439
|
-
idle_minutes_to_autostop: Optional[int],
|
5440
|
-
yes: bool,
|
5441
|
-
async_call: bool, # pylint: disable=unused-argument
|
5442
|
-
config_override: Optional[Dict[str, Any]] = None,
|
5443
|
-
) -> None:
|
5444
|
-
"""Benchmark a task on different resources.
|
5445
|
-
|
5446
|
-
Example usage: `sky bench launch mytask.yaml -b mytask --gpus V100,T4`
|
5447
|
-
will benchmark your task on a V100 cluster and a T4 cluster simultaneously.
|
5448
|
-
Alternatively, specify the benchmarking resources in your YAML (see doc),
|
5449
|
-
which allows benchmarking on many more resource fields.
|
5450
|
-
"""
|
5451
|
-
# TODO(zhwu): move benchmark to SkyPilot API server
|
5452
|
-
env = _merge_env_vars(env_file, env)
|
5453
|
-
record = benchmark_state.get_benchmark_from_name(benchmark)
|
5454
|
-
if record is not None:
|
5455
|
-
raise click.BadParameter(f'Benchmark {benchmark} already exists. '
|
5456
|
-
'To delete the previous benchmark result, '
|
5457
|
-
f'run `sky bench delete {benchmark}`.')
|
5458
|
-
entrypoint = ' '.join(entrypoint)
|
5459
|
-
if not entrypoint:
|
5460
|
-
raise click.BadParameter('Please specify a task yaml to benchmark.')
|
5461
|
-
|
5462
|
-
is_yaml, config = _check_yaml(entrypoint)
|
5463
|
-
if not is_yaml:
|
5464
|
-
raise click.BadParameter(
|
5465
|
-
'Sky Benchmark does not support command line tasks. '
|
5466
|
-
'Please provide a YAML file.')
|
5467
|
-
assert config is not None, (is_yaml, config)
|
5468
|
-
cloud, region, zone = _handle_infra_cloud_region_zone_options(
|
5469
|
-
infra, cloud, region, zone)
|
5470
|
-
|
5471
|
-
click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
|
5472
|
-
click.secho(entrypoint, bold=True)
|
5473
|
-
|
5474
|
-
candidates = _get_candidate_configs(entrypoint)
|
5475
|
-
# Check if the candidate configs are specified in both CLI and YAML.
|
5476
|
-
if candidates is not None:
|
5477
|
-
message = ('is specified in both CLI and resources.candidates '
|
5478
|
-
'in the YAML. Please specify only one of them.')
|
5479
|
-
if cloud is not None:
|
5480
|
-
if any('cloud' in candidate for candidate in candidates):
|
5481
|
-
raise click.BadParameter(f'cloud {message}')
|
5482
|
-
if region is not None:
|
5483
|
-
if any('region' in candidate for candidate in candidates):
|
5484
|
-
raise click.BadParameter(f'region {message}')
|
5485
|
-
if zone is not None:
|
5486
|
-
if any('zone' in candidate for candidate in candidates):
|
5487
|
-
raise click.BadParameter(f'zone {message}')
|
5488
|
-
if gpus is not None:
|
5489
|
-
if any('accelerators' in candidate for candidate in candidates):
|
5490
|
-
raise click.BadParameter(f'gpus (accelerators) {message}')
|
5491
|
-
if use_spot is not None:
|
5492
|
-
if any('use_spot' in candidate for candidate in candidates):
|
5493
|
-
raise click.BadParameter(f'use_spot {message}')
|
5494
|
-
if image_id is not None:
|
5495
|
-
if any('image_id' in candidate for candidate in candidates):
|
5496
|
-
raise click.BadParameter(f'image_id {message}')
|
5497
|
-
if disk_size is not None:
|
5498
|
-
if any('disk_size' in candidate for candidate in candidates):
|
5499
|
-
raise click.BadParameter(f'disk_size {message}')
|
5500
|
-
if disk_tier is not None:
|
5501
|
-
if any('disk_tier' in candidate for candidate in candidates):
|
5502
|
-
raise click.BadParameter(f'disk_tier {message}')
|
5503
|
-
if ports:
|
5504
|
-
if any('ports' in candidate for candidate in candidates):
|
5505
|
-
raise click.BadParameter(f'ports {message}')
|
5506
|
-
|
5507
|
-
# The user can specify the benchmark candidates in either of the two ways:
|
5508
|
-
# 1. By specifying resources.candidates in the YAML.
|
5509
|
-
# 2. By specifying gpu types as a command line argument (--gpus).
|
5510
|
-
override_gpu = None
|
5511
|
-
if gpus is not None:
|
5512
|
-
gpu_list = gpus.split(',')
|
5513
|
-
gpu_list = [gpu.strip() for gpu in gpu_list]
|
5514
|
-
if ' ' in gpus:
|
5515
|
-
raise click.BadParameter('Remove blanks in --gpus.')
|
5516
|
-
|
5517
|
-
if len(gpu_list) == 1:
|
5518
|
-
override_gpu = gpu_list[0]
|
5519
|
-
else:
|
5520
|
-
# If len(gpu_list) > 1, gpus is interpreted
|
5521
|
-
# as a list of benchmark candidates.
|
5522
|
-
if candidates is None:
|
5523
|
-
candidates = [{'accelerators': gpu} for gpu in gpu_list]
|
5524
|
-
override_gpu = None
|
5525
|
-
else:
|
5526
|
-
raise ValueError('Provide benchmark candidates in either '
|
5527
|
-
'--gpus or resources.candidates in the YAML.')
|
5528
|
-
if candidates is None:
|
5529
|
-
candidates = [{}]
|
5530
|
-
|
5531
|
-
if 'resources' not in config:
|
5532
|
-
config['resources'] = {}
|
5533
|
-
resources_config = config['resources']
|
5534
|
-
|
5535
|
-
# Override the yaml config with the command line arguments.
|
5536
|
-
if name is not None:
|
5537
|
-
config['name'] = name
|
5538
|
-
if workdir is not None:
|
5539
|
-
config['workdir'] = workdir
|
5540
|
-
if num_nodes is not None:
|
5541
|
-
config['num_nodes'] = num_nodes
|
5542
|
-
override_params = _parse_override_params(cloud=cloud,
|
5543
|
-
region=region,
|
5544
|
-
zone=zone,
|
5545
|
-
gpus=override_gpu,
|
5546
|
-
cpus=cpus,
|
5547
|
-
memory=memory,
|
5548
|
-
use_spot=use_spot,
|
5549
|
-
image_id=image_id,
|
5550
|
-
disk_size=disk_size,
|
5551
|
-
disk_tier=disk_tier,
|
5552
|
-
ports=ports,
|
5553
|
-
config_override=config_override)
|
5554
|
-
_pop_and_ignore_fields_in_override_params(
|
5555
|
-
override_params, field_to_ignore=['cpus', 'memory'])
|
5556
|
-
resources_config.update(override_params)
|
5557
|
-
if 'cloud' in resources_config:
|
5558
|
-
cloud = resources_config.pop('cloud')
|
5559
|
-
if cloud is not None:
|
5560
|
-
resources_config['cloud'] = str(cloud)
|
5561
|
-
if 'region' in resources_config:
|
5562
|
-
if resources_config['region'] is None:
|
5563
|
-
resources_config.pop('region')
|
5564
|
-
if 'zone' in resources_config:
|
5565
|
-
if resources_config['zone'] is None:
|
5566
|
-
resources_config.pop('zone')
|
5567
|
-
if 'accelerators' in resources_config:
|
5568
|
-
if resources_config['accelerators'] is None:
|
5569
|
-
resources_config.pop('accelerators')
|
5570
|
-
if 'image_id' in resources_config:
|
5571
|
-
if resources_config['image_id'] is None:
|
5572
|
-
resources_config.pop('image_id')
|
5573
|
-
|
5574
|
-
# Fully generate the benchmark candidate configs.
|
5575
|
-
clusters, candidate_configs = benchmark_utils.generate_benchmark_configs(
|
5576
|
-
benchmark, config, candidates)
|
5577
|
-
# Show the benchmarking VM instances selected by the optimizer.
|
5578
|
-
# This also detects the case where the user requested infeasible resources.
|
5579
|
-
benchmark_utils.print_benchmark_clusters(benchmark, clusters, config,
|
5580
|
-
candidate_configs)
|
5581
|
-
if not yes:
|
5582
|
-
plural = 's' if len(candidates) > 1 else ''
|
5583
|
-
prompt = f'Launching {len(candidates)} cluster{plural}. Proceed?'
|
5584
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
5585
|
-
|
5586
|
-
# Configs that are only accepted by the CLI.
|
5587
|
-
commandline_args: Dict[str, Any] = {}
|
5588
|
-
# Set the default idle minutes to autostop as 5, mimicking
|
5589
|
-
# the serverless execution.
|
5590
|
-
if idle_minutes_to_autostop is None:
|
5591
|
-
idle_minutes_to_autostop = 5
|
5592
|
-
commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop
|
5593
|
-
if env:
|
5594
|
-
commandline_args['env'] = [f'{k}={v}' for k, v in env]
|
5595
|
-
|
5596
|
-
# Launch the benchmarking clusters in detach mode in parallel.
|
5597
|
-
benchmark_created = benchmark_utils.launch_benchmark_clusters(
|
5598
|
-
benchmark, clusters, candidate_configs, commandline_args)
|
5599
|
-
|
5600
|
-
# If at least one cluster is created, print the following messages.
|
5601
|
-
if benchmark_created:
|
5602
|
-
logger.info(
|
5603
|
-
f'\n{colorama.Fore.CYAN}Benchmark name: '
|
5604
|
-
f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
|
5605
|
-
'\nTo see the benchmark results: '
|
5606
|
-
f'{ux_utils.BOLD}sky bench show '
|
5607
|
-
f'{benchmark}{ux_utils.RESET_BOLD}'
|
5608
|
-
'\nTo teardown the clusters: '
|
5609
|
-
f'{ux_utils.BOLD}sky bench down '
|
5610
|
-
f'{benchmark}{ux_utils.RESET_BOLD}')
|
5611
|
-
subprocess_utils.run('sky bench ls')
|
5612
|
-
else:
|
5613
|
-
logger.error('No benchmarking clusters are created.')
|
5614
|
-
subprocess_utils.run('sky status')
|
5615
|
-
|
5616
|
-
|
5617
|
-
@bench.command('ls', cls=_DocumentedCodeCommand)
|
5618
|
-
@config_option(expose_value=False)
|
5619
|
-
@usage_lib.entrypoint
|
5620
|
-
def benchmark_ls() -> None:
|
5621
|
-
"""List the benchmark history."""
|
5622
|
-
benchmarks = benchmark_state.get_benchmarks()
|
5623
|
-
columns = [
|
5624
|
-
'BENCHMARK',
|
5625
|
-
'TASK',
|
5626
|
-
'LAUNCHED',
|
5627
|
-
]
|
5628
|
-
|
5629
|
-
max_num_candidates = 1
|
5630
|
-
for benchmark in benchmarks:
|
5631
|
-
benchmark_results = benchmark_state.get_benchmark_results(
|
5632
|
-
benchmark['name'])
|
5633
|
-
num_candidates = len(benchmark_results)
|
5634
|
-
if num_candidates > max_num_candidates:
|
5635
|
-
max_num_candidates = num_candidates
|
5636
|
-
|
5637
|
-
if max_num_candidates == 1:
|
5638
|
-
columns += ['CANDIDATE']
|
5639
|
-
else:
|
5640
|
-
columns += [f'CANDIDATE {i}' for i in range(1, max_num_candidates + 1)]
|
5641
|
-
benchmark_table = log_utils.create_table(columns)
|
5642
|
-
|
5643
|
-
for benchmark in benchmarks:
|
5644
|
-
if benchmark['task'] is not None:
|
5645
|
-
task = benchmark['task']
|
5646
|
-
else:
|
5647
|
-
task = '-'
|
5648
|
-
row = [
|
5649
|
-
# BENCHMARK
|
5650
|
-
benchmark['name'],
|
5651
|
-
# TASK
|
5652
|
-
task,
|
5653
|
-
# LAUNCHED
|
5654
|
-
datetime.datetime.fromtimestamp(benchmark['launched_at']),
|
5655
|
-
]
|
5656
|
-
|
5657
|
-
benchmark_results = benchmark_state.get_benchmark_results(
|
5658
|
-
benchmark['name'])
|
5659
|
-
# RESOURCES
|
5660
|
-
for b in benchmark_results:
|
5661
|
-
num_nodes = b['num_nodes']
|
5662
|
-
resources = b['resources']
|
5663
|
-
postfix_spot = '[Spot]' if resources.use_spot else ''
|
5664
|
-
instance_type = resources.instance_type + postfix_spot
|
5665
|
-
if resources.accelerators is None:
|
5666
|
-
accelerators = ''
|
5667
|
-
else:
|
5668
|
-
accelerator, count = list(resources.accelerators.items())[0]
|
5669
|
-
accelerators = f' ({accelerator}:{count})'
|
5670
|
-
# For brevity, skip the cloud names.
|
5671
|
-
resources_str = f'{num_nodes}x {instance_type}{accelerators}'
|
5672
|
-
row.append(resources_str)
|
5673
|
-
row += [''] * (max_num_candidates - len(benchmark_results))
|
5674
|
-
benchmark_table.add_row(row)
|
5675
|
-
if benchmarks:
|
5676
|
-
click.echo(benchmark_table)
|
5677
|
-
else:
|
5678
|
-
click.echo('No benchmark history found.')
|
5679
|
-
|
5680
|
-
|
5681
|
-
@bench.command('show', cls=_DocumentedCodeCommand)
|
5682
|
-
@config_option(expose_value=False)
|
5683
|
-
@click.argument('benchmark', required=True, type=str)
|
5684
|
-
# TODO(woosuk): Add --all option to show all the collected information
|
5685
|
-
# (e.g., setup time, warmup steps, total steps, etc.).
|
5686
|
-
@usage_lib.entrypoint
|
5687
|
-
def benchmark_show(benchmark: str) -> None:
|
5688
|
-
"""Show a benchmark report."""
|
5689
|
-
record = benchmark_state.get_benchmark_from_name(benchmark)
|
5690
|
-
if record is None:
|
5691
|
-
raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
|
5692
|
-
benchmark_utils.update_benchmark_state(benchmark)
|
5693
|
-
|
5694
|
-
click.echo(
|
5695
|
-
textwrap.dedent("""\
|
5696
|
-
Legend:
|
5697
|
-
- #STEPS: Number of steps taken.
|
5698
|
-
- SEC/STEP, $/STEP: Average time (cost) per step.
|
5699
|
-
- EST(hr), EST($): Estimated total time (cost) to complete the benchmark.
|
5700
|
-
"""))
|
5701
|
-
columns = [
|
5702
|
-
'CLUSTER',
|
5703
|
-
'RESOURCES',
|
5704
|
-
'STATUS',
|
5705
|
-
'DURATION',
|
5706
|
-
'SPENT($)',
|
5707
|
-
'#STEPS',
|
5708
|
-
'SEC/STEP',
|
5709
|
-
'$/STEP',
|
5710
|
-
'EST(hr)',
|
5711
|
-
'EST($)',
|
5712
|
-
]
|
5713
|
-
|
5714
|
-
cluster_table = log_utils.create_table(columns)
|
5715
|
-
rows = []
|
5716
|
-
benchmark_results = benchmark_state.get_benchmark_results(benchmark)
|
5717
|
-
for result in benchmark_results:
|
5718
|
-
num_nodes = result['num_nodes']
|
5719
|
-
resources = result['resources']
|
5720
|
-
row = [
|
5721
|
-
# CLUSTER
|
5722
|
-
result['cluster'],
|
5723
|
-
# RESOURCES
|
5724
|
-
f'{num_nodes}x {resources}',
|
5725
|
-
# STATUS
|
5726
|
-
result['status'].value,
|
5727
|
-
]
|
5728
|
-
|
5729
|
-
record = result['record']
|
5730
|
-
if (record is None or record.start_time is None or
|
5731
|
-
record.last_time is None):
|
5732
|
-
row += ['-'] * (len(columns) - len(row))
|
5733
|
-
rows.append(row)
|
5734
|
-
continue
|
5735
|
-
|
5736
|
-
duration_str = log_utils.readable_time_duration(record.start_time,
|
5737
|
-
record.last_time,
|
5738
|
-
absolute=True)
|
5739
|
-
duration = record.last_time - record.start_time
|
5740
|
-
spent = num_nodes * resources.get_cost(duration)
|
5741
|
-
spent_str = f'{spent:.4f}'
|
5742
|
-
|
5743
|
-
num_steps = record.num_steps_so_far
|
5744
|
-
if num_steps is None:
|
5745
|
-
num_steps = '-'
|
5746
|
-
|
5747
|
-
seconds_per_step = record.seconds_per_step
|
5748
|
-
if seconds_per_step is None:
|
5749
|
-
seconds_per_step_str = '-'
|
5750
|
-
cost_per_step_str = '-'
|
5751
|
-
else:
|
5752
|
-
seconds_per_step_str = f'{seconds_per_step:.4f}'
|
5753
|
-
cost_per_step = num_nodes * resources.get_cost(seconds_per_step)
|
5754
|
-
cost_per_step_str = f'{cost_per_step:.6f}'
|
5755
|
-
|
5756
|
-
total_time = record.estimated_total_seconds
|
5757
|
-
if total_time is None:
|
5758
|
-
total_time_str = '-'
|
5759
|
-
total_cost_str = '-'
|
5760
|
-
else:
|
5761
|
-
total_time_str = f'{total_time / 3600:.2f}'
|
5762
|
-
total_cost = num_nodes * resources.get_cost(total_time)
|
5763
|
-
total_cost_str = f'{total_cost:.2f}'
|
5764
|
-
|
5765
|
-
row += [
|
5766
|
-
# DURATION
|
5767
|
-
duration_str,
|
5768
|
-
# SPENT($)
|
5769
|
-
spent_str,
|
5770
|
-
# STEPS
|
5771
|
-
num_steps,
|
5772
|
-
# SEC/STEP
|
5773
|
-
seconds_per_step_str,
|
5774
|
-
# $/STEP
|
5775
|
-
cost_per_step_str,
|
5776
|
-
# EST(hr)
|
5777
|
-
total_time_str,
|
5778
|
-
# EST($)
|
5779
|
-
total_cost_str,
|
5780
|
-
]
|
5781
|
-
rows.append(row)
|
5782
|
-
|
5783
|
-
cluster_table.add_rows(rows)
|
5784
|
-
click.echo(cluster_table)
|
5785
|
-
|
5786
|
-
finished = [
|
5787
|
-
row for row in rows
|
5788
|
-
if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value
|
5789
|
-
]
|
5790
|
-
if any(row[5] == '-' for row in finished):
|
5791
|
-
# No #STEPS. SkyCallback was unused.
|
5792
|
-
click.secho(
|
5793
|
-
'SkyCallback logs are not found in this benchmark. '
|
5794
|
-
'Consider using SkyCallback to get more detailed information '
|
5795
|
-
'in real time.',
|
5796
|
-
fg='yellow')
|
5797
|
-
elif any(row[6] != '-' and row[-1] == '-' for row in rows):
|
5798
|
-
# No EST($). total_steps is not specified and cannot be inferred.
|
5799
|
-
click.secho(
|
5800
|
-
'Cannot estimate total time and cost because '
|
5801
|
-
'the total number of steps cannot be inferred by SkyCallback. '
|
5802
|
-
'To get the estimation, specify the total number of steps in '
|
5803
|
-
'either `sky_callback.init` or `Sky*Callback`.',
|
5804
|
-
fg='yellow')
|
5805
|
-
|
5806
|
-
|
5807
|
-
@bench.command('down', cls=_DocumentedCodeCommand)
|
5808
|
-
@config_option(expose_value=False)
|
5809
|
-
@click.argument('benchmark', required=True, type=str)
|
5810
|
-
@click.option(
|
5811
|
-
'--exclude',
|
5812
|
-
'-e',
|
5813
|
-
'clusters_to_exclude',
|
5814
|
-
required=False,
|
5815
|
-
type=str,
|
5816
|
-
multiple=True,
|
5817
|
-
help=('Cluster name(s) to exclude from termination. '
|
5818
|
-
'Typically, you might want to see the benchmark results in '
|
5819
|
-
'`sky bench show` and exclude a "winner" cluster from termination '
|
5820
|
-
'to finish the running task.'))
|
5821
|
-
@click.option('--yes',
|
5822
|
-
'-y',
|
5823
|
-
is_flag=True,
|
5824
|
-
default=False,
|
5825
|
-
required=False,
|
5826
|
-
help='Skip confirmation prompt.')
|
5827
|
-
@usage_lib.entrypoint
|
5828
|
-
def benchmark_down(
|
5829
|
-
benchmark: str,
|
5830
|
-
clusters_to_exclude: List[str],
|
5831
|
-
yes: bool,
|
5832
|
-
) -> None:
|
5833
|
-
"""Tear down all clusters belonging to a benchmark."""
|
5834
|
-
record = benchmark_state.get_benchmark_from_name(benchmark)
|
5835
|
-
if record is None:
|
5836
|
-
raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
|
5837
|
-
|
5838
|
-
clusters = benchmark_state.get_benchmark_clusters(benchmark)
|
5839
|
-
to_stop: List[str] = []
|
5840
|
-
for cluster in clusters:
|
5841
|
-
if cluster in clusters_to_exclude:
|
5842
|
-
continue
|
5843
|
-
if global_user_state.get_cluster_from_name(cluster) is None:
|
5844
|
-
continue
|
5845
|
-
to_stop.append(cluster)
|
5846
|
-
|
5847
|
-
_down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
|
5848
|
-
|
5849
|
-
|
5850
|
-
@bench.command('delete', cls=_DocumentedCodeCommand)
|
5851
|
-
@config_option(expose_value=False)
|
5852
|
-
@click.argument('benchmarks', required=False, type=str, nargs=-1)
|
5853
|
-
@click.option('--all',
|
5854
|
-
'-a',
|
5855
|
-
default=None,
|
5856
|
-
is_flag=True,
|
5857
|
-
help='Delete all benchmark reports from the history.')
|
5858
|
-
@click.option('--yes',
|
5859
|
-
'-y',
|
5860
|
-
is_flag=True,
|
5861
|
-
default=False,
|
5862
|
-
required=False,
|
5863
|
-
help='Skip confirmation prompt.')
|
5864
|
-
@usage_lib.entrypoint
|
5865
|
-
# pylint: disable=redefined-builtin
|
5866
|
-
def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
|
5867
|
-
yes: bool) -> None:
|
5868
|
-
"""Delete benchmark reports from the history."""
|
5869
|
-
if not benchmarks and all is None:
|
5870
|
-
raise click.BadParameter(
|
5871
|
-
'Either specify benchmarks or use --all to delete all benchmarks.')
|
5872
|
-
to_delete = []
|
5873
|
-
if benchmarks:
|
5874
|
-
for benchmark in benchmarks:
|
5875
|
-
record = benchmark_state.get_benchmark_from_name(benchmark)
|
5876
|
-
if record is None:
|
5877
|
-
print(f'Benchmark {benchmark} not found.')
|
5878
|
-
else:
|
5879
|
-
to_delete.append(record)
|
5880
|
-
if all:
|
5881
|
-
to_delete = benchmark_state.get_benchmarks()
|
5882
|
-
if benchmarks:
|
5883
|
-
print('Both --all and benchmark(s) specified '
|
5884
|
-
'for sky bench delete. Letting --all take effect.')
|
5885
|
-
|
5886
|
-
to_delete = [r['name'] for r in to_delete]
|
5887
|
-
if not to_delete:
|
5888
|
-
return
|
5889
|
-
|
5890
|
-
benchmark_list = ', '.join(to_delete)
|
5891
|
-
plural = 's' if len(to_delete) > 1 else ''
|
5892
|
-
if not yes:
|
5893
|
-
click.confirm(
|
5894
|
-
f'Deleting the benchmark{plural}: {benchmark_list}. Proceed?',
|
5895
|
-
default=True,
|
5896
|
-
abort=True,
|
5897
|
-
show_default=True)
|
5898
|
-
|
5899
|
-
progress = rich_progress.Progress(transient=True,
|
5900
|
-
redirect_stdout=False,
|
5901
|
-
redirect_stderr=False)
|
5902
|
-
task = progress.add_task(
|
5903
|
-
f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
|
5904
|
-
total=len(to_delete))
|
5905
|
-
|
5906
|
-
def _delete_benchmark(benchmark: str) -> None:
|
5907
|
-
clusters = benchmark_state.get_benchmark_clusters(benchmark)
|
5908
|
-
records = []
|
5909
|
-
for cluster in clusters:
|
5910
|
-
record = global_user_state.get_cluster_from_name(cluster)
|
5911
|
-
records.append(record)
|
5912
|
-
num_clusters = len([r for r in records if r is not None])
|
5913
|
-
|
5914
|
-
if num_clusters > 0:
|
5915
|
-
plural = 's' if num_clusters > 1 else ''
|
5916
|
-
message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
|
5917
|
-
f'has {num_clusters} un-terminated cluster{plural}. '
|
5918
|
-
f'Terminate the cluster{plural} with '
|
5919
|
-
f'{ux_utils.BOLD} sky bench down {benchmark} '
|
5920
|
-
f'{ux_utils.RESET_BOLD} '
|
5921
|
-
'before deleting the benchmark report.')
|
5922
|
-
success = False
|
5923
|
-
else:
|
5924
|
-
bucket_name = benchmark_state.get_benchmark_from_name(
|
5925
|
-
benchmark)['bucket']
|
5926
|
-
handle = global_user_state.get_handle_from_storage_name(bucket_name)
|
5927
|
-
assert handle is not None, bucket_name
|
5928
|
-
bucket_type = list(handle.sky_stores.keys())[0]
|
5929
|
-
benchmark_utils.remove_benchmark_logs(benchmark, bucket_name,
|
5930
|
-
bucket_type)
|
5931
|
-
benchmark_state.delete_benchmark(benchmark)
|
5932
|
-
message = (f'{colorama.Fore.GREEN}Benchmark report for '
|
5933
|
-
f'{benchmark} deleted.{colorama.Style.RESET_ALL}')
|
5934
|
-
success = True
|
5935
|
-
|
5936
|
-
progress.stop()
|
5937
|
-
click.secho(message)
|
5938
|
-
if success:
|
5939
|
-
progress.update(task, advance=1)
|
5940
|
-
progress.start()
|
5941
|
-
|
5942
|
-
with progress:
|
5943
|
-
subprocess_utils.run_in_parallel(_delete_benchmark, to_delete)
|
5944
|
-
progress.live.transient = False
|
5945
|
-
progress.refresh()
|
5946
|
-
|
5947
|
-
|
5948
5366
|
@cli.group(cls=_NaturalOrderGroup, hidden=True)
|
5949
5367
|
def local():
|
5950
5368
|
"""SkyPilot local tools CLI."""
|
@@ -6080,10 +5498,19 @@ def api():
|
|
6080
5498
|
'to manage the process lifecycle and collect logs directly. '
|
6081
5499
|
'This is useful when the API server is managed by systems '
|
6082
5500
|
'like systemd and Kubernetes.')
|
5501
|
+
@click.option('--enable-basic-auth',
|
5502
|
+
is_flag=True,
|
5503
|
+
default=False,
|
5504
|
+
required=False,
|
5505
|
+
help='Enable basic authentication in the SkyPilot API server.')
|
6083
5506
|
@usage_lib.entrypoint
|
6084
|
-
def api_start(deploy: bool, host: Optional[str], foreground: bool
|
5507
|
+
def api_start(deploy: bool, host: Optional[str], foreground: bool,
|
5508
|
+
enable_basic_auth: bool):
|
6085
5509
|
"""Starts the SkyPilot API server locally."""
|
6086
|
-
sdk.api_start(deploy=deploy,
|
5510
|
+
sdk.api_start(deploy=deploy,
|
5511
|
+
host=host,
|
5512
|
+
foreground=foreground,
|
5513
|
+
enable_basic_auth=enable_basic_auth)
|
6087
5514
|
|
6088
5515
|
|
6089
5516
|
@api.command('stop', cls=_DocumentedCodeCommand)
|