skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/client/cli.py CHANGED
@@ -25,7 +25,6 @@ each other.
25
25
  """
26
26
  import collections
27
27
  import copy
28
- import datetime
29
28
  import fnmatch
30
29
  import functools
31
30
  import os
@@ -34,7 +33,6 @@ import shlex
34
33
  import shutil
35
34
  import subprocess
36
35
  import sys
37
- import textwrap
38
36
  import traceback
39
37
  import typing
40
38
  from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
@@ -52,15 +50,12 @@ from sky import backends
52
50
  from sky import catalog
53
51
  from sky import clouds
54
52
  from sky import exceptions
55
- from sky import global_user_state
56
53
  from sky import jobs as managed_jobs
57
54
  from sky import models
58
55
  from sky import serve as serve_lib
59
56
  from sky import sky_logging
60
57
  from sky import skypilot_config
61
58
  from sky.adaptors import common as adaptors_common
62
- from sky.benchmark import benchmark_state
63
- from sky.benchmark import benchmark_utils
64
59
  from sky.client import sdk
65
60
  from sky.data import storage_utils
66
61
  from sky.provision.kubernetes import constants as kubernetes_constants
@@ -236,6 +231,22 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
236
231
  return ret[0], ret[1]
237
232
 
238
233
 
234
+ def _parse_secret_var(secret_var: str) -> Tuple[str, str]:
235
+ """Parse secret vars into a (KEY, VAL) pair."""
236
+ if '=' not in secret_var:
237
+ value = os.environ.get(secret_var)
238
+ if value is None:
239
+ raise click.UsageError(
240
+ f'{secret_var} is not set in local environment.')
241
+ return (secret_var, value)
242
+ ret = tuple(secret_var.split('=', 1))
243
+ if len(ret) != 2:
244
+ raise click.UsageError(
245
+ f'Invalid secret var: {secret_var}. Must be in the form of KEY=VAL '
246
+ 'or KEY.')
247
+ return ret[0], ret[1]
248
+
249
+
239
250
  def _async_call_or_wait(request_id: str, async_call: bool,
240
251
  request_name: str) -> Any:
241
252
  short_request_id = request_id[:8]
@@ -461,6 +472,23 @@ _TASK_OPTIONS = [
461
472
 
462
473
  3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
463
474
  same value of ``$MY_ENV3`` in the local environment.""",
475
+ ),
476
+ click.option(
477
+ '--secret',
478
+ required=False,
479
+ type=_parse_secret_var,
480
+ multiple=True,
481
+ help="""\
482
+ Secret variable to set on the remote node. These variables will be
483
+ redacted in logs and YAML outputs for security. It can be specified
484
+ multiple times. Examples:
485
+
486
+ \b
487
+ 1. ``--secret API_KEY=secret123``: set ``$API_KEY`` on the cluster to
488
+ be secret123.
489
+
490
+ 2. ``--secret JWT_SECRET``: set ``$JWT_SECRET`` on the cluster to be
491
+ the same value of ``$JWT_SECRET`` in the local environment.""",
464
492
  )
465
493
  ]
466
494
  _TASK_OPTIONS_WITH_NAME = [
@@ -873,6 +901,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
873
901
  network_tier: Optional[str] = None,
874
902
  ports: Optional[Tuple[str, ...]] = None,
875
903
  env: Optional[List[Tuple[str, str]]] = None,
904
+ secret: Optional[List[Tuple[str, str]]] = None,
876
905
  field_to_ignore: Optional[List[str]] = None,
877
906
  # job launch specific
878
907
  job_recovery: Optional[str] = None,
@@ -921,7 +950,9 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
921
950
  if is_yaml:
922
951
  assert entrypoint is not None
923
952
  usage_lib.messages.usage.update_user_task_yaml(entrypoint)
924
- dag = dag_utils.load_chain_dag_from_yaml(entrypoint, env_overrides=env)
953
+ dag = dag_utils.load_chain_dag_from_yaml(entrypoint,
954
+ env_overrides=env,
955
+ secret_overrides=secret)
925
956
  if len(dag.tasks) > 1:
926
957
  # When the dag has more than 1 task. It is unclear how to
927
958
  # override the params for the dag. So we just ignore the
@@ -940,6 +971,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
940
971
  task.set_resources({sky.Resources()})
941
972
  # env update has been done for DAG in load_chain_dag_from_yaml for YAML.
942
973
  task.update_envs(env)
974
+ task.update_secrets(secret)
943
975
 
944
976
  # Override.
945
977
  if workdir is not None:
@@ -1248,6 +1280,7 @@ def launch(
1248
1280
  image_id: Optional[str],
1249
1281
  env_file: Optional[Dict[str, str]],
1250
1282
  env: List[Tuple[str, str]],
1283
+ secret: List[Tuple[str, str]],
1251
1284
  disk_size: Optional[int],
1252
1285
  disk_tier: Optional[str],
1253
1286
  network_tier: Optional[str],
@@ -1302,6 +1335,7 @@ def launch(
1302
1335
  use_spot=use_spot,
1303
1336
  image_id=image_id,
1304
1337
  env=env,
1338
+ secret=secret,
1305
1339
  disk_size=disk_size,
1306
1340
  disk_tier=disk_tier,
1307
1341
  network_tier=network_tier,
@@ -1418,6 +1452,7 @@ def exec(cluster: Optional[str],
1418
1452
  image_id: Optional[str],
1419
1453
  env_file: Optional[Dict[str, str]],
1420
1454
  env: List[Tuple[str, str]],
1455
+ secret: List[Tuple[str, str]],
1421
1456
  cpus: Optional[str],
1422
1457
  memory: Optional[str],
1423
1458
  disk_size: Optional[int],
@@ -1516,6 +1551,7 @@ def exec(cluster: Optional[str],
1516
1551
  image_id=image_id,
1517
1552
  num_nodes=num_nodes,
1518
1553
  env=env,
1554
+ secret=secret,
1519
1555
  disk_size=disk_size,
1520
1556
  disk_tier=disk_tier,
1521
1557
  network_tier=network_tier,
@@ -4163,12 +4199,6 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
4163
4199
  f'{colorama.Style.RESET_ALL}')
4164
4200
 
4165
4201
 
4166
- @cli.group(cls=_NaturalOrderGroup, hidden=True)
4167
- def bench():
4168
- """SkyPilot Benchmark CLI."""
4169
- raise click.UsageError('The benchmark CLI is currently disabled.')
4170
-
4171
-
4172
4202
  @cli.group(cls=_NaturalOrderGroup)
4173
4203
  def jobs():
4174
4204
  """Managed Jobs CLI (jobs with auto-recovery)."""
@@ -4235,6 +4265,7 @@ def jobs_launch(
4235
4265
  job_recovery: Optional[str],
4236
4266
  env_file: Optional[Dict[str, str]],
4237
4267
  env: List[Tuple[str, str]],
4268
+ secret: List[Tuple[str, str]],
4238
4269
  disk_size: Optional[int],
4239
4270
  disk_tier: Optional[str],
4240
4271
  network_tier: Optional[str],
@@ -4282,6 +4313,7 @@ def jobs_launch(
4282
4313
  use_spot=use_spot,
4283
4314
  image_id=image_id,
4284
4315
  env=env,
4316
+ secret=secret,
4285
4317
  disk_size=disk_size,
4286
4318
  disk_tier=disk_tier,
4287
4319
  network_tier=network_tier,
@@ -4615,6 +4647,7 @@ def _generate_task_with_service(
4615
4647
  image_id: Optional[str],
4616
4648
  env_file: Optional[Dict[str, str]],
4617
4649
  env: List[Tuple[str, str]],
4650
+ secret: Optional[List[Tuple[str, str]]],
4618
4651
  gpus: Optional[str],
4619
4652
  instance_type: Optional[str],
4620
4653
  ports: Optional[Tuple[str]],
@@ -4647,6 +4680,7 @@ def _generate_task_with_service(
4647
4680
  use_spot=use_spot,
4648
4681
  image_id=image_id,
4649
4682
  env=env,
4683
+ secret=secret,
4650
4684
  disk_size=disk_size,
4651
4685
  disk_tier=disk_tier,
4652
4686
  network_tier=network_tier,
@@ -4756,6 +4790,7 @@ def serve_up(
4756
4790
  image_id: Optional[str],
4757
4791
  env_file: Optional[Dict[str, str]],
4758
4792
  env: List[Tuple[str, str]],
4793
+ secret: List[Tuple[str, str]],
4759
4794
  gpus: Optional[str],
4760
4795
  instance_type: Optional[str],
4761
4796
  ports: Tuple[str],
@@ -4816,6 +4851,7 @@ def serve_up(
4816
4851
  image_id=image_id,
4817
4852
  env_file=env_file,
4818
4853
  env=env,
4854
+ secret=secret,
4819
4855
  disk_size=disk_size,
4820
4856
  disk_tier=disk_tier,
4821
4857
  network_tier=network_tier,
@@ -4864,11 +4900,12 @@ def serve_up(
4864
4900
  @timeline.event
4865
4901
  @usage_lib.entrypoint
4866
4902
  def serve_update(
4867
- service_name: str, service_yaml: Tuple[str, ...],
4868
- workdir: Optional[str], infra: Optional[str], cloud: Optional[str],
4869
- region: Optional[str], zone: Optional[str], num_nodes: Optional[int],
4870
- use_spot: Optional[bool], image_id: Optional[str],
4871
- env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
4903
+ service_name: str, service_yaml: Tuple[str,
4904
+ ...], workdir: Optional[str],
4905
+ infra: Optional[str], cloud: Optional[str], region: Optional[str],
4906
+ zone: Optional[str], num_nodes: Optional[int], use_spot: Optional[bool],
4907
+ image_id: Optional[str], env_file: Optional[Dict[str, str]],
4908
+ env: List[Tuple[str, str]], secret: List[Tuple[str, str]],
4872
4909
  gpus: Optional[str], instance_type: Optional[str], ports: Tuple[str],
4873
4910
  cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
4874
4911
  disk_tier: Optional[str], network_tier: Optional[str], mode: str,
@@ -4920,6 +4957,7 @@ def serve_update(
4920
4957
  image_id=image_id,
4921
4958
  env_file=env_file,
4922
4959
  env=env,
4960
+ secret=secret,
4923
4961
  disk_size=disk_size,
4924
4962
  disk_tier=disk_tier,
4925
4963
  network_tier=network_tier,
@@ -5325,626 +5363,6 @@ def serve_logs(
5325
5363
  raise
5326
5364
 
5327
5365
 
5328
- # ==============================
5329
- # Sky Benchmark CLIs
5330
- # ==============================
5331
-
5332
-
5333
- @ux_utils.print_exception_no_traceback()
5334
- def _get_candidate_configs(
5335
- entrypoint_yaml_path: str) -> Optional[List[Dict[str, str]]]:
5336
- """Gets benchmark candidate configs from a YAML file.
5337
-
5338
- Benchmark candidates are configured in the YAML file as a list of
5339
- dictionaries. Each dictionary defines a candidate config
5340
- by overriding resources. For example:
5341
-
5342
- resources:
5343
- cloud: aws
5344
- candidates:
5345
- - {accelerators: K80}
5346
- - {instance_type: g4dn.2xlarge}
5347
- - {cloud: gcp, accelerators: V100} # overrides cloud
5348
- """
5349
- config = common_utils.read_yaml(os.path.expanduser(entrypoint_yaml_path))
5350
- if not isinstance(config, dict):
5351
- raise ValueError(f'Invalid YAML file: {entrypoint_yaml_path}. '
5352
- 'The YAML file should be parsed into a dictionary.')
5353
- if config.get('resources') is None:
5354
- return None
5355
-
5356
- resources = config['resources']
5357
- if not isinstance(resources, dict):
5358
- raise ValueError(
5359
- f'Invalid resources configuration in {entrypoint_yaml_path}. '
5360
- 'Resources must be a dictionary.')
5361
- if resources.get('candidates') is None:
5362
- return None
5363
-
5364
- candidates = resources['candidates']
5365
- if not isinstance(candidates, list):
5366
- raise ValueError('Resource candidates must be a list of dictionaries.')
5367
- for candidate in candidates:
5368
- if not isinstance(candidate, dict):
5369
- raise ValueError('Each resource candidate must be a dictionary.')
5370
- return candidates
5371
-
5372
-
5373
- @bench.command('launch', cls=_DocumentedCodeCommand)
5374
- @config_option(expose_value=True)
5375
- @click.argument('entrypoint',
5376
- required=True,
5377
- type=str,
5378
- nargs=-1,
5379
- **_get_shell_complete_args(_complete_file_name))
5380
- @click.option('--benchmark',
5381
- '-b',
5382
- required=True,
5383
- type=str,
5384
- help='Benchmark name.')
5385
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
5386
- @click.option('--gpus',
5387
- required=False,
5388
- type=str,
5389
- help=('Comma-separated list of GPUs to run benchmark on. '
5390
- 'Example values: "T4:4,V100:8" (without blank spaces).'))
5391
- @click.option(
5392
- '--ports',
5393
- required=False,
5394
- type=str,
5395
- multiple=True,
5396
- help=('Ports to open on the cluster. '
5397
- 'If specified, overrides the "ports" config in the YAML. '),
5398
- )
5399
- @click.option(
5400
- '--idle-minutes-to-autostop',
5401
- '-i',
5402
- default=None,
5403
- type=int,
5404
- required=False,
5405
- help=('Automatically stop the cluster after this many minutes '
5406
- 'of idleness after setup/file_mounts. This is equivalent to '
5407
- 'running `sky launch -d ...` and then `sky autostop -i <minutes>`. '
5408
- 'If not set, the cluster will not be autostopped.'))
5409
- # Disabling quote check here, as there seems to be a bug in pylint,
5410
- # which incorrectly recognizes the help string as a docstring.
5411
- # pylint: disable=bad-docstring-quotes
5412
- @click.option('--yes',
5413
- '-y',
5414
- is_flag=True,
5415
- default=False,
5416
- required=False,
5417
- help='Skip confirmation prompt.')
5418
- @usage_lib.entrypoint
5419
- def benchmark_launch(
5420
- entrypoint: str,
5421
- benchmark: str,
5422
- name: Optional[str],
5423
- workdir: Optional[str],
5424
- infra: Optional[str],
5425
- cloud: Optional[str],
5426
- region: Optional[str],
5427
- zone: Optional[str],
5428
- gpus: Optional[str],
5429
- num_nodes: Optional[int],
5430
- use_spot: Optional[bool],
5431
- image_id: Optional[str],
5432
- env_file: Optional[Dict[str, str]],
5433
- env: List[Tuple[str, str]],
5434
- cpus: Optional[str],
5435
- memory: Optional[str],
5436
- disk_size: Optional[int],
5437
- disk_tier: Optional[str],
5438
- ports: Tuple[str],
5439
- idle_minutes_to_autostop: Optional[int],
5440
- yes: bool,
5441
- async_call: bool, # pylint: disable=unused-argument
5442
- config_override: Optional[Dict[str, Any]] = None,
5443
- ) -> None:
5444
- """Benchmark a task on different resources.
5445
-
5446
- Example usage: `sky bench launch mytask.yaml -b mytask --gpus V100,T4`
5447
- will benchmark your task on a V100 cluster and a T4 cluster simultaneously.
5448
- Alternatively, specify the benchmarking resources in your YAML (see doc),
5449
- which allows benchmarking on many more resource fields.
5450
- """
5451
- # TODO(zhwu): move benchmark to SkyPilot API server
5452
- env = _merge_env_vars(env_file, env)
5453
- record = benchmark_state.get_benchmark_from_name(benchmark)
5454
- if record is not None:
5455
- raise click.BadParameter(f'Benchmark {benchmark} already exists. '
5456
- 'To delete the previous benchmark result, '
5457
- f'run `sky bench delete {benchmark}`.')
5458
- entrypoint = ' '.join(entrypoint)
5459
- if not entrypoint:
5460
- raise click.BadParameter('Please specify a task yaml to benchmark.')
5461
-
5462
- is_yaml, config = _check_yaml(entrypoint)
5463
- if not is_yaml:
5464
- raise click.BadParameter(
5465
- 'Sky Benchmark does not support command line tasks. '
5466
- 'Please provide a YAML file.')
5467
- assert config is not None, (is_yaml, config)
5468
- cloud, region, zone = _handle_infra_cloud_region_zone_options(
5469
- infra, cloud, region, zone)
5470
-
5471
- click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
5472
- click.secho(entrypoint, bold=True)
5473
-
5474
- candidates = _get_candidate_configs(entrypoint)
5475
- # Check if the candidate configs are specified in both CLI and YAML.
5476
- if candidates is not None:
5477
- message = ('is specified in both CLI and resources.candidates '
5478
- 'in the YAML. Please specify only one of them.')
5479
- if cloud is not None:
5480
- if any('cloud' in candidate for candidate in candidates):
5481
- raise click.BadParameter(f'cloud {message}')
5482
- if region is not None:
5483
- if any('region' in candidate for candidate in candidates):
5484
- raise click.BadParameter(f'region {message}')
5485
- if zone is not None:
5486
- if any('zone' in candidate for candidate in candidates):
5487
- raise click.BadParameter(f'zone {message}')
5488
- if gpus is not None:
5489
- if any('accelerators' in candidate for candidate in candidates):
5490
- raise click.BadParameter(f'gpus (accelerators) {message}')
5491
- if use_spot is not None:
5492
- if any('use_spot' in candidate for candidate in candidates):
5493
- raise click.BadParameter(f'use_spot {message}')
5494
- if image_id is not None:
5495
- if any('image_id' in candidate for candidate in candidates):
5496
- raise click.BadParameter(f'image_id {message}')
5497
- if disk_size is not None:
5498
- if any('disk_size' in candidate for candidate in candidates):
5499
- raise click.BadParameter(f'disk_size {message}')
5500
- if disk_tier is not None:
5501
- if any('disk_tier' in candidate for candidate in candidates):
5502
- raise click.BadParameter(f'disk_tier {message}')
5503
- if ports:
5504
- if any('ports' in candidate for candidate in candidates):
5505
- raise click.BadParameter(f'ports {message}')
5506
-
5507
- # The user can specify the benchmark candidates in either of the two ways:
5508
- # 1. By specifying resources.candidates in the YAML.
5509
- # 2. By specifying gpu types as a command line argument (--gpus).
5510
- override_gpu = None
5511
- if gpus is not None:
5512
- gpu_list = gpus.split(',')
5513
- gpu_list = [gpu.strip() for gpu in gpu_list]
5514
- if ' ' in gpus:
5515
- raise click.BadParameter('Remove blanks in --gpus.')
5516
-
5517
- if len(gpu_list) == 1:
5518
- override_gpu = gpu_list[0]
5519
- else:
5520
- # If len(gpu_list) > 1, gpus is interpreted
5521
- # as a list of benchmark candidates.
5522
- if candidates is None:
5523
- candidates = [{'accelerators': gpu} for gpu in gpu_list]
5524
- override_gpu = None
5525
- else:
5526
- raise ValueError('Provide benchmark candidates in either '
5527
- '--gpus or resources.candidates in the YAML.')
5528
- if candidates is None:
5529
- candidates = [{}]
5530
-
5531
- if 'resources' not in config:
5532
- config['resources'] = {}
5533
- resources_config = config['resources']
5534
-
5535
- # Override the yaml config with the command line arguments.
5536
- if name is not None:
5537
- config['name'] = name
5538
- if workdir is not None:
5539
- config['workdir'] = workdir
5540
- if num_nodes is not None:
5541
- config['num_nodes'] = num_nodes
5542
- override_params = _parse_override_params(cloud=cloud,
5543
- region=region,
5544
- zone=zone,
5545
- gpus=override_gpu,
5546
- cpus=cpus,
5547
- memory=memory,
5548
- use_spot=use_spot,
5549
- image_id=image_id,
5550
- disk_size=disk_size,
5551
- disk_tier=disk_tier,
5552
- ports=ports,
5553
- config_override=config_override)
5554
- _pop_and_ignore_fields_in_override_params(
5555
- override_params, field_to_ignore=['cpus', 'memory'])
5556
- resources_config.update(override_params)
5557
- if 'cloud' in resources_config:
5558
- cloud = resources_config.pop('cloud')
5559
- if cloud is not None:
5560
- resources_config['cloud'] = str(cloud)
5561
- if 'region' in resources_config:
5562
- if resources_config['region'] is None:
5563
- resources_config.pop('region')
5564
- if 'zone' in resources_config:
5565
- if resources_config['zone'] is None:
5566
- resources_config.pop('zone')
5567
- if 'accelerators' in resources_config:
5568
- if resources_config['accelerators'] is None:
5569
- resources_config.pop('accelerators')
5570
- if 'image_id' in resources_config:
5571
- if resources_config['image_id'] is None:
5572
- resources_config.pop('image_id')
5573
-
5574
- # Fully generate the benchmark candidate configs.
5575
- clusters, candidate_configs = benchmark_utils.generate_benchmark_configs(
5576
- benchmark, config, candidates)
5577
- # Show the benchmarking VM instances selected by the optimizer.
5578
- # This also detects the case where the user requested infeasible resources.
5579
- benchmark_utils.print_benchmark_clusters(benchmark, clusters, config,
5580
- candidate_configs)
5581
- if not yes:
5582
- plural = 's' if len(candidates) > 1 else ''
5583
- prompt = f'Launching {len(candidates)} cluster{plural}. Proceed?'
5584
- click.confirm(prompt, default=True, abort=True, show_default=True)
5585
-
5586
- # Configs that are only accepted by the CLI.
5587
- commandline_args: Dict[str, Any] = {}
5588
- # Set the default idle minutes to autostop as 5, mimicking
5589
- # the serverless execution.
5590
- if idle_minutes_to_autostop is None:
5591
- idle_minutes_to_autostop = 5
5592
- commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop
5593
- if env:
5594
- commandline_args['env'] = [f'{k}={v}' for k, v in env]
5595
-
5596
- # Launch the benchmarking clusters in detach mode in parallel.
5597
- benchmark_created = benchmark_utils.launch_benchmark_clusters(
5598
- benchmark, clusters, candidate_configs, commandline_args)
5599
-
5600
- # If at least one cluster is created, print the following messages.
5601
- if benchmark_created:
5602
- logger.info(
5603
- f'\n{colorama.Fore.CYAN}Benchmark name: '
5604
- f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
5605
- '\nTo see the benchmark results: '
5606
- f'{ux_utils.BOLD}sky bench show '
5607
- f'{benchmark}{ux_utils.RESET_BOLD}'
5608
- '\nTo teardown the clusters: '
5609
- f'{ux_utils.BOLD}sky bench down '
5610
- f'{benchmark}{ux_utils.RESET_BOLD}')
5611
- subprocess_utils.run('sky bench ls')
5612
- else:
5613
- logger.error('No benchmarking clusters are created.')
5614
- subprocess_utils.run('sky status')
5615
-
5616
-
5617
- @bench.command('ls', cls=_DocumentedCodeCommand)
5618
- @config_option(expose_value=False)
5619
- @usage_lib.entrypoint
5620
- def benchmark_ls() -> None:
5621
- """List the benchmark history."""
5622
- benchmarks = benchmark_state.get_benchmarks()
5623
- columns = [
5624
- 'BENCHMARK',
5625
- 'TASK',
5626
- 'LAUNCHED',
5627
- ]
5628
-
5629
- max_num_candidates = 1
5630
- for benchmark in benchmarks:
5631
- benchmark_results = benchmark_state.get_benchmark_results(
5632
- benchmark['name'])
5633
- num_candidates = len(benchmark_results)
5634
- if num_candidates > max_num_candidates:
5635
- max_num_candidates = num_candidates
5636
-
5637
- if max_num_candidates == 1:
5638
- columns += ['CANDIDATE']
5639
- else:
5640
- columns += [f'CANDIDATE {i}' for i in range(1, max_num_candidates + 1)]
5641
- benchmark_table = log_utils.create_table(columns)
5642
-
5643
- for benchmark in benchmarks:
5644
- if benchmark['task'] is not None:
5645
- task = benchmark['task']
5646
- else:
5647
- task = '-'
5648
- row = [
5649
- # BENCHMARK
5650
- benchmark['name'],
5651
- # TASK
5652
- task,
5653
- # LAUNCHED
5654
- datetime.datetime.fromtimestamp(benchmark['launched_at']),
5655
- ]
5656
-
5657
- benchmark_results = benchmark_state.get_benchmark_results(
5658
- benchmark['name'])
5659
- # RESOURCES
5660
- for b in benchmark_results:
5661
- num_nodes = b['num_nodes']
5662
- resources = b['resources']
5663
- postfix_spot = '[Spot]' if resources.use_spot else ''
5664
- instance_type = resources.instance_type + postfix_spot
5665
- if resources.accelerators is None:
5666
- accelerators = ''
5667
- else:
5668
- accelerator, count = list(resources.accelerators.items())[0]
5669
- accelerators = f' ({accelerator}:{count})'
5670
- # For brevity, skip the cloud names.
5671
- resources_str = f'{num_nodes}x {instance_type}{accelerators}'
5672
- row.append(resources_str)
5673
- row += [''] * (max_num_candidates - len(benchmark_results))
5674
- benchmark_table.add_row(row)
5675
- if benchmarks:
5676
- click.echo(benchmark_table)
5677
- else:
5678
- click.echo('No benchmark history found.')
5679
-
5680
-
5681
- @bench.command('show', cls=_DocumentedCodeCommand)
5682
- @config_option(expose_value=False)
5683
- @click.argument('benchmark', required=True, type=str)
5684
- # TODO(woosuk): Add --all option to show all the collected information
5685
- # (e.g., setup time, warmup steps, total steps, etc.).
5686
- @usage_lib.entrypoint
5687
- def benchmark_show(benchmark: str) -> None:
5688
- """Show a benchmark report."""
5689
- record = benchmark_state.get_benchmark_from_name(benchmark)
5690
- if record is None:
5691
- raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
5692
- benchmark_utils.update_benchmark_state(benchmark)
5693
-
5694
- click.echo(
5695
- textwrap.dedent("""\
5696
- Legend:
5697
- - #STEPS: Number of steps taken.
5698
- - SEC/STEP, $/STEP: Average time (cost) per step.
5699
- - EST(hr), EST($): Estimated total time (cost) to complete the benchmark.
5700
- """))
5701
- columns = [
5702
- 'CLUSTER',
5703
- 'RESOURCES',
5704
- 'STATUS',
5705
- 'DURATION',
5706
- 'SPENT($)',
5707
- '#STEPS',
5708
- 'SEC/STEP',
5709
- '$/STEP',
5710
- 'EST(hr)',
5711
- 'EST($)',
5712
- ]
5713
-
5714
- cluster_table = log_utils.create_table(columns)
5715
- rows = []
5716
- benchmark_results = benchmark_state.get_benchmark_results(benchmark)
5717
- for result in benchmark_results:
5718
- num_nodes = result['num_nodes']
5719
- resources = result['resources']
5720
- row = [
5721
- # CLUSTER
5722
- result['cluster'],
5723
- # RESOURCES
5724
- f'{num_nodes}x {resources}',
5725
- # STATUS
5726
- result['status'].value,
5727
- ]
5728
-
5729
- record = result['record']
5730
- if (record is None or record.start_time is None or
5731
- record.last_time is None):
5732
- row += ['-'] * (len(columns) - len(row))
5733
- rows.append(row)
5734
- continue
5735
-
5736
- duration_str = log_utils.readable_time_duration(record.start_time,
5737
- record.last_time,
5738
- absolute=True)
5739
- duration = record.last_time - record.start_time
5740
- spent = num_nodes * resources.get_cost(duration)
5741
- spent_str = f'{spent:.4f}'
5742
-
5743
- num_steps = record.num_steps_so_far
5744
- if num_steps is None:
5745
- num_steps = '-'
5746
-
5747
- seconds_per_step = record.seconds_per_step
5748
- if seconds_per_step is None:
5749
- seconds_per_step_str = '-'
5750
- cost_per_step_str = '-'
5751
- else:
5752
- seconds_per_step_str = f'{seconds_per_step:.4f}'
5753
- cost_per_step = num_nodes * resources.get_cost(seconds_per_step)
5754
- cost_per_step_str = f'{cost_per_step:.6f}'
5755
-
5756
- total_time = record.estimated_total_seconds
5757
- if total_time is None:
5758
- total_time_str = '-'
5759
- total_cost_str = '-'
5760
- else:
5761
- total_time_str = f'{total_time / 3600:.2f}'
5762
- total_cost = num_nodes * resources.get_cost(total_time)
5763
- total_cost_str = f'{total_cost:.2f}'
5764
-
5765
- row += [
5766
- # DURATION
5767
- duration_str,
5768
- # SPENT($)
5769
- spent_str,
5770
- # STEPS
5771
- num_steps,
5772
- # SEC/STEP
5773
- seconds_per_step_str,
5774
- # $/STEP
5775
- cost_per_step_str,
5776
- # EST(hr)
5777
- total_time_str,
5778
- # EST($)
5779
- total_cost_str,
5780
- ]
5781
- rows.append(row)
5782
-
5783
- cluster_table.add_rows(rows)
5784
- click.echo(cluster_table)
5785
-
5786
- finished = [
5787
- row for row in rows
5788
- if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value
5789
- ]
5790
- if any(row[5] == '-' for row in finished):
5791
- # No #STEPS. SkyCallback was unused.
5792
- click.secho(
5793
- 'SkyCallback logs are not found in this benchmark. '
5794
- 'Consider using SkyCallback to get more detailed information '
5795
- 'in real time.',
5796
- fg='yellow')
5797
- elif any(row[6] != '-' and row[-1] == '-' for row in rows):
5798
- # No EST($). total_steps is not specified and cannot be inferred.
5799
- click.secho(
5800
- 'Cannot estimate total time and cost because '
5801
- 'the total number of steps cannot be inferred by SkyCallback. '
5802
- 'To get the estimation, specify the total number of steps in '
5803
- 'either `sky_callback.init` or `Sky*Callback`.',
5804
- fg='yellow')
5805
-
5806
-
5807
- @bench.command('down', cls=_DocumentedCodeCommand)
5808
- @config_option(expose_value=False)
5809
- @click.argument('benchmark', required=True, type=str)
5810
- @click.option(
5811
- '--exclude',
5812
- '-e',
5813
- 'clusters_to_exclude',
5814
- required=False,
5815
- type=str,
5816
- multiple=True,
5817
- help=('Cluster name(s) to exclude from termination. '
5818
- 'Typically, you might want to see the benchmark results in '
5819
- '`sky bench show` and exclude a "winner" cluster from termination '
5820
- 'to finish the running task.'))
5821
- @click.option('--yes',
5822
- '-y',
5823
- is_flag=True,
5824
- default=False,
5825
- required=False,
5826
- help='Skip confirmation prompt.')
5827
- @usage_lib.entrypoint
5828
- def benchmark_down(
5829
- benchmark: str,
5830
- clusters_to_exclude: List[str],
5831
- yes: bool,
5832
- ) -> None:
5833
- """Tear down all clusters belonging to a benchmark."""
5834
- record = benchmark_state.get_benchmark_from_name(benchmark)
5835
- if record is None:
5836
- raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
5837
-
5838
- clusters = benchmark_state.get_benchmark_clusters(benchmark)
5839
- to_stop: List[str] = []
5840
- for cluster in clusters:
5841
- if cluster in clusters_to_exclude:
5842
- continue
5843
- if global_user_state.get_cluster_from_name(cluster) is None:
5844
- continue
5845
- to_stop.append(cluster)
5846
-
5847
- _down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
5848
-
5849
-
5850
- @bench.command('delete', cls=_DocumentedCodeCommand)
5851
- @config_option(expose_value=False)
5852
- @click.argument('benchmarks', required=False, type=str, nargs=-1)
5853
- @click.option('--all',
5854
- '-a',
5855
- default=None,
5856
- is_flag=True,
5857
- help='Delete all benchmark reports from the history.')
5858
- @click.option('--yes',
5859
- '-y',
5860
- is_flag=True,
5861
- default=False,
5862
- required=False,
5863
- help='Skip confirmation prompt.')
5864
- @usage_lib.entrypoint
5865
- # pylint: disable=redefined-builtin
5866
- def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
5867
- yes: bool) -> None:
5868
- """Delete benchmark reports from the history."""
5869
- if not benchmarks and all is None:
5870
- raise click.BadParameter(
5871
- 'Either specify benchmarks or use --all to delete all benchmarks.')
5872
- to_delete = []
5873
- if benchmarks:
5874
- for benchmark in benchmarks:
5875
- record = benchmark_state.get_benchmark_from_name(benchmark)
5876
- if record is None:
5877
- print(f'Benchmark {benchmark} not found.')
5878
- else:
5879
- to_delete.append(record)
5880
- if all:
5881
- to_delete = benchmark_state.get_benchmarks()
5882
- if benchmarks:
5883
- print('Both --all and benchmark(s) specified '
5884
- 'for sky bench delete. Letting --all take effect.')
5885
-
5886
- to_delete = [r['name'] for r in to_delete]
5887
- if not to_delete:
5888
- return
5889
-
5890
- benchmark_list = ', '.join(to_delete)
5891
- plural = 's' if len(to_delete) > 1 else ''
5892
- if not yes:
5893
- click.confirm(
5894
- f'Deleting the benchmark{plural}: {benchmark_list}. Proceed?',
5895
- default=True,
5896
- abort=True,
5897
- show_default=True)
5898
-
5899
- progress = rich_progress.Progress(transient=True,
5900
- redirect_stdout=False,
5901
- redirect_stderr=False)
5902
- task = progress.add_task(
5903
- f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
5904
- total=len(to_delete))
5905
-
5906
- def _delete_benchmark(benchmark: str) -> None:
5907
- clusters = benchmark_state.get_benchmark_clusters(benchmark)
5908
- records = []
5909
- for cluster in clusters:
5910
- record = global_user_state.get_cluster_from_name(cluster)
5911
- records.append(record)
5912
- num_clusters = len([r for r in records if r is not None])
5913
-
5914
- if num_clusters > 0:
5915
- plural = 's' if num_clusters > 1 else ''
5916
- message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
5917
- f'has {num_clusters} un-terminated cluster{plural}. '
5918
- f'Terminate the cluster{plural} with '
5919
- f'{ux_utils.BOLD} sky bench down {benchmark} '
5920
- f'{ux_utils.RESET_BOLD} '
5921
- 'before deleting the benchmark report.')
5922
- success = False
5923
- else:
5924
- bucket_name = benchmark_state.get_benchmark_from_name(
5925
- benchmark)['bucket']
5926
- handle = global_user_state.get_handle_from_storage_name(bucket_name)
5927
- assert handle is not None, bucket_name
5928
- bucket_type = list(handle.sky_stores.keys())[0]
5929
- benchmark_utils.remove_benchmark_logs(benchmark, bucket_name,
5930
- bucket_type)
5931
- benchmark_state.delete_benchmark(benchmark)
5932
- message = (f'{colorama.Fore.GREEN}Benchmark report for '
5933
- f'{benchmark} deleted.{colorama.Style.RESET_ALL}')
5934
- success = True
5935
-
5936
- progress.stop()
5937
- click.secho(message)
5938
- if success:
5939
- progress.update(task, advance=1)
5940
- progress.start()
5941
-
5942
- with progress:
5943
- subprocess_utils.run_in_parallel(_delete_benchmark, to_delete)
5944
- progress.live.transient = False
5945
- progress.refresh()
5946
-
5947
-
5948
5366
  @cli.group(cls=_NaturalOrderGroup, hidden=True)
5949
5367
  def local():
5950
5368
  """SkyPilot local tools CLI."""
@@ -6080,10 +5498,19 @@ def api():
6080
5498
  'to manage the process lifecycle and collect logs directly. '
6081
5499
  'This is useful when the API server is managed by systems '
6082
5500
  'like systemd and Kubernetes.')
5501
+ @click.option('--enable-basic-auth',
5502
+ is_flag=True,
5503
+ default=False,
5504
+ required=False,
5505
+ help='Enable basic authentication in the SkyPilot API server.')
6083
5506
  @usage_lib.entrypoint
6084
- def api_start(deploy: bool, host: Optional[str], foreground: bool):
5507
+ def api_start(deploy: bool, host: Optional[str], foreground: bool,
5508
+ enable_basic_auth: bool):
6085
5509
  """Starts the SkyPilot API server locally."""
6086
- sdk.api_start(deploy=deploy, host=host, foreground=foreground)
5510
+ sdk.api_start(deploy=deploy,
5511
+ host=host,
5512
+ foreground=foreground,
5513
+ enable_basic_auth=enable_basic_auth)
6087
5514
 
6088
5515
 
6089
5516
  @api.command('stop', cls=_DocumentedCodeCommand)