skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/cli.py CHANGED
@@ -28,6 +28,7 @@ import datetime
28
28
  import functools
29
29
  import getpass
30
30
  import os
31
+ import pathlib
31
32
  import shlex
32
33
  import shutil
33
34
  import subprocess
@@ -54,6 +55,7 @@ from sky import jobs as managed_jobs
54
55
  from sky import models
55
56
  from sky import serve as serve_lib
56
57
  from sky import sky_logging
58
+ from sky import skypilot_config
57
59
  from sky.adaptors import common as adaptors_common
58
60
  from sky.benchmark import benchmark_state
59
61
  from sky.benchmark import benchmark_utils
@@ -278,6 +280,54 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
278
280
  return list(env_dict.items())
279
281
 
280
282
 
283
+ def config_option(expose_value: bool):
284
+ """A decorator for the --config option.
285
+
286
+ This decorator is used to parse the --config option.
287
+
288
+ Any overrides specified in the command line will be applied to the skypilot
289
+ config before the decorated function is called.
290
+
291
+ If expose_value is True, the decorated function will receive the parsed
292
+ config overrides as 'config_override' parameter.
293
+
294
+ Args:
295
+ expose_value: Whether to expose the value of the option to the decorated
296
+ function.
297
+ """
298
+
299
+ def preprocess_config_options(ctx, param, value):
300
+ del ctx # Unused.
301
+ param.name = 'config_override'
302
+ try:
303
+ if len(value) == 0:
304
+ return None
305
+ elif len(value) > 1:
306
+ raise ValueError('argument specified multiple times. '
307
+ 'To specify multiple configs, use '
308
+ '--config nested.key1=val1,another.key2=val2')
309
+ else:
310
+ # Apply the config overrides to the skypilot config.
311
+ return skypilot_config.apply_cli_config(value[0])
312
+ except ValueError as e:
313
+ raise click.BadParameter(f'{str(e)}') from e
314
+
315
+ def return_option_decorator(func):
316
+ return click.option(
317
+ '--config',
318
+ required=False,
319
+ type=str,
320
+ multiple=True,
321
+ expose_value=expose_value,
322
+ callback=preprocess_config_options,
323
+ help=('Path to a config file or a comma-separated '
324
+ 'list of key-value pairs '
325
+ '(e.g. "nested.key1=val1,another.key2=val2").'),
326
+ )(func)
327
+
328
+ return return_option_decorator
329
+
330
+
281
331
  _COMMON_OPTIONS = [
282
332
  click.option('--async/--no-async',
283
333
  'async_call',
@@ -630,7 +680,8 @@ def _parse_override_params(
630
680
  image_id: Optional[str] = None,
631
681
  disk_size: Optional[int] = None,
632
682
  disk_tier: Optional[str] = None,
633
- ports: Optional[Tuple[str, ...]] = None) -> Dict[str, Any]:
683
+ ports: Optional[Tuple[str, ...]] = None,
684
+ config_override: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
634
685
  """Parses the override parameters into a dictionary."""
635
686
  override_params: Dict[str, Any] = {}
636
687
  if cloud is not None:
@@ -691,6 +742,8 @@ def _parse_override_params(
691
742
  override_params['ports'] = None
692
743
  else:
693
744
  override_params['ports'] = ports
745
+ if config_override:
746
+ override_params['_cluster_config_overrides'] = config_override
694
747
  return override_params
695
748
 
696
749
 
@@ -793,6 +846,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
793
846
  field_to_ignore: Optional[List[str]] = None,
794
847
  # job launch specific
795
848
  job_recovery: Optional[str] = None,
849
+ config_override: Optional[Dict[str, Any]] = None,
796
850
  ) -> Union[sky.Task, sky.Dag]:
797
851
  """Creates a task or a dag from an entrypoint with overrides.
798
852
 
@@ -826,7 +880,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
826
880
  image_id=image_id,
827
881
  disk_size=disk_size,
828
882
  disk_tier=disk_tier,
829
- ports=ports)
883
+ ports=ports,
884
+ config_override=config_override)
830
885
  if field_to_ignore is not None:
831
886
  _pop_and_ignore_fields_in_override_params(override_params,
832
887
  field_to_ignore)
@@ -1010,6 +1065,7 @@ def cli():
1010
1065
 
1011
1066
 
1012
1067
  @cli.command(cls=_DocumentedCodeCommand)
1068
+ @config_option(expose_value=True)
1013
1069
  @click.argument('entrypoint',
1014
1070
  required=False,
1015
1071
  type=str,
@@ -1139,7 +1195,8 @@ def launch(
1139
1195
  no_setup: bool,
1140
1196
  clone_disk_from: Optional[str],
1141
1197
  fast: bool,
1142
- async_call: bool):
1198
+ async_call: bool,
1199
+ config_override: Optional[Dict[str, Any]] = None):
1143
1200
  """Launch a cluster or task.
1144
1201
 
1145
1202
  If ENTRYPOINT points to a valid YAML file, it is read in as the task
@@ -1181,6 +1238,7 @@ def launch(
1181
1238
  disk_size=disk_size,
1182
1239
  disk_tier=disk_tier,
1183
1240
  ports=ports,
1241
+ config_override=config_override,
1184
1242
  )
1185
1243
  if isinstance(task_or_dag, sky.Dag):
1186
1244
  raise click.UsageError(
@@ -1245,6 +1303,7 @@ def launch(
1245
1303
 
1246
1304
 
1247
1305
  @cli.command(cls=_DocumentedCodeCommand)
1306
+ @config_option(expose_value=True)
1248
1307
  @click.argument('cluster',
1249
1308
  required=False,
1250
1309
  type=str,
@@ -1273,15 +1332,29 @@ def launch(
1273
1332
  _COMMON_OPTIONS)
1274
1333
  @usage_lib.entrypoint
1275
1334
  # pylint: disable=redefined-builtin
1276
- def exec(cluster: Optional[str], cluster_option: Optional[str],
1277
- entrypoint: Tuple[str, ...], detach_run: bool, name: Optional[str],
1278
- cloud: Optional[str], region: Optional[str], zone: Optional[str],
1279
- workdir: Optional[str], gpus: Optional[str], ports: Tuple[str],
1280
- instance_type: Optional[str], num_nodes: Optional[int],
1281
- use_spot: Optional[bool], image_id: Optional[str],
1282
- env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
1283
- cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
1284
- disk_tier: Optional[str], async_call: bool):
1335
+ def exec(cluster: Optional[str],
1336
+ cluster_option: Optional[str],
1337
+ entrypoint: Tuple[str, ...],
1338
+ detach_run: bool,
1339
+ name: Optional[str],
1340
+ cloud: Optional[str],
1341
+ region: Optional[str],
1342
+ zone: Optional[str],
1343
+ workdir: Optional[str],
1344
+ gpus: Optional[str],
1345
+ ports: Tuple[str],
1346
+ instance_type: Optional[str],
1347
+ num_nodes: Optional[int],
1348
+ use_spot: Optional[bool],
1349
+ image_id: Optional[str],
1350
+ env_file: Optional[Dict[str, str]],
1351
+ env: List[Tuple[str, str]],
1352
+ cpus: Optional[str],
1353
+ memory: Optional[str],
1354
+ disk_size: Optional[int],
1355
+ disk_tier: Optional[str],
1356
+ async_call: bool,
1357
+ config_override: Optional[Dict[str, Any]] = None):
1285
1358
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1286
1359
  """Execute a task or command on an existing cluster.
1287
1360
 
@@ -1374,6 +1447,7 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
1374
1447
  disk_tier=disk_tier,
1375
1448
  ports=ports,
1376
1449
  field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'],
1450
+ config_override=config_override,
1377
1451
  )
1378
1452
 
1379
1453
  if isinstance(task_or_dag, sky.Dag):
@@ -1657,6 +1731,7 @@ def _show_endpoint(query_clusters: Optional[List[str]],
1657
1731
 
1658
1732
 
1659
1733
  @cli.command()
1734
+ @config_option(expose_value=False)
1660
1735
  @click.option('--verbose',
1661
1736
  '-v',
1662
1737
  default=False,
@@ -1949,6 +2024,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1949
2024
 
1950
2025
 
1951
2026
  @cli.command()
2027
+ @config_option(expose_value=False)
1952
2028
  @click.option('--all',
1953
2029
  '-a',
1954
2030
  default=False,
@@ -2019,6 +2095,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
2019
2095
 
2020
2096
 
2021
2097
  @cli.command()
2098
+ @config_option(expose_value=False)
2022
2099
  @click.option('--all-users',
2023
2100
  '-u',
2024
2101
  default=False,
@@ -2080,6 +2157,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
2080
2157
 
2081
2158
 
2082
2159
  @cli.command()
2160
+ @config_option(expose_value=False)
2083
2161
  @click.option(
2084
2162
  '--sync-down',
2085
2163
  '-s',
@@ -2217,6 +2295,7 @@ def logs(
2217
2295
 
2218
2296
 
2219
2297
  @cli.command()
2298
+ @config_option(expose_value=False)
2220
2299
  @click.argument('cluster',
2221
2300
  required=True,
2222
2301
  type=str,
@@ -2320,6 +2399,7 @@ def cancel(
2320
2399
 
2321
2400
 
2322
2401
  @cli.command(cls=_DocumentedCodeCommand)
2402
+ @config_option(expose_value=False)
2323
2403
  @click.argument('clusters',
2324
2404
  nargs=-1,
2325
2405
  required=False,
@@ -2387,6 +2467,7 @@ def stop(
2387
2467
 
2388
2468
 
2389
2469
  @cli.command(cls=_DocumentedCodeCommand)
2470
+ @config_option(expose_value=False)
2390
2471
  @click.argument('clusters',
2391
2472
  nargs=-1,
2392
2473
  required=False,
@@ -2499,6 +2580,7 @@ def autostop(
2499
2580
 
2500
2581
 
2501
2582
  @cli.command(cls=_DocumentedCodeCommand)
2583
+ @config_option(expose_value=False)
2502
2584
  @click.argument('clusters',
2503
2585
  nargs=-1,
2504
2586
  required=False,
@@ -2744,6 +2826,7 @@ def start(
2744
2826
 
2745
2827
 
2746
2828
  @cli.command(cls=_DocumentedCodeCommand)
2829
+ @config_option(expose_value=False)
2747
2830
  @click.argument('clusters',
2748
2831
  nargs=-1,
2749
2832
  required=False,
@@ -3182,6 +3265,7 @@ def _down_or_stop_clusters(
3182
3265
 
3183
3266
 
3184
3267
  @cli.command(cls=_DocumentedCodeCommand)
3268
+ @config_option(expose_value=False)
3185
3269
  @click.argument('clouds', required=False, type=str, nargs=-1)
3186
3270
  @click.option('--verbose',
3187
3271
  '-v',
@@ -3222,6 +3306,7 @@ def check(clouds: Tuple[str], verbose: bool):
3222
3306
 
3223
3307
 
3224
3308
  @cli.command()
3309
+ @config_option(expose_value=False)
3225
3310
  @click.argument('accelerator_str', required=False)
3226
3311
  @click.option('--all',
3227
3312
  '-a',
@@ -3379,15 +3464,14 @@ def show_gpus(
3379
3464
  ])
3380
3465
  return realtime_gpu_table
3381
3466
 
3382
- # TODO(zhwu): this needs to run on remote server.
3383
- def _get_kubernetes_node_info_table(context: Optional[str]):
3467
+ def _format_kubernetes_node_info(context: Optional[str]):
3384
3468
  node_table = log_utils.create_table(
3385
3469
  ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
3386
3470
 
3387
- no_permissions_str = '<no permissions>'
3388
- node_info_dict = sdk.stream_and_get(
3471
+ nodes_info = sdk.stream_and_get(
3389
3472
  sdk.kubernetes_node_info(context=context))
3390
- for node_name, node_info in node_info_dict.items():
3473
+ no_permissions_str = '<no permissions>'
3474
+ for node_name, node_info in nodes_info.node_info_dict.items():
3391
3475
  available = node_info.free[
3392
3476
  'accelerators_available'] if node_info.free[
3393
3477
  'accelerators_available'] != -1 else no_permissions_str
@@ -3395,7 +3479,14 @@ def show_gpus(
3395
3479
  node_name, node_info.accelerator_type,
3396
3480
  node_info.total['accelerator_count'], available
3397
3481
  ])
3398
- return node_table
3482
+ k8s_per_node_acc_message = (
3483
+ 'Kubernetes per node accelerator availability ')
3484
+ if nodes_info.hint:
3485
+ k8s_per_node_acc_message += nodes_info.hint
3486
+ return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3487
+ f'{k8s_per_node_acc_message}'
3488
+ f'{colorama.Style.RESET_ALL}\n'
3489
+ f'{node_table.get_string()}')
3399
3490
 
3400
3491
  def _output() -> Generator[str, None, None]:
3401
3492
  gpu_table = log_utils.create_table(
@@ -3443,22 +3534,8 @@ def show_gpus(
3443
3534
  f'Kubernetes GPUs {context_str}'
3444
3535
  f'{colorama.Style.RESET_ALL}\n')
3445
3536
  yield from k8s_realtime_table.get_string()
3446
- k8s_node_table = _get_kubernetes_node_info_table(context)
3447
3537
  yield '\n\n'
3448
- # TODO(Doyoung): Update the message with the multi-host TPU
3449
- # support.
3450
- k8s_per_node_acc_message = (
3451
- 'Kubernetes per node accelerator availability ')
3452
- if kubernetes_utils.multi_host_tpu_exists_in_cluster(
3453
- context):
3454
- k8s_per_node_acc_message += (
3455
- '(Note: Multi-host TPUs are detected and excluded '
3456
- 'from the display as multi-host TPUs are not '
3457
- 'supported.)')
3458
- yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3459
- f'{k8s_per_node_acc_message}'
3460
- f'{colorama.Style.RESET_ALL}\n')
3461
- yield from k8s_node_table.get_string()
3538
+ yield _format_kubernetes_node_info(context)
3462
3539
  if kubernetes_autoscaling:
3463
3540
  k8s_messages += (
3464
3541
  '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3693,6 +3770,7 @@ def storage():
3693
3770
 
3694
3771
 
3695
3772
  @storage.command('ls', cls=_DocumentedCodeCommand)
3773
+ @config_option(expose_value=False)
3696
3774
  @click.option('--verbose',
3697
3775
  '-v',
3698
3776
  default=False,
@@ -3711,6 +3789,7 @@ def storage_ls(verbose: bool):
3711
3789
 
3712
3790
 
3713
3791
  @storage.command('delete', cls=_DocumentedCodeCommand)
3792
+ @config_option(expose_value=False)
3714
3793
  @click.argument('names',
3715
3794
  required=False,
3716
3795
  type=str,
@@ -3795,6 +3874,7 @@ def jobs():
3795
3874
 
3796
3875
 
3797
3876
  @jobs.command('launch', cls=_DocumentedCodeCommand)
3877
+ @config_option(expose_value=True)
3798
3878
  @click.argument('entrypoint',
3799
3879
  required=True,
3800
3880
  type=str,
@@ -3852,6 +3932,7 @@ def jobs_launch(
3852
3932
  detach_run: bool,
3853
3933
  yes: bool,
3854
3934
  async_call: bool,
3935
+ config_override: Optional[Dict[str, Any]] = None,
3855
3936
  ):
3856
3937
  """Launch a managed job from a YAML or a command.
3857
3938
 
@@ -3892,6 +3973,7 @@ def jobs_launch(
3892
3973
  disk_tier=disk_tier,
3893
3974
  ports=ports,
3894
3975
  job_recovery=job_recovery,
3976
+ config_override=config_override,
3895
3977
  )
3896
3978
 
3897
3979
  if not isinstance(task_or_dag, sky.Dag):
@@ -3929,6 +4011,7 @@ def jobs_launch(
3929
4011
 
3930
4012
 
3931
4013
  @jobs.command('queue', cls=_DocumentedCodeCommand)
4014
+ @config_option(expose_value=False)
3932
4015
  @click.option('--verbose',
3933
4016
  '-v',
3934
4017
  default=False,
@@ -4045,6 +4128,7 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4045
4128
 
4046
4129
 
4047
4130
  @jobs.command('cancel', cls=_DocumentedCodeCommand)
4131
+ @config_option(expose_value=False)
4048
4132
  @click.option('--name',
4049
4133
  '-n',
4050
4134
  required=False,
@@ -4119,6 +4203,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4119
4203
 
4120
4204
 
4121
4205
  @jobs.command('logs', cls=_DocumentedCodeCommand)
4206
+ @config_option(expose_value=False)
4122
4207
  @click.option('--name',
4123
4208
  '-n',
4124
4209
  required=False,
@@ -4183,6 +4268,7 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
4183
4268
 
4184
4269
 
4185
4270
  @jobs.command('dashboard', cls=_DocumentedCodeCommand)
4271
+ @config_option(expose_value=False)
4186
4272
  @usage_lib.entrypoint
4187
4273
  def jobs_dashboard():
4188
4274
  """Opens a dashboard for managed jobs."""
@@ -4312,6 +4398,7 @@ def _generate_task_with_service(
4312
4398
 
4313
4399
 
4314
4400
  @serve.command('up', cls=_DocumentedCodeCommand)
4401
+ @config_option(expose_value=False)
4315
4402
  @click.argument('service_yaml',
4316
4403
  required=True,
4317
4404
  type=str,
@@ -4423,6 +4510,7 @@ def serve_up(
4423
4510
  # TODO(MaoZiming): Expose mix replica traffic option to user.
4424
4511
  # Currently, we do not mix traffic from old and new replicas.
4425
4512
  @serve.command('update', cls=_DocumentedCodeCommand)
4513
+ @config_option(expose_value=False)
4426
4514
  @click.argument('service_name', required=True, type=str)
4427
4515
  @click.argument('service_yaml',
4428
4516
  required=True,
@@ -4523,6 +4611,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
4523
4611
 
4524
4612
 
4525
4613
  @serve.command('status', cls=_DocumentedCodeCommand)
4614
+ @config_option(expose_value=False)
4526
4615
  @click.option('--verbose',
4527
4616
  '-v',
4528
4617
  default=False,
@@ -4648,6 +4737,7 @@ def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
4648
4737
 
4649
4738
 
4650
4739
  @serve.command('down', cls=_DocumentedCodeCommand)
4740
+ @config_option(expose_value=False)
4651
4741
  @click.argument('service_names', required=False, type=str, nargs=-1)
4652
4742
  @click.option('--all',
4653
4743
  '-a',
@@ -4761,6 +4851,7 @@ def serve_down(
4761
4851
 
4762
4852
 
4763
4853
  @serve.command('logs', cls=_DocumentedCodeCommand)
4854
+ @config_option(expose_value=False)
4764
4855
  @click.option(
4765
4856
  '--follow/--no-follow',
4766
4857
  is_flag=True,
@@ -4777,8 +4868,14 @@ def serve_down(
4777
4868
  default=False,
4778
4869
  required=False,
4779
4870
  help='Show the load balancer logs of this service.')
4871
+ @click.option('--sync-down',
4872
+ '-s',
4873
+ is_flag=True,
4874
+ default=False,
4875
+ help='Sync down logs to the local machine. Can be combined with '
4876
+ '--controller, --load-balancer, or a replica ID to narrow scope.')
4780
4877
  @click.argument('service_name', required=True, type=str)
4781
- @click.argument('replica_id', required=False, type=int)
4878
+ @click.argument('replica_ids', required=False, type=int, nargs=-1)
4782
4879
  @usage_lib.entrypoint
4783
4880
  # TODO(tian): Add default argument for this CLI if none of the flags are
4784
4881
  # specified.
@@ -4787,9 +4884,13 @@ def serve_logs(
4787
4884
  follow: bool,
4788
4885
  controller: bool,
4789
4886
  load_balancer: bool,
4790
- replica_id: Optional[int],
4887
+ replica_ids: Tuple[int, ...],
4888
+ sync_down: bool,
4791
4889
  ):
4792
- """Tail the log of a service.
4890
+ """Tail or sync down logs of a service.
4891
+
4892
+ Logs can be tailed from one target (controller, load balancer, or a single
4893
+ replica) or synced down from multiple targets simultaneously.
4793
4894
 
4794
4895
  Example:
4795
4896
 
@@ -4803,27 +4904,89 @@ def serve_logs(
4803
4904
  \b
4804
4905
  # Tail the logs of replica 1
4805
4906
  sky serve logs [SERVICE_NAME] 1
4907
+ \b
4908
+ # Sync down all logs of the service (controller, LB, all replicas)
4909
+ sky serve logs [SERVICE_NAME] --sync-down
4910
+ \b
4911
+ # Sync down controller logs and logs for replicas 1 and 3
4912
+ sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
4806
4913
  """
4807
- have_replica_id = replica_id is not None
4808
- num_flags = (controller + load_balancer + have_replica_id)
4809
- if num_flags > 1:
4810
- raise click.UsageError('At most one of --controller, --load-balancer, '
4811
- '[REPLICA_ID] can be specified.')
4812
- if num_flags == 0:
4813
- raise click.UsageError('One of --controller, --load-balancer, '
4814
- '[REPLICA_ID] must be specified.')
4914
+ chosen_components: Set[serve_lib.ServiceComponent] = set()
4815
4915
  if controller:
4816
- target_component = serve_lib.ServiceComponent.CONTROLLER
4817
- elif load_balancer:
4818
- target_component = serve_lib.ServiceComponent.LOAD_BALANCER
4819
- else:
4820
- # Already checked that num_flags == 1.
4821
- assert replica_id is not None
4822
- target_component = serve_lib.ServiceComponent.REPLICA
4916
+ chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
4917
+ if load_balancer:
4918
+ chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
4919
+ # replica_ids contains the specific replica IDs provided by the user.
4920
+ # If it's not empty, it implies the user wants replica logs.
4921
+ if replica_ids:
4922
+ chosen_components.add(serve_lib.ServiceComponent.REPLICA)
4923
+
4924
+ if sync_down:
4925
+ # For sync-down, multiple targets are allowed.
4926
+ # If no specific components/replicas are mentioned, sync all.
4927
+ # Note: Multiple replicas or targets can only be specified when
4928
+ # using --sync-down.
4929
+ targets_to_sync = list(chosen_components)
4930
+ if not targets_to_sync and not replica_ids:
4931
+ # Default to all components if nothing specific is requested
4932
+ targets_to_sync = [
4933
+ serve_lib.ServiceComponent.CONTROLLER,
4934
+ serve_lib.ServiceComponent.LOAD_BALANCER,
4935
+ serve_lib.ServiceComponent.REPLICA,
4936
+ ]
4937
+
4938
+ timestamp = sky_logging.get_run_timestamp()
4939
+ log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
4940
+ f'{service_name}_{timestamp}').expanduser()
4941
+ log_dir.mkdir(parents=True, exist_ok=True)
4942
+
4943
+ with rich_utils.client_status(
4944
+ ux_utils.spinner_message('Downloading service logs...')):
4945
+ serve_lib.sync_down_logs(service_name,
4946
+ local_dir=str(log_dir),
4947
+ targets=targets_to_sync,
4948
+ replica_ids=list(replica_ids))
4949
+ style = colorama.Style
4950
+ fore = colorama.Fore
4951
+ logger.info(f'{fore.CYAN}Service {service_name} logs: '
4952
+ f'{log_dir}{style.RESET_ALL}')
4953
+ return
4954
+
4955
+ # Tailing requires exactly one target.
4956
+ num_targets = len(chosen_components)
4957
+ # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
4958
+ if serve_lib.ServiceComponent.REPLICA in chosen_components:
4959
+ if len(replica_ids) != 1:
4960
+ raise click.UsageError(
4961
+ 'Can only tail logs from a single replica at a time. '
4962
+ 'Provide exactly one REPLICA_ID or use --sync-down '
4963
+ 'to download logs from multiple replicas.')
4964
+ # If replica is chosen and len is 1, num_targets effectively counts it.
4965
+ # We need to ensure no other component (controller/LB) is selected.
4966
+ if num_targets > 1:
4967
+ raise click.UsageError(
4968
+ 'Can only tail logs from one target at a time (controller, '
4969
+ 'load balancer, or a single replica). Use --sync-down '
4970
+ 'to download logs from multiple sources.')
4971
+ elif num_targets == 0:
4972
+ raise click.UsageError(
4973
+ 'Specify a target to tail: --controller, --load-balancer, or '
4974
+ 'a REPLICA_ID.')
4975
+ elif num_targets > 1:
4976
+ raise click.UsageError(
4977
+ 'Can only tail logs from one target at a time. Use --sync-down '
4978
+ 'to download logs from multiple sources.')
4979
+
4980
+ # At this point, we have exactly one target for tailing.
4981
+ assert len(chosen_components) == 1
4982
+ assert len(replica_ids) in [0, 1]
4983
+ target_component = chosen_components.pop()
4984
+ target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
4985
+
4823
4986
  try:
4824
4987
  serve_lib.tail_logs(service_name,
4825
4988
  target=target_component,
4826
- replica_id=replica_id,
4989
+ replica_id=target_replica_id,
4827
4990
  follow=follow)
4828
4991
  except exceptions.ClusterNotUpError:
4829
4992
  with ux_utils.print_exception_no_traceback():
@@ -4874,6 +5037,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4874
5037
 
4875
5038
 
4876
5039
  @bench.command('launch', cls=_DocumentedCodeCommand)
5040
+ @config_option(expose_value=True)
4877
5041
  @click.argument('entrypoint',
4878
5042
  required=True,
4879
5043
  type=str,
@@ -4919,27 +5083,28 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4919
5083
  help='Skip confirmation prompt.')
4920
5084
  @usage_lib.entrypoint
4921
5085
  def benchmark_launch(
4922
- entrypoint: str,
4923
- benchmark: str,
4924
- name: Optional[str],
4925
- workdir: Optional[str],
4926
- cloud: Optional[str],
4927
- region: Optional[str],
4928
- zone: Optional[str],
4929
- gpus: Optional[str],
4930
- num_nodes: Optional[int],
4931
- use_spot: Optional[bool],
4932
- image_id: Optional[str],
4933
- env_file: Optional[Dict[str, str]],
4934
- env: List[Tuple[str, str]],
4935
- cpus: Optional[str],
4936
- memory: Optional[str],
4937
- disk_size: Optional[int],
4938
- disk_tier: Optional[str],
4939
- ports: Tuple[str],
4940
- idle_minutes_to_autostop: Optional[int],
4941
- yes: bool,
4942
- async_call: bool, # pylint: disable=unused-argument
5086
+ entrypoint: str,
5087
+ benchmark: str,
5088
+ name: Optional[str],
5089
+ workdir: Optional[str],
5090
+ cloud: Optional[str],
5091
+ region: Optional[str],
5092
+ zone: Optional[str],
5093
+ gpus: Optional[str],
5094
+ num_nodes: Optional[int],
5095
+ use_spot: Optional[bool],
5096
+ image_id: Optional[str],
5097
+ env_file: Optional[Dict[str, str]],
5098
+ env: List[Tuple[str, str]],
5099
+ cpus: Optional[str],
5100
+ memory: Optional[str],
5101
+ disk_size: Optional[int],
5102
+ disk_tier: Optional[str],
5103
+ ports: Tuple[str],
5104
+ idle_minutes_to_autostop: Optional[int],
5105
+ yes: bool,
5106
+ async_call: bool, # pylint: disable=unused-argument
5107
+ config_override: Optional[Dict[str, Any]] = None,
4943
5108
  ) -> None:
4944
5109
  """Benchmark a task on different resources.
4945
5110
 
@@ -5048,7 +5213,8 @@ def benchmark_launch(
5048
5213
  image_id=image_id,
5049
5214
  disk_size=disk_size,
5050
5215
  disk_tier=disk_tier,
5051
- ports=ports)
5216
+ ports=ports,
5217
+ config_override=config_override)
5052
5218
  _pop_and_ignore_fields_in_override_params(
5053
5219
  override_params, field_to_ignore=['cpus', 'memory'])
5054
5220
  resources_config.update(override_params)
@@ -5113,6 +5279,7 @@ def benchmark_launch(
5113
5279
 
5114
5280
 
5115
5281
  @bench.command('ls', cls=_DocumentedCodeCommand)
5282
+ @config_option(expose_value=False)
5116
5283
  @usage_lib.entrypoint
5117
5284
  def benchmark_ls() -> None:
5118
5285
  """List the benchmark history."""
@@ -5176,6 +5343,7 @@ def benchmark_ls() -> None:
5176
5343
 
5177
5344
 
5178
5345
  @bench.command('show', cls=_DocumentedCodeCommand)
5346
+ @config_option(expose_value=False)
5179
5347
  @click.argument('benchmark', required=True, type=str)
5180
5348
  # TODO(woosuk): Add --all option to show all the collected information
5181
5349
  # (e.g., setup time, warmup steps, total steps, etc.).
@@ -5301,6 +5469,7 @@ def benchmark_show(benchmark: str) -> None:
5301
5469
 
5302
5470
 
5303
5471
  @bench.command('down', cls=_DocumentedCodeCommand)
5472
+ @config_option(expose_value=False)
5304
5473
  @click.argument('benchmark', required=True, type=str)
5305
5474
  @click.option(
5306
5475
  '--exclude',
@@ -5343,6 +5512,7 @@ def benchmark_down(
5343
5512
 
5344
5513
 
5345
5514
  @bench.command('delete', cls=_DocumentedCodeCommand)
5515
+ @config_option(expose_value=False)
5346
5516
  @click.argument('benchmarks', required=False, type=str, nargs=-1)
5347
5517
  @click.option('--all',
5348
5518
  '-a',
@@ -5477,6 +5647,7 @@ def local():
5477
5647
  help='Password for the ssh-user to execute sudo commands. '
5478
5648
  'Required only if passwordless sudo is not setup.')
5479
5649
  @local.command('up', cls=_DocumentedCodeCommand)
5650
+ @config_option(expose_value=False)
5480
5651
  @_add_click_options(_COMMON_OPTIONS)
5481
5652
  @usage_lib.entrypoint
5482
5653
  def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
@@ -5532,6 +5703,7 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5532
5703
 
5533
5704
 
5534
5705
  @local.command('down', cls=_DocumentedCodeCommand)
5706
+ @config_option(expose_value=False)
5535
5707
  @_add_click_options(_COMMON_OPTIONS)
5536
5708
  @usage_lib.entrypoint
5537
5709
  def local_down(async_call: bool):
@@ -5547,6 +5719,7 @@ def api():
5547
5719
 
5548
5720
 
5549
5721
  @api.command('start', cls=_DocumentedCodeCommand)
5722
+ @config_option(expose_value=False)
5550
5723
  @click.option('--deploy',
5551
5724
  type=bool,
5552
5725
  is_flag=True,
@@ -5579,6 +5752,7 @@ def api_start(deploy: bool, host: Optional[str], foreground: bool):
5579
5752
 
5580
5753
 
5581
5754
  @api.command('stop', cls=_DocumentedCodeCommand)
5755
+ @config_option(expose_value=False)
5582
5756
  @usage_lib.entrypoint
5583
5757
  def api_stop():
5584
5758
  """Stops the SkyPilot API server locally."""
@@ -5586,6 +5760,7 @@ def api_stop():
5586
5760
 
5587
5761
 
5588
5762
  @api.command('logs', cls=_DocumentedCodeCommand)
5763
+ @config_option(expose_value=False)
5589
5764
  @click.argument('request_id', required=False, type=str)
5590
5765
  @click.option('--server-logs',
5591
5766
  is_flag=True,
@@ -5625,6 +5800,7 @@ def api_logs(request_id: Optional[str], server_logs: bool,
5625
5800
 
5626
5801
 
5627
5802
  @api.command('cancel', cls=_DocumentedCodeCommand)
5803
+ @config_option(expose_value=False)
5628
5804
  @click.argument('request_ids', required=False, type=str, nargs=-1)
5629
5805
  @click.option('--all',
5630
5806
  '-a',
@@ -5666,6 +5842,7 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
5666
5842
 
5667
5843
 
5668
5844
  @api.command('status', cls=_DocumentedCodeCommand)
5845
+ @config_option(expose_value=False)
5669
5846
  @click.argument('request_ids', required=False, type=str, nargs=-1)
5670
5847
  @click.option('--all-status',
5671
5848
  '-a',
@@ -5709,6 +5886,7 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
5709
5886
 
5710
5887
 
5711
5888
  @api.command('login', cls=_DocumentedCodeCommand)
5889
+ @config_option(expose_value=False)
5712
5890
  @click.option('--endpoint',
5713
5891
  '-e',
5714
5892
  required=False,
@@ -5720,6 +5898,7 @@ def api_login(endpoint: Optional[str]):
5720
5898
 
5721
5899
 
5722
5900
  @api.command('info', cls=_DocumentedCodeCommand)
5901
+ @config_option(expose_value=False)
5723
5902
  @usage_lib.entrypoint
5724
5903
  def api_info():
5725
5904
  """Shows the SkyPilot API server URL."""