skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +31 -3
- sky/backends/cloud_vm_ray_backend.py +22 -29
- sky/backends/wheel_utils.py +9 -0
- sky/check.py +1 -1
- sky/cli.py +253 -74
- sky/client/cli.py +253 -74
- sky/client/common.py +10 -3
- sky/client/sdk.py +11 -8
- sky/clouds/aws.py +2 -2
- sky/clouds/kubernetes.py +0 -8
- sky/clouds/oci.py +1 -1
- sky/core.py +17 -11
- sky/dashboard/out/404.html +1 -0
- sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
- sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
- sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
- sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
- sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
- sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
- sky/dashboard/out/clusters/[cluster].html +1 -0
- sky/dashboard/out/clusters.html +1 -0
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -0
- sky/dashboard/out/jobs.html +1 -0
- sky/dashboard/out/skypilot.svg +15 -0
- sky/dashboard/out/videos/cursor-small.mp4 +0 -0
- sky/data/data_transfer.py +2 -1
- sky/data/storage.py +24 -14
- sky/exceptions.py +5 -0
- sky/jobs/constants.py +8 -1
- sky/jobs/server/core.py +12 -8
- sky/models.py +28 -0
- sky/optimizer.py +7 -9
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/instance.py +16 -14
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +50 -22
- sky/provision/provisioner.py +2 -1
- sky/resources.py +56 -2
- sky/serve/__init__.py +2 -0
- sky/serve/autoscalers.py +6 -2
- sky/serve/client/sdk.py +61 -0
- sky/serve/constants.py +6 -0
- sky/serve/load_balancing_policies.py +0 -4
- sky/serve/replica_managers.py +6 -8
- sky/serve/serve_state.py +0 -6
- sky/serve/serve_utils.py +33 -1
- sky/serve/server/core.py +192 -7
- sky/serve/server/server.py +28 -0
- sky/server/common.py +152 -47
- sky/server/constants.py +7 -1
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +12 -15
- sky/server/requests/serializers/decoders.py +2 -5
- sky/server/requests/serializers/encoders.py +2 -5
- sky/server/server.py +44 -1
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +12 -2
- sky/skylet/constants.py +5 -7
- sky/skylet/job_lib.py +3 -3
- sky/skypilot_config.py +225 -84
- sky/templates/kubernetes-ray.yml.j2 +7 -3
- sky/utils/cli_utils/status_utils.py +12 -5
- sky/utils/config_utils.py +39 -15
- sky/utils/controller_utils.py +44 -7
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/gpu_labeler.py +99 -16
- sky/utils/schemas.py +24 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/cli.py
CHANGED
@@ -28,6 +28,7 @@ import datetime
|
|
28
28
|
import functools
|
29
29
|
import getpass
|
30
30
|
import os
|
31
|
+
import pathlib
|
31
32
|
import shlex
|
32
33
|
import shutil
|
33
34
|
import subprocess
|
@@ -54,6 +55,7 @@ from sky import jobs as managed_jobs
|
|
54
55
|
from sky import models
|
55
56
|
from sky import serve as serve_lib
|
56
57
|
from sky import sky_logging
|
58
|
+
from sky import skypilot_config
|
57
59
|
from sky.adaptors import common as adaptors_common
|
58
60
|
from sky.benchmark import benchmark_state
|
59
61
|
from sky.benchmark import benchmark_utils
|
@@ -278,6 +280,54 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
|
278
280
|
return list(env_dict.items())
|
279
281
|
|
280
282
|
|
283
|
+
def config_option(expose_value: bool):
|
284
|
+
"""A decorator for the --config option.
|
285
|
+
|
286
|
+
This decorator is used to parse the --config option.
|
287
|
+
|
288
|
+
Any overrides specified in the command line will be applied to the skypilot
|
289
|
+
config before the decorated function is called.
|
290
|
+
|
291
|
+
If expose_value is True, the decorated function will receive the parsed
|
292
|
+
config overrides as 'config_override' parameter.
|
293
|
+
|
294
|
+
Args:
|
295
|
+
expose_value: Whether to expose the value of the option to the decorated
|
296
|
+
function.
|
297
|
+
"""
|
298
|
+
|
299
|
+
def preprocess_config_options(ctx, param, value):
|
300
|
+
del ctx # Unused.
|
301
|
+
param.name = 'config_override'
|
302
|
+
try:
|
303
|
+
if len(value) == 0:
|
304
|
+
return None
|
305
|
+
elif len(value) > 1:
|
306
|
+
raise ValueError('argument specified multiple times. '
|
307
|
+
'To specify multiple configs, use '
|
308
|
+
'--config nested.key1=val1,another.key2=val2')
|
309
|
+
else:
|
310
|
+
# Apply the config overrides to the skypilot config.
|
311
|
+
return skypilot_config.apply_cli_config(value[0])
|
312
|
+
except ValueError as e:
|
313
|
+
raise click.BadParameter(f'{str(e)}') from e
|
314
|
+
|
315
|
+
def return_option_decorator(func):
|
316
|
+
return click.option(
|
317
|
+
'--config',
|
318
|
+
required=False,
|
319
|
+
type=str,
|
320
|
+
multiple=True,
|
321
|
+
expose_value=expose_value,
|
322
|
+
callback=preprocess_config_options,
|
323
|
+
help=('Path to a config file or a comma-separated '
|
324
|
+
'list of key-value pairs '
|
325
|
+
'(e.g. "nested.key1=val1,another.key2=val2").'),
|
326
|
+
)(func)
|
327
|
+
|
328
|
+
return return_option_decorator
|
329
|
+
|
330
|
+
|
281
331
|
_COMMON_OPTIONS = [
|
282
332
|
click.option('--async/--no-async',
|
283
333
|
'async_call',
|
@@ -630,7 +680,8 @@ def _parse_override_params(
|
|
630
680
|
image_id: Optional[str] = None,
|
631
681
|
disk_size: Optional[int] = None,
|
632
682
|
disk_tier: Optional[str] = None,
|
633
|
-
ports: Optional[Tuple[str, ...]] = None
|
683
|
+
ports: Optional[Tuple[str, ...]] = None,
|
684
|
+
config_override: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
634
685
|
"""Parses the override parameters into a dictionary."""
|
635
686
|
override_params: Dict[str, Any] = {}
|
636
687
|
if cloud is not None:
|
@@ -691,6 +742,8 @@ def _parse_override_params(
|
|
691
742
|
override_params['ports'] = None
|
692
743
|
else:
|
693
744
|
override_params['ports'] = ports
|
745
|
+
if config_override:
|
746
|
+
override_params['_cluster_config_overrides'] = config_override
|
694
747
|
return override_params
|
695
748
|
|
696
749
|
|
@@ -793,6 +846,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
793
846
|
field_to_ignore: Optional[List[str]] = None,
|
794
847
|
# job launch specific
|
795
848
|
job_recovery: Optional[str] = None,
|
849
|
+
config_override: Optional[Dict[str, Any]] = None,
|
796
850
|
) -> Union[sky.Task, sky.Dag]:
|
797
851
|
"""Creates a task or a dag from an entrypoint with overrides.
|
798
852
|
|
@@ -826,7 +880,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
826
880
|
image_id=image_id,
|
827
881
|
disk_size=disk_size,
|
828
882
|
disk_tier=disk_tier,
|
829
|
-
ports=ports
|
883
|
+
ports=ports,
|
884
|
+
config_override=config_override)
|
830
885
|
if field_to_ignore is not None:
|
831
886
|
_pop_and_ignore_fields_in_override_params(override_params,
|
832
887
|
field_to_ignore)
|
@@ -1010,6 +1065,7 @@ def cli():
|
|
1010
1065
|
|
1011
1066
|
|
1012
1067
|
@cli.command(cls=_DocumentedCodeCommand)
|
1068
|
+
@config_option(expose_value=True)
|
1013
1069
|
@click.argument('entrypoint',
|
1014
1070
|
required=False,
|
1015
1071
|
type=str,
|
@@ -1139,7 +1195,8 @@ def launch(
|
|
1139
1195
|
no_setup: bool,
|
1140
1196
|
clone_disk_from: Optional[str],
|
1141
1197
|
fast: bool,
|
1142
|
-
async_call: bool
|
1198
|
+
async_call: bool,
|
1199
|
+
config_override: Optional[Dict[str, Any]] = None):
|
1143
1200
|
"""Launch a cluster or task.
|
1144
1201
|
|
1145
1202
|
If ENTRYPOINT points to a valid YAML file, it is read in as the task
|
@@ -1181,6 +1238,7 @@ def launch(
|
|
1181
1238
|
disk_size=disk_size,
|
1182
1239
|
disk_tier=disk_tier,
|
1183
1240
|
ports=ports,
|
1241
|
+
config_override=config_override,
|
1184
1242
|
)
|
1185
1243
|
if isinstance(task_or_dag, sky.Dag):
|
1186
1244
|
raise click.UsageError(
|
@@ -1245,6 +1303,7 @@ def launch(
|
|
1245
1303
|
|
1246
1304
|
|
1247
1305
|
@cli.command(cls=_DocumentedCodeCommand)
|
1306
|
+
@config_option(expose_value=True)
|
1248
1307
|
@click.argument('cluster',
|
1249
1308
|
required=False,
|
1250
1309
|
type=str,
|
@@ -1273,15 +1332,29 @@ def launch(
|
|
1273
1332
|
_COMMON_OPTIONS)
|
1274
1333
|
@usage_lib.entrypoint
|
1275
1334
|
# pylint: disable=redefined-builtin
|
1276
|
-
def exec(cluster: Optional[str],
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1284
|
-
|
1335
|
+
def exec(cluster: Optional[str],
|
1336
|
+
cluster_option: Optional[str],
|
1337
|
+
entrypoint: Tuple[str, ...],
|
1338
|
+
detach_run: bool,
|
1339
|
+
name: Optional[str],
|
1340
|
+
cloud: Optional[str],
|
1341
|
+
region: Optional[str],
|
1342
|
+
zone: Optional[str],
|
1343
|
+
workdir: Optional[str],
|
1344
|
+
gpus: Optional[str],
|
1345
|
+
ports: Tuple[str],
|
1346
|
+
instance_type: Optional[str],
|
1347
|
+
num_nodes: Optional[int],
|
1348
|
+
use_spot: Optional[bool],
|
1349
|
+
image_id: Optional[str],
|
1350
|
+
env_file: Optional[Dict[str, str]],
|
1351
|
+
env: List[Tuple[str, str]],
|
1352
|
+
cpus: Optional[str],
|
1353
|
+
memory: Optional[str],
|
1354
|
+
disk_size: Optional[int],
|
1355
|
+
disk_tier: Optional[str],
|
1356
|
+
async_call: bool,
|
1357
|
+
config_override: Optional[Dict[str, Any]] = None):
|
1285
1358
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1286
1359
|
"""Execute a task or command on an existing cluster.
|
1287
1360
|
|
@@ -1374,6 +1447,7 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
|
|
1374
1447
|
disk_tier=disk_tier,
|
1375
1448
|
ports=ports,
|
1376
1449
|
field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'],
|
1450
|
+
config_override=config_override,
|
1377
1451
|
)
|
1378
1452
|
|
1379
1453
|
if isinstance(task_or_dag, sky.Dag):
|
@@ -1657,6 +1731,7 @@ def _show_endpoint(query_clusters: Optional[List[str]],
|
|
1657
1731
|
|
1658
1732
|
|
1659
1733
|
@cli.command()
|
1734
|
+
@config_option(expose_value=False)
|
1660
1735
|
@click.option('--verbose',
|
1661
1736
|
'-v',
|
1662
1737
|
default=False,
|
@@ -1949,6 +2024,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1949
2024
|
|
1950
2025
|
|
1951
2026
|
@cli.command()
|
2027
|
+
@config_option(expose_value=False)
|
1952
2028
|
@click.option('--all',
|
1953
2029
|
'-a',
|
1954
2030
|
default=False,
|
@@ -2019,6 +2095,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
2019
2095
|
|
2020
2096
|
|
2021
2097
|
@cli.command()
|
2098
|
+
@config_option(expose_value=False)
|
2022
2099
|
@click.option('--all-users',
|
2023
2100
|
'-u',
|
2024
2101
|
default=False,
|
@@ -2080,6 +2157,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
2080
2157
|
|
2081
2158
|
|
2082
2159
|
@cli.command()
|
2160
|
+
@config_option(expose_value=False)
|
2083
2161
|
@click.option(
|
2084
2162
|
'--sync-down',
|
2085
2163
|
'-s',
|
@@ -2217,6 +2295,7 @@ def logs(
|
|
2217
2295
|
|
2218
2296
|
|
2219
2297
|
@cli.command()
|
2298
|
+
@config_option(expose_value=False)
|
2220
2299
|
@click.argument('cluster',
|
2221
2300
|
required=True,
|
2222
2301
|
type=str,
|
@@ -2320,6 +2399,7 @@ def cancel(
|
|
2320
2399
|
|
2321
2400
|
|
2322
2401
|
@cli.command(cls=_DocumentedCodeCommand)
|
2402
|
+
@config_option(expose_value=False)
|
2323
2403
|
@click.argument('clusters',
|
2324
2404
|
nargs=-1,
|
2325
2405
|
required=False,
|
@@ -2387,6 +2467,7 @@ def stop(
|
|
2387
2467
|
|
2388
2468
|
|
2389
2469
|
@cli.command(cls=_DocumentedCodeCommand)
|
2470
|
+
@config_option(expose_value=False)
|
2390
2471
|
@click.argument('clusters',
|
2391
2472
|
nargs=-1,
|
2392
2473
|
required=False,
|
@@ -2499,6 +2580,7 @@ def autostop(
|
|
2499
2580
|
|
2500
2581
|
|
2501
2582
|
@cli.command(cls=_DocumentedCodeCommand)
|
2583
|
+
@config_option(expose_value=False)
|
2502
2584
|
@click.argument('clusters',
|
2503
2585
|
nargs=-1,
|
2504
2586
|
required=False,
|
@@ -2744,6 +2826,7 @@ def start(
|
|
2744
2826
|
|
2745
2827
|
|
2746
2828
|
@cli.command(cls=_DocumentedCodeCommand)
|
2829
|
+
@config_option(expose_value=False)
|
2747
2830
|
@click.argument('clusters',
|
2748
2831
|
nargs=-1,
|
2749
2832
|
required=False,
|
@@ -3182,6 +3265,7 @@ def _down_or_stop_clusters(
|
|
3182
3265
|
|
3183
3266
|
|
3184
3267
|
@cli.command(cls=_DocumentedCodeCommand)
|
3268
|
+
@config_option(expose_value=False)
|
3185
3269
|
@click.argument('clouds', required=False, type=str, nargs=-1)
|
3186
3270
|
@click.option('--verbose',
|
3187
3271
|
'-v',
|
@@ -3222,6 +3306,7 @@ def check(clouds: Tuple[str], verbose: bool):
|
|
3222
3306
|
|
3223
3307
|
|
3224
3308
|
@cli.command()
|
3309
|
+
@config_option(expose_value=False)
|
3225
3310
|
@click.argument('accelerator_str', required=False)
|
3226
3311
|
@click.option('--all',
|
3227
3312
|
'-a',
|
@@ -3379,15 +3464,14 @@ def show_gpus(
|
|
3379
3464
|
])
|
3380
3465
|
return realtime_gpu_table
|
3381
3466
|
|
3382
|
-
|
3383
|
-
def _get_kubernetes_node_info_table(context: Optional[str]):
|
3467
|
+
def _format_kubernetes_node_info(context: Optional[str]):
|
3384
3468
|
node_table = log_utils.create_table(
|
3385
3469
|
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
|
3386
3470
|
|
3387
|
-
|
3388
|
-
node_info_dict = sdk.stream_and_get(
|
3471
|
+
nodes_info = sdk.stream_and_get(
|
3389
3472
|
sdk.kubernetes_node_info(context=context))
|
3390
|
-
|
3473
|
+
no_permissions_str = '<no permissions>'
|
3474
|
+
for node_name, node_info in nodes_info.node_info_dict.items():
|
3391
3475
|
available = node_info.free[
|
3392
3476
|
'accelerators_available'] if node_info.free[
|
3393
3477
|
'accelerators_available'] != -1 else no_permissions_str
|
@@ -3395,7 +3479,14 @@ def show_gpus(
|
|
3395
3479
|
node_name, node_info.accelerator_type,
|
3396
3480
|
node_info.total['accelerator_count'], available
|
3397
3481
|
])
|
3398
|
-
|
3482
|
+
k8s_per_node_acc_message = (
|
3483
|
+
'Kubernetes per node accelerator availability ')
|
3484
|
+
if nodes_info.hint:
|
3485
|
+
k8s_per_node_acc_message += nodes_info.hint
|
3486
|
+
return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3487
|
+
f'{k8s_per_node_acc_message}'
|
3488
|
+
f'{colorama.Style.RESET_ALL}\n'
|
3489
|
+
f'{node_table.get_string()}')
|
3399
3490
|
|
3400
3491
|
def _output() -> Generator[str, None, None]:
|
3401
3492
|
gpu_table = log_utils.create_table(
|
@@ -3443,22 +3534,8 @@ def show_gpus(
|
|
3443
3534
|
f'Kubernetes GPUs {context_str}'
|
3444
3535
|
f'{colorama.Style.RESET_ALL}\n')
|
3445
3536
|
yield from k8s_realtime_table.get_string()
|
3446
|
-
k8s_node_table = _get_kubernetes_node_info_table(context)
|
3447
3537
|
yield '\n\n'
|
3448
|
-
|
3449
|
-
# support.
|
3450
|
-
k8s_per_node_acc_message = (
|
3451
|
-
'Kubernetes per node accelerator availability ')
|
3452
|
-
if kubernetes_utils.multi_host_tpu_exists_in_cluster(
|
3453
|
-
context):
|
3454
|
-
k8s_per_node_acc_message += (
|
3455
|
-
'(Note: Multi-host TPUs are detected and excluded '
|
3456
|
-
'from the display as multi-host TPUs are not '
|
3457
|
-
'supported.)')
|
3458
|
-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3459
|
-
f'{k8s_per_node_acc_message}'
|
3460
|
-
f'{colorama.Style.RESET_ALL}\n')
|
3461
|
-
yield from k8s_node_table.get_string()
|
3538
|
+
yield _format_kubernetes_node_info(context)
|
3462
3539
|
if kubernetes_autoscaling:
|
3463
3540
|
k8s_messages += (
|
3464
3541
|
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
@@ -3693,6 +3770,7 @@ def storage():
|
|
3693
3770
|
|
3694
3771
|
|
3695
3772
|
@storage.command('ls', cls=_DocumentedCodeCommand)
|
3773
|
+
@config_option(expose_value=False)
|
3696
3774
|
@click.option('--verbose',
|
3697
3775
|
'-v',
|
3698
3776
|
default=False,
|
@@ -3711,6 +3789,7 @@ def storage_ls(verbose: bool):
|
|
3711
3789
|
|
3712
3790
|
|
3713
3791
|
@storage.command('delete', cls=_DocumentedCodeCommand)
|
3792
|
+
@config_option(expose_value=False)
|
3714
3793
|
@click.argument('names',
|
3715
3794
|
required=False,
|
3716
3795
|
type=str,
|
@@ -3795,6 +3874,7 @@ def jobs():
|
|
3795
3874
|
|
3796
3875
|
|
3797
3876
|
@jobs.command('launch', cls=_DocumentedCodeCommand)
|
3877
|
+
@config_option(expose_value=True)
|
3798
3878
|
@click.argument('entrypoint',
|
3799
3879
|
required=True,
|
3800
3880
|
type=str,
|
@@ -3852,6 +3932,7 @@ def jobs_launch(
|
|
3852
3932
|
detach_run: bool,
|
3853
3933
|
yes: bool,
|
3854
3934
|
async_call: bool,
|
3935
|
+
config_override: Optional[Dict[str, Any]] = None,
|
3855
3936
|
):
|
3856
3937
|
"""Launch a managed job from a YAML or a command.
|
3857
3938
|
|
@@ -3892,6 +3973,7 @@ def jobs_launch(
|
|
3892
3973
|
disk_tier=disk_tier,
|
3893
3974
|
ports=ports,
|
3894
3975
|
job_recovery=job_recovery,
|
3976
|
+
config_override=config_override,
|
3895
3977
|
)
|
3896
3978
|
|
3897
3979
|
if not isinstance(task_or_dag, sky.Dag):
|
@@ -3929,6 +4011,7 @@ def jobs_launch(
|
|
3929
4011
|
|
3930
4012
|
|
3931
4013
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
4014
|
+
@config_option(expose_value=False)
|
3932
4015
|
@click.option('--verbose',
|
3933
4016
|
'-v',
|
3934
4017
|
default=False,
|
@@ -4045,6 +4128,7 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
|
4045
4128
|
|
4046
4129
|
|
4047
4130
|
@jobs.command('cancel', cls=_DocumentedCodeCommand)
|
4131
|
+
@config_option(expose_value=False)
|
4048
4132
|
@click.option('--name',
|
4049
4133
|
'-n',
|
4050
4134
|
required=False,
|
@@ -4119,6 +4203,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
|
4119
4203
|
|
4120
4204
|
|
4121
4205
|
@jobs.command('logs', cls=_DocumentedCodeCommand)
|
4206
|
+
@config_option(expose_value=False)
|
4122
4207
|
@click.option('--name',
|
4123
4208
|
'-n',
|
4124
4209
|
required=False,
|
@@ -4183,6 +4268,7 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
4183
4268
|
|
4184
4269
|
|
4185
4270
|
@jobs.command('dashboard', cls=_DocumentedCodeCommand)
|
4271
|
+
@config_option(expose_value=False)
|
4186
4272
|
@usage_lib.entrypoint
|
4187
4273
|
def jobs_dashboard():
|
4188
4274
|
"""Opens a dashboard for managed jobs."""
|
@@ -4312,6 +4398,7 @@ def _generate_task_with_service(
|
|
4312
4398
|
|
4313
4399
|
|
4314
4400
|
@serve.command('up', cls=_DocumentedCodeCommand)
|
4401
|
+
@config_option(expose_value=False)
|
4315
4402
|
@click.argument('service_yaml',
|
4316
4403
|
required=True,
|
4317
4404
|
type=str,
|
@@ -4423,6 +4510,7 @@ def serve_up(
|
|
4423
4510
|
# TODO(MaoZiming): Expose mix replica traffic option to user.
|
4424
4511
|
# Currently, we do not mix traffic from old and new replicas.
|
4425
4512
|
@serve.command('update', cls=_DocumentedCodeCommand)
|
4513
|
+
@config_option(expose_value=False)
|
4426
4514
|
@click.argument('service_name', required=True, type=str)
|
4427
4515
|
@click.argument('service_yaml',
|
4428
4516
|
required=True,
|
@@ -4523,6 +4611,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
|
|
4523
4611
|
|
4524
4612
|
|
4525
4613
|
@serve.command('status', cls=_DocumentedCodeCommand)
|
4614
|
+
@config_option(expose_value=False)
|
4526
4615
|
@click.option('--verbose',
|
4527
4616
|
'-v',
|
4528
4617
|
default=False,
|
@@ -4648,6 +4737,7 @@ def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
|
|
4648
4737
|
|
4649
4738
|
|
4650
4739
|
@serve.command('down', cls=_DocumentedCodeCommand)
|
4740
|
+
@config_option(expose_value=False)
|
4651
4741
|
@click.argument('service_names', required=False, type=str, nargs=-1)
|
4652
4742
|
@click.option('--all',
|
4653
4743
|
'-a',
|
@@ -4761,6 +4851,7 @@ def serve_down(
|
|
4761
4851
|
|
4762
4852
|
|
4763
4853
|
@serve.command('logs', cls=_DocumentedCodeCommand)
|
4854
|
+
@config_option(expose_value=False)
|
4764
4855
|
@click.option(
|
4765
4856
|
'--follow/--no-follow',
|
4766
4857
|
is_flag=True,
|
@@ -4777,8 +4868,14 @@ def serve_down(
|
|
4777
4868
|
default=False,
|
4778
4869
|
required=False,
|
4779
4870
|
help='Show the load balancer logs of this service.')
|
4871
|
+
@click.option('--sync-down',
|
4872
|
+
'-s',
|
4873
|
+
is_flag=True,
|
4874
|
+
default=False,
|
4875
|
+
help='Sync down logs to the local machine. Can be combined with '
|
4876
|
+
'--controller, --load-balancer, or a replica ID to narrow scope.')
|
4780
4877
|
@click.argument('service_name', required=True, type=str)
|
4781
|
-
@click.argument('
|
4878
|
+
@click.argument('replica_ids', required=False, type=int, nargs=-1)
|
4782
4879
|
@usage_lib.entrypoint
|
4783
4880
|
# TODO(tian): Add default argument for this CLI if none of the flags are
|
4784
4881
|
# specified.
|
@@ -4787,9 +4884,13 @@ def serve_logs(
|
|
4787
4884
|
follow: bool,
|
4788
4885
|
controller: bool,
|
4789
4886
|
load_balancer: bool,
|
4790
|
-
|
4887
|
+
replica_ids: Tuple[int, ...],
|
4888
|
+
sync_down: bool,
|
4791
4889
|
):
|
4792
|
-
"""Tail
|
4890
|
+
"""Tail or sync down logs of a service.
|
4891
|
+
|
4892
|
+
Logs can be tailed from one target (controller, load balancer, or a single
|
4893
|
+
replica) or synced down from multiple targets simultaneously.
|
4793
4894
|
|
4794
4895
|
Example:
|
4795
4896
|
|
@@ -4803,27 +4904,89 @@ def serve_logs(
|
|
4803
4904
|
\b
|
4804
4905
|
# Tail the logs of replica 1
|
4805
4906
|
sky serve logs [SERVICE_NAME] 1
|
4907
|
+
\b
|
4908
|
+
# Sync down all logs of the service (controller, LB, all replicas)
|
4909
|
+
sky serve logs [SERVICE_NAME] --sync-down
|
4910
|
+
\b
|
4911
|
+
# Sync down controller logs and logs for replicas 1 and 3
|
4912
|
+
sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
|
4806
4913
|
"""
|
4807
|
-
|
4808
|
-
num_flags = (controller + load_balancer + have_replica_id)
|
4809
|
-
if num_flags > 1:
|
4810
|
-
raise click.UsageError('At most one of --controller, --load-balancer, '
|
4811
|
-
'[REPLICA_ID] can be specified.')
|
4812
|
-
if num_flags == 0:
|
4813
|
-
raise click.UsageError('One of --controller, --load-balancer, '
|
4814
|
-
'[REPLICA_ID] must be specified.')
|
4914
|
+
chosen_components: Set[serve_lib.ServiceComponent] = set()
|
4815
4915
|
if controller:
|
4816
|
-
|
4817
|
-
|
4818
|
-
|
4819
|
-
|
4820
|
-
|
4821
|
-
|
4822
|
-
|
4916
|
+
chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
|
4917
|
+
if load_balancer:
|
4918
|
+
chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
|
4919
|
+
# replica_ids contains the specific replica IDs provided by the user.
|
4920
|
+
# If it's not empty, it implies the user wants replica logs.
|
4921
|
+
if replica_ids:
|
4922
|
+
chosen_components.add(serve_lib.ServiceComponent.REPLICA)
|
4923
|
+
|
4924
|
+
if sync_down:
|
4925
|
+
# For sync-down, multiple targets are allowed.
|
4926
|
+
# If no specific components/replicas are mentioned, sync all.
|
4927
|
+
# Note: Multiple replicas or targets can only be specified when
|
4928
|
+
# using --sync-down.
|
4929
|
+
targets_to_sync = list(chosen_components)
|
4930
|
+
if not targets_to_sync and not replica_ids:
|
4931
|
+
# Default to all components if nothing specific is requested
|
4932
|
+
targets_to_sync = [
|
4933
|
+
serve_lib.ServiceComponent.CONTROLLER,
|
4934
|
+
serve_lib.ServiceComponent.LOAD_BALANCER,
|
4935
|
+
serve_lib.ServiceComponent.REPLICA,
|
4936
|
+
]
|
4937
|
+
|
4938
|
+
timestamp = sky_logging.get_run_timestamp()
|
4939
|
+
log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
|
4940
|
+
f'{service_name}_{timestamp}').expanduser()
|
4941
|
+
log_dir.mkdir(parents=True, exist_ok=True)
|
4942
|
+
|
4943
|
+
with rich_utils.client_status(
|
4944
|
+
ux_utils.spinner_message('Downloading service logs...')):
|
4945
|
+
serve_lib.sync_down_logs(service_name,
|
4946
|
+
local_dir=str(log_dir),
|
4947
|
+
targets=targets_to_sync,
|
4948
|
+
replica_ids=list(replica_ids))
|
4949
|
+
style = colorama.Style
|
4950
|
+
fore = colorama.Fore
|
4951
|
+
logger.info(f'{fore.CYAN}Service {service_name} logs: '
|
4952
|
+
f'{log_dir}{style.RESET_ALL}')
|
4953
|
+
return
|
4954
|
+
|
4955
|
+
# Tailing requires exactly one target.
|
4956
|
+
num_targets = len(chosen_components)
|
4957
|
+
# If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
|
4958
|
+
if serve_lib.ServiceComponent.REPLICA in chosen_components:
|
4959
|
+
if len(replica_ids) != 1:
|
4960
|
+
raise click.UsageError(
|
4961
|
+
'Can only tail logs from a single replica at a time. '
|
4962
|
+
'Provide exactly one REPLICA_ID or use --sync-down '
|
4963
|
+
'to download logs from multiple replicas.')
|
4964
|
+
# If replica is chosen and len is 1, num_targets effectively counts it.
|
4965
|
+
# We need to ensure no other component (controller/LB) is selected.
|
4966
|
+
if num_targets > 1:
|
4967
|
+
raise click.UsageError(
|
4968
|
+
'Can only tail logs from one target at a time (controller, '
|
4969
|
+
'load balancer, or a single replica). Use --sync-down '
|
4970
|
+
'to download logs from multiple sources.')
|
4971
|
+
elif num_targets == 0:
|
4972
|
+
raise click.UsageError(
|
4973
|
+
'Specify a target to tail: --controller, --load-balancer, or '
|
4974
|
+
'a REPLICA_ID.')
|
4975
|
+
elif num_targets > 1:
|
4976
|
+
raise click.UsageError(
|
4977
|
+
'Can only tail logs from one target at a time. Use --sync-down '
|
4978
|
+
'to download logs from multiple sources.')
|
4979
|
+
|
4980
|
+
# At this point, we have exactly one target for tailing.
|
4981
|
+
assert len(chosen_components) == 1
|
4982
|
+
assert len(replica_ids) in [0, 1]
|
4983
|
+
target_component = chosen_components.pop()
|
4984
|
+
target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
|
4985
|
+
|
4823
4986
|
try:
|
4824
4987
|
serve_lib.tail_logs(service_name,
|
4825
4988
|
target=target_component,
|
4826
|
-
replica_id=
|
4989
|
+
replica_id=target_replica_id,
|
4827
4990
|
follow=follow)
|
4828
4991
|
except exceptions.ClusterNotUpError:
|
4829
4992
|
with ux_utils.print_exception_no_traceback():
|
@@ -4874,6 +5037,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4874
5037
|
|
4875
5038
|
|
4876
5039
|
@bench.command('launch', cls=_DocumentedCodeCommand)
|
5040
|
+
@config_option(expose_value=True)
|
4877
5041
|
@click.argument('entrypoint',
|
4878
5042
|
required=True,
|
4879
5043
|
type=str,
|
@@ -4919,27 +5083,28 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4919
5083
|
help='Skip confirmation prompt.')
|
4920
5084
|
@usage_lib.entrypoint
|
4921
5085
|
def benchmark_launch(
|
4922
|
-
|
4923
|
-
|
4924
|
-
|
4925
|
-
|
4926
|
-
|
4927
|
-
|
4928
|
-
|
4929
|
-
|
4930
|
-
|
4931
|
-
|
4932
|
-
|
4933
|
-
|
4934
|
-
|
4935
|
-
|
4936
|
-
|
4937
|
-
|
4938
|
-
|
4939
|
-
|
4940
|
-
|
4941
|
-
|
4942
|
-
|
5086
|
+
entrypoint: str,
|
5087
|
+
benchmark: str,
|
5088
|
+
name: Optional[str],
|
5089
|
+
workdir: Optional[str],
|
5090
|
+
cloud: Optional[str],
|
5091
|
+
region: Optional[str],
|
5092
|
+
zone: Optional[str],
|
5093
|
+
gpus: Optional[str],
|
5094
|
+
num_nodes: Optional[int],
|
5095
|
+
use_spot: Optional[bool],
|
5096
|
+
image_id: Optional[str],
|
5097
|
+
env_file: Optional[Dict[str, str]],
|
5098
|
+
env: List[Tuple[str, str]],
|
5099
|
+
cpus: Optional[str],
|
5100
|
+
memory: Optional[str],
|
5101
|
+
disk_size: Optional[int],
|
5102
|
+
disk_tier: Optional[str],
|
5103
|
+
ports: Tuple[str],
|
5104
|
+
idle_minutes_to_autostop: Optional[int],
|
5105
|
+
yes: bool,
|
5106
|
+
async_call: bool, # pylint: disable=unused-argument
|
5107
|
+
config_override: Optional[Dict[str, Any]] = None,
|
4943
5108
|
) -> None:
|
4944
5109
|
"""Benchmark a task on different resources.
|
4945
5110
|
|
@@ -5048,7 +5213,8 @@ def benchmark_launch(
|
|
5048
5213
|
image_id=image_id,
|
5049
5214
|
disk_size=disk_size,
|
5050
5215
|
disk_tier=disk_tier,
|
5051
|
-
ports=ports
|
5216
|
+
ports=ports,
|
5217
|
+
config_override=config_override)
|
5052
5218
|
_pop_and_ignore_fields_in_override_params(
|
5053
5219
|
override_params, field_to_ignore=['cpus', 'memory'])
|
5054
5220
|
resources_config.update(override_params)
|
@@ -5113,6 +5279,7 @@ def benchmark_launch(
|
|
5113
5279
|
|
5114
5280
|
|
5115
5281
|
@bench.command('ls', cls=_DocumentedCodeCommand)
|
5282
|
+
@config_option(expose_value=False)
|
5116
5283
|
@usage_lib.entrypoint
|
5117
5284
|
def benchmark_ls() -> None:
|
5118
5285
|
"""List the benchmark history."""
|
@@ -5176,6 +5343,7 @@ def benchmark_ls() -> None:
|
|
5176
5343
|
|
5177
5344
|
|
5178
5345
|
@bench.command('show', cls=_DocumentedCodeCommand)
|
5346
|
+
@config_option(expose_value=False)
|
5179
5347
|
@click.argument('benchmark', required=True, type=str)
|
5180
5348
|
# TODO(woosuk): Add --all option to show all the collected information
|
5181
5349
|
# (e.g., setup time, warmup steps, total steps, etc.).
|
@@ -5301,6 +5469,7 @@ def benchmark_show(benchmark: str) -> None:
|
|
5301
5469
|
|
5302
5470
|
|
5303
5471
|
@bench.command('down', cls=_DocumentedCodeCommand)
|
5472
|
+
@config_option(expose_value=False)
|
5304
5473
|
@click.argument('benchmark', required=True, type=str)
|
5305
5474
|
@click.option(
|
5306
5475
|
'--exclude',
|
@@ -5343,6 +5512,7 @@ def benchmark_down(
|
|
5343
5512
|
|
5344
5513
|
|
5345
5514
|
@bench.command('delete', cls=_DocumentedCodeCommand)
|
5515
|
+
@config_option(expose_value=False)
|
5346
5516
|
@click.argument('benchmarks', required=False, type=str, nargs=-1)
|
5347
5517
|
@click.option('--all',
|
5348
5518
|
'-a',
|
@@ -5477,6 +5647,7 @@ def local():
|
|
5477
5647
|
help='Password for the ssh-user to execute sudo commands. '
|
5478
5648
|
'Required only if passwordless sudo is not setup.')
|
5479
5649
|
@local.command('up', cls=_DocumentedCodeCommand)
|
5650
|
+
@config_option(expose_value=False)
|
5480
5651
|
@_add_click_options(_COMMON_OPTIONS)
|
5481
5652
|
@usage_lib.entrypoint
|
5482
5653
|
def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
@@ -5532,6 +5703,7 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
|
5532
5703
|
|
5533
5704
|
|
5534
5705
|
@local.command('down', cls=_DocumentedCodeCommand)
|
5706
|
+
@config_option(expose_value=False)
|
5535
5707
|
@_add_click_options(_COMMON_OPTIONS)
|
5536
5708
|
@usage_lib.entrypoint
|
5537
5709
|
def local_down(async_call: bool):
|
@@ -5547,6 +5719,7 @@ def api():
|
|
5547
5719
|
|
5548
5720
|
|
5549
5721
|
@api.command('start', cls=_DocumentedCodeCommand)
|
5722
|
+
@config_option(expose_value=False)
|
5550
5723
|
@click.option('--deploy',
|
5551
5724
|
type=bool,
|
5552
5725
|
is_flag=True,
|
@@ -5579,6 +5752,7 @@ def api_start(deploy: bool, host: Optional[str], foreground: bool):
|
|
5579
5752
|
|
5580
5753
|
|
5581
5754
|
@api.command('stop', cls=_DocumentedCodeCommand)
|
5755
|
+
@config_option(expose_value=False)
|
5582
5756
|
@usage_lib.entrypoint
|
5583
5757
|
def api_stop():
|
5584
5758
|
"""Stops the SkyPilot API server locally."""
|
@@ -5586,6 +5760,7 @@ def api_stop():
|
|
5586
5760
|
|
5587
5761
|
|
5588
5762
|
@api.command('logs', cls=_DocumentedCodeCommand)
|
5763
|
+
@config_option(expose_value=False)
|
5589
5764
|
@click.argument('request_id', required=False, type=str)
|
5590
5765
|
@click.option('--server-logs',
|
5591
5766
|
is_flag=True,
|
@@ -5625,6 +5800,7 @@ def api_logs(request_id: Optional[str], server_logs: bool,
|
|
5625
5800
|
|
5626
5801
|
|
5627
5802
|
@api.command('cancel', cls=_DocumentedCodeCommand)
|
5803
|
+
@config_option(expose_value=False)
|
5628
5804
|
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5629
5805
|
@click.option('--all',
|
5630
5806
|
'-a',
|
@@ -5666,6 +5842,7 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
|
|
5666
5842
|
|
5667
5843
|
|
5668
5844
|
@api.command('status', cls=_DocumentedCodeCommand)
|
5845
|
+
@config_option(expose_value=False)
|
5669
5846
|
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5670
5847
|
@click.option('--all-status',
|
5671
5848
|
'-a',
|
@@ -5709,6 +5886,7 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
|
|
5709
5886
|
|
5710
5887
|
|
5711
5888
|
@api.command('login', cls=_DocumentedCodeCommand)
|
5889
|
+
@config_option(expose_value=False)
|
5712
5890
|
@click.option('--endpoint',
|
5713
5891
|
'-e',
|
5714
5892
|
required=False,
|
@@ -5720,6 +5898,7 @@ def api_login(endpoint: Optional[str]):
|
|
5720
5898
|
|
5721
5899
|
|
5722
5900
|
@api.command('info', cls=_DocumentedCodeCommand)
|
5901
|
+
@config_option(expose_value=False)
|
5723
5902
|
@usage_lib.entrypoint
|
5724
5903
|
def api_info():
|
5725
5904
|
"""Shows the SkyPilot API server URL."""
|