skypilot-nightly 1.0.0.dev20250417__py3-none-any.whl → 1.0.0.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +2 -13
- sky/backends/backend_utils.py +28 -0
- sky/backends/wheel_utils.py +9 -0
- sky/cli.py +93 -24
- sky/client/cli.py +93 -24
- sky/client/common.py +10 -3
- sky/client/sdk.py +6 -3
- sky/clouds/aws.py +5 -5
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +9 -9
- sky/dashboard/out/404.html +1 -0
- sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_ssgManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
- sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
- sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
- sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
- sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
- sky/dashboard/out/clusters/[cluster].html +1 -0
- sky/dashboard/out/clusters.html +1 -0
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -0
- sky/dashboard/out/jobs.html +1 -0
- sky/dashboard/out/skypilot.svg +15 -0
- sky/dashboard/out/videos/cursor-small.mp4 +0 -0
- sky/data/data_transfer.py +2 -1
- sky/data/storage.py +24 -14
- sky/optimizer.py +7 -9
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/utils.py +32 -6
- sky/resources.py +11 -2
- sky/serve/__init__.py +2 -0
- sky/serve/autoscalers.py +6 -2
- sky/serve/client/sdk.py +61 -0
- sky/serve/replica_managers.py +6 -8
- sky/serve/serve_utils.py +33 -1
- sky/serve/server/core.py +187 -5
- sky/serve/server/server.py +28 -0
- sky/server/common.py +19 -1
- sky/server/constants.py +6 -0
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +27 -15
- sky/server/server.py +43 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/sky_logging.py +10 -0
- sky/skypilot_config.py +58 -37
- sky/templates/kubernetes-ray.yml.j2 +6 -2
- sky/utils/config_utils.py +0 -1
- sky/utils/controller_utils.py +0 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/RECORD +73 -40
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '69aebc34b3963a1a8d4026e68b3cffd86347c1e2'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250422'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/aws.py
CHANGED
@@ -28,7 +28,6 @@ This is informed by the following boto3 docs:
|
|
28
28
|
|
29
29
|
# pylint: disable=import-outside-toplevel
|
30
30
|
|
31
|
-
import functools
|
32
31
|
import logging
|
33
32
|
import threading
|
34
33
|
import time
|
@@ -60,23 +59,13 @@ class _ThreadLocalLRUCache(threading.local):
|
|
60
59
|
|
61
60
|
def __init__(self, maxsize=32):
|
62
61
|
super().__init__()
|
63
|
-
self.cache = annotations.lru_cache(scope='
|
62
|
+
self.cache = annotations.lru_cache(scope='request', maxsize=maxsize)
|
64
63
|
|
65
64
|
|
66
65
|
def _thread_local_lru_cache(maxsize=32):
|
67
66
|
# Create thread-local storage for the LRU cache
|
68
67
|
local_cache = _ThreadLocalLRUCache(maxsize)
|
69
|
-
|
70
|
-
def decorator(func):
|
71
|
-
|
72
|
-
@functools.wraps(func)
|
73
|
-
def wrapper(*args, **kwargs):
|
74
|
-
# Use the thread-local LRU cache
|
75
|
-
return local_cache.cache(func)(*args, **kwargs)
|
76
|
-
|
77
|
-
return wrapper
|
78
|
-
|
79
|
-
return decorator
|
68
|
+
return local_cache.cache
|
80
69
|
|
81
70
|
|
82
71
|
def _assert_kwargs_builtin_type(kwargs):
|
sky/backends/backend_utils.py
CHANGED
@@ -2582,11 +2582,36 @@ def get_clusters(
|
|
2582
2582
|
logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
|
2583
2583
|
records = new_records
|
2584
2584
|
|
2585
|
+
def _update_record_with_resources(record: Optional[Dict[str, Any]]) -> None:
|
2586
|
+
"""Add the resources to the record."""
|
2587
|
+
if record is None:
|
2588
|
+
return
|
2589
|
+
handle = record['handle']
|
2590
|
+
if handle is None:
|
2591
|
+
return
|
2592
|
+
record['nodes'] = handle.launched_nodes
|
2593
|
+
if handle.launched_resources is None:
|
2594
|
+
return
|
2595
|
+
record['cloud'] = (f'{handle.launched_resources.cloud}'
|
2596
|
+
if handle.launched_resources.cloud else None)
|
2597
|
+
record['region'] = (f'{handle.launched_resources.region}'
|
2598
|
+
if handle.launched_resources.region else None)
|
2599
|
+
record['cpus'] = (f'{handle.launched_resources.cpus}'
|
2600
|
+
if handle.launched_resources.cpus else None)
|
2601
|
+
record['memory'] = (f'{handle.launched_resources.memory}'
|
2602
|
+
if handle.launched_resources.memory else None)
|
2603
|
+
record['accelerators'] = (f'{handle.launched_resources.accelerators}'
|
2604
|
+
if handle.launched_resources.accelerators else
|
2605
|
+
None)
|
2606
|
+
|
2585
2607
|
# Add auth_config to the records
|
2586
2608
|
for record in records:
|
2587
2609
|
_update_record_with_credentials_and_resources_str(record)
|
2588
2610
|
|
2589
2611
|
if refresh == common.StatusRefreshMode.NONE:
|
2612
|
+
# Add resources to the records
|
2613
|
+
for record in records:
|
2614
|
+
_update_record_with_resources(record)
|
2590
2615
|
return records
|
2591
2616
|
|
2592
2617
|
plural = 's' if len(records) > 1 else ''
|
@@ -2662,6 +2687,9 @@ def get_clusters(
|
|
2662
2687
|
for cluster_name, e in failed_clusters:
|
2663
2688
|
logger.warning(f' {bright}{cluster_name}{reset}: {e}')
|
2664
2689
|
|
2690
|
+
# Add resources to the records
|
2691
|
+
for record in kept_records:
|
2692
|
+
_update_record_with_resources(record)
|
2665
2693
|
return kept_records
|
2666
2694
|
|
2667
2695
|
|
sky/backends/wheel_utils.py
CHANGED
@@ -85,6 +85,15 @@ def _build_sky_wheel() -> pathlib.Path:
|
|
85
85
|
for f in setup_files_dir.iterdir():
|
86
86
|
if f.is_file() and f.name != 'setup.py':
|
87
87
|
shutil.copy(str(f), str(tmp_dir))
|
88
|
+
if f.name == 'MANIFEST.in':
|
89
|
+
# Remove the line `sky/dashboard/out`, so we do not
|
90
|
+
# include the dashboard files in the internal wheel
|
91
|
+
import fileinput # pylint: disable=import-outside-toplevel
|
92
|
+
with fileinput.input(tmp_dir / f.name,
|
93
|
+
inplace=True) as file:
|
94
|
+
for line in file:
|
95
|
+
if 'sky/dashboard/out' not in line:
|
96
|
+
print(line, end='')
|
88
97
|
|
89
98
|
init_file_path = SKY_PACKAGE_PATH / '__init__.py'
|
90
99
|
init_file_content = init_file_path.read_text()
|
sky/cli.py
CHANGED
@@ -28,6 +28,7 @@ import datetime
|
|
28
28
|
import functools
|
29
29
|
import getpass
|
30
30
|
import os
|
31
|
+
import pathlib
|
31
32
|
import shlex
|
32
33
|
import shutil
|
33
34
|
import subprocess
|
@@ -301,13 +302,9 @@ def config_option(expose_value: bool):
|
|
301
302
|
try:
|
302
303
|
if len(value) == 0:
|
303
304
|
return None
|
304
|
-
elif len(value) > 1:
|
305
|
-
raise ValueError('argument specified multiple times. '
|
306
|
-
'To specify multiple configs, use '
|
307
|
-
'--config nested.key1=val1,another.key2=val2')
|
308
305
|
else:
|
309
306
|
# Apply the config overrides to the skypilot config.
|
310
|
-
return skypilot_config.apply_cli_config(value
|
307
|
+
return skypilot_config.apply_cli_config(value)
|
311
308
|
except ValueError as e:
|
312
309
|
raise click.BadParameter(f'{str(e)}') from e
|
313
310
|
|
@@ -4867,8 +4864,14 @@ def serve_down(
|
|
4867
4864
|
default=False,
|
4868
4865
|
required=False,
|
4869
4866
|
help='Show the load balancer logs of this service.')
|
4867
|
+
@click.option('--sync-down',
|
4868
|
+
'-s',
|
4869
|
+
is_flag=True,
|
4870
|
+
default=False,
|
4871
|
+
help='Sync down logs to the local machine. Can be combined with '
|
4872
|
+
'--controller, --load-balancer, or a replica ID to narrow scope.')
|
4870
4873
|
@click.argument('service_name', required=True, type=str)
|
4871
|
-
@click.argument('
|
4874
|
+
@click.argument('replica_ids', required=False, type=int, nargs=-1)
|
4872
4875
|
@usage_lib.entrypoint
|
4873
4876
|
# TODO(tian): Add default argument for this CLI if none of the flags are
|
4874
4877
|
# specified.
|
@@ -4877,9 +4880,13 @@ def serve_logs(
|
|
4877
4880
|
follow: bool,
|
4878
4881
|
controller: bool,
|
4879
4882
|
load_balancer: bool,
|
4880
|
-
|
4883
|
+
replica_ids: Tuple[int, ...],
|
4884
|
+
sync_down: bool,
|
4881
4885
|
):
|
4882
|
-
"""Tail
|
4886
|
+
"""Tail or sync down logs of a service.
|
4887
|
+
|
4888
|
+
Logs can be tailed from one target (controller, load balancer, or a single
|
4889
|
+
replica) or synced down from multiple targets simultaneously.
|
4883
4890
|
|
4884
4891
|
Example:
|
4885
4892
|
|
@@ -4893,27 +4900,89 @@ def serve_logs(
|
|
4893
4900
|
\b
|
4894
4901
|
# Tail the logs of replica 1
|
4895
4902
|
sky serve logs [SERVICE_NAME] 1
|
4903
|
+
\b
|
4904
|
+
# Sync down all logs of the service (controller, LB, all replicas)
|
4905
|
+
sky serve logs [SERVICE_NAME] --sync-down
|
4906
|
+
\b
|
4907
|
+
# Sync down controller logs and logs for replicas 1 and 3
|
4908
|
+
sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
|
4896
4909
|
"""
|
4897
|
-
|
4898
|
-
num_flags = (controller + load_balancer + have_replica_id)
|
4899
|
-
if num_flags > 1:
|
4900
|
-
raise click.UsageError('At most one of --controller, --load-balancer, '
|
4901
|
-
'[REPLICA_ID] can be specified.')
|
4902
|
-
if num_flags == 0:
|
4903
|
-
raise click.UsageError('One of --controller, --load-balancer, '
|
4904
|
-
'[REPLICA_ID] must be specified.')
|
4910
|
+
chosen_components: Set[serve_lib.ServiceComponent] = set()
|
4905
4911
|
if controller:
|
4906
|
-
|
4907
|
-
|
4908
|
-
|
4909
|
-
|
4910
|
-
|
4911
|
-
|
4912
|
-
|
4912
|
+
chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
|
4913
|
+
if load_balancer:
|
4914
|
+
chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
|
4915
|
+
# replica_ids contains the specific replica IDs provided by the user.
|
4916
|
+
# If it's not empty, it implies the user wants replica logs.
|
4917
|
+
if replica_ids:
|
4918
|
+
chosen_components.add(serve_lib.ServiceComponent.REPLICA)
|
4919
|
+
|
4920
|
+
if sync_down:
|
4921
|
+
# For sync-down, multiple targets are allowed.
|
4922
|
+
# If no specific components/replicas are mentioned, sync all.
|
4923
|
+
# Note: Multiple replicas or targets can only be specified when
|
4924
|
+
# using --sync-down.
|
4925
|
+
targets_to_sync = list(chosen_components)
|
4926
|
+
if not targets_to_sync and not replica_ids:
|
4927
|
+
# Default to all components if nothing specific is requested
|
4928
|
+
targets_to_sync = [
|
4929
|
+
serve_lib.ServiceComponent.CONTROLLER,
|
4930
|
+
serve_lib.ServiceComponent.LOAD_BALANCER,
|
4931
|
+
serve_lib.ServiceComponent.REPLICA,
|
4932
|
+
]
|
4933
|
+
|
4934
|
+
timestamp = sky_logging.get_run_timestamp()
|
4935
|
+
log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
|
4936
|
+
f'{service_name}_{timestamp}').expanduser()
|
4937
|
+
log_dir.mkdir(parents=True, exist_ok=True)
|
4938
|
+
|
4939
|
+
with rich_utils.client_status(
|
4940
|
+
ux_utils.spinner_message('Downloading service logs...')):
|
4941
|
+
serve_lib.sync_down_logs(service_name,
|
4942
|
+
local_dir=str(log_dir),
|
4943
|
+
targets=targets_to_sync,
|
4944
|
+
replica_ids=list(replica_ids))
|
4945
|
+
style = colorama.Style
|
4946
|
+
fore = colorama.Fore
|
4947
|
+
logger.info(f'{fore.CYAN}Service {service_name} logs: '
|
4948
|
+
f'{log_dir}{style.RESET_ALL}')
|
4949
|
+
return
|
4950
|
+
|
4951
|
+
# Tailing requires exactly one target.
|
4952
|
+
num_targets = len(chosen_components)
|
4953
|
+
# If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
|
4954
|
+
if serve_lib.ServiceComponent.REPLICA in chosen_components:
|
4955
|
+
if len(replica_ids) != 1:
|
4956
|
+
raise click.UsageError(
|
4957
|
+
'Can only tail logs from a single replica at a time. '
|
4958
|
+
'Provide exactly one REPLICA_ID or use --sync-down '
|
4959
|
+
'to download logs from multiple replicas.')
|
4960
|
+
# If replica is chosen and len is 1, num_targets effectively counts it.
|
4961
|
+
# We need to ensure no other component (controller/LB) is selected.
|
4962
|
+
if num_targets > 1:
|
4963
|
+
raise click.UsageError(
|
4964
|
+
'Can only tail logs from one target at a time (controller, '
|
4965
|
+
'load balancer, or a single replica). Use --sync-down '
|
4966
|
+
'to download logs from multiple sources.')
|
4967
|
+
elif num_targets == 0:
|
4968
|
+
raise click.UsageError(
|
4969
|
+
'Specify a target to tail: --controller, --load-balancer, or '
|
4970
|
+
'a REPLICA_ID.')
|
4971
|
+
elif num_targets > 1:
|
4972
|
+
raise click.UsageError(
|
4973
|
+
'Can only tail logs from one target at a time. Use --sync-down '
|
4974
|
+
'to download logs from multiple sources.')
|
4975
|
+
|
4976
|
+
# At this point, we have exactly one target for tailing.
|
4977
|
+
assert len(chosen_components) == 1
|
4978
|
+
assert len(replica_ids) in [0, 1]
|
4979
|
+
target_component = chosen_components.pop()
|
4980
|
+
target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
|
4981
|
+
|
4913
4982
|
try:
|
4914
4983
|
serve_lib.tail_logs(service_name,
|
4915
4984
|
target=target_component,
|
4916
|
-
replica_id=
|
4985
|
+
replica_id=target_replica_id,
|
4917
4986
|
follow=follow)
|
4918
4987
|
except exceptions.ClusterNotUpError:
|
4919
4988
|
with ux_utils.print_exception_no_traceback():
|
sky/client/cli.py
CHANGED
@@ -28,6 +28,7 @@ import datetime
|
|
28
28
|
import functools
|
29
29
|
import getpass
|
30
30
|
import os
|
31
|
+
import pathlib
|
31
32
|
import shlex
|
32
33
|
import shutil
|
33
34
|
import subprocess
|
@@ -301,13 +302,9 @@ def config_option(expose_value: bool):
|
|
301
302
|
try:
|
302
303
|
if len(value) == 0:
|
303
304
|
return None
|
304
|
-
elif len(value) > 1:
|
305
|
-
raise ValueError('argument specified multiple times. '
|
306
|
-
'To specify multiple configs, use '
|
307
|
-
'--config nested.key1=val1,another.key2=val2')
|
308
305
|
else:
|
309
306
|
# Apply the config overrides to the skypilot config.
|
310
|
-
return skypilot_config.apply_cli_config(value
|
307
|
+
return skypilot_config.apply_cli_config(value)
|
311
308
|
except ValueError as e:
|
312
309
|
raise click.BadParameter(f'{str(e)}') from e
|
313
310
|
|
@@ -4867,8 +4864,14 @@ def serve_down(
|
|
4867
4864
|
default=False,
|
4868
4865
|
required=False,
|
4869
4866
|
help='Show the load balancer logs of this service.')
|
4867
|
+
@click.option('--sync-down',
|
4868
|
+
'-s',
|
4869
|
+
is_flag=True,
|
4870
|
+
default=False,
|
4871
|
+
help='Sync down logs to the local machine. Can be combined with '
|
4872
|
+
'--controller, --load-balancer, or a replica ID to narrow scope.')
|
4870
4873
|
@click.argument('service_name', required=True, type=str)
|
4871
|
-
@click.argument('
|
4874
|
+
@click.argument('replica_ids', required=False, type=int, nargs=-1)
|
4872
4875
|
@usage_lib.entrypoint
|
4873
4876
|
# TODO(tian): Add default argument for this CLI if none of the flags are
|
4874
4877
|
# specified.
|
@@ -4877,9 +4880,13 @@ def serve_logs(
|
|
4877
4880
|
follow: bool,
|
4878
4881
|
controller: bool,
|
4879
4882
|
load_balancer: bool,
|
4880
|
-
|
4883
|
+
replica_ids: Tuple[int, ...],
|
4884
|
+
sync_down: bool,
|
4881
4885
|
):
|
4882
|
-
"""Tail
|
4886
|
+
"""Tail or sync down logs of a service.
|
4887
|
+
|
4888
|
+
Logs can be tailed from one target (controller, load balancer, or a single
|
4889
|
+
replica) or synced down from multiple targets simultaneously.
|
4883
4890
|
|
4884
4891
|
Example:
|
4885
4892
|
|
@@ -4893,27 +4900,89 @@ def serve_logs(
|
|
4893
4900
|
\b
|
4894
4901
|
# Tail the logs of replica 1
|
4895
4902
|
sky serve logs [SERVICE_NAME] 1
|
4903
|
+
\b
|
4904
|
+
# Sync down all logs of the service (controller, LB, all replicas)
|
4905
|
+
sky serve logs [SERVICE_NAME] --sync-down
|
4906
|
+
\b
|
4907
|
+
# Sync down controller logs and logs for replicas 1 and 3
|
4908
|
+
sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
|
4896
4909
|
"""
|
4897
|
-
|
4898
|
-
num_flags = (controller + load_balancer + have_replica_id)
|
4899
|
-
if num_flags > 1:
|
4900
|
-
raise click.UsageError('At most one of --controller, --load-balancer, '
|
4901
|
-
'[REPLICA_ID] can be specified.')
|
4902
|
-
if num_flags == 0:
|
4903
|
-
raise click.UsageError('One of --controller, --load-balancer, '
|
4904
|
-
'[REPLICA_ID] must be specified.')
|
4910
|
+
chosen_components: Set[serve_lib.ServiceComponent] = set()
|
4905
4911
|
if controller:
|
4906
|
-
|
4907
|
-
|
4908
|
-
|
4909
|
-
|
4910
|
-
|
4911
|
-
|
4912
|
-
|
4912
|
+
chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
|
4913
|
+
if load_balancer:
|
4914
|
+
chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
|
4915
|
+
# replica_ids contains the specific replica IDs provided by the user.
|
4916
|
+
# If it's not empty, it implies the user wants replica logs.
|
4917
|
+
if replica_ids:
|
4918
|
+
chosen_components.add(serve_lib.ServiceComponent.REPLICA)
|
4919
|
+
|
4920
|
+
if sync_down:
|
4921
|
+
# For sync-down, multiple targets are allowed.
|
4922
|
+
# If no specific components/replicas are mentioned, sync all.
|
4923
|
+
# Note: Multiple replicas or targets can only be specified when
|
4924
|
+
# using --sync-down.
|
4925
|
+
targets_to_sync = list(chosen_components)
|
4926
|
+
if not targets_to_sync and not replica_ids:
|
4927
|
+
# Default to all components if nothing specific is requested
|
4928
|
+
targets_to_sync = [
|
4929
|
+
serve_lib.ServiceComponent.CONTROLLER,
|
4930
|
+
serve_lib.ServiceComponent.LOAD_BALANCER,
|
4931
|
+
serve_lib.ServiceComponent.REPLICA,
|
4932
|
+
]
|
4933
|
+
|
4934
|
+
timestamp = sky_logging.get_run_timestamp()
|
4935
|
+
log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
|
4936
|
+
f'{service_name}_{timestamp}').expanduser()
|
4937
|
+
log_dir.mkdir(parents=True, exist_ok=True)
|
4938
|
+
|
4939
|
+
with rich_utils.client_status(
|
4940
|
+
ux_utils.spinner_message('Downloading service logs...')):
|
4941
|
+
serve_lib.sync_down_logs(service_name,
|
4942
|
+
local_dir=str(log_dir),
|
4943
|
+
targets=targets_to_sync,
|
4944
|
+
replica_ids=list(replica_ids))
|
4945
|
+
style = colorama.Style
|
4946
|
+
fore = colorama.Fore
|
4947
|
+
logger.info(f'{fore.CYAN}Service {service_name} logs: '
|
4948
|
+
f'{log_dir}{style.RESET_ALL}')
|
4949
|
+
return
|
4950
|
+
|
4951
|
+
# Tailing requires exactly one target.
|
4952
|
+
num_targets = len(chosen_components)
|
4953
|
+
# If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
|
4954
|
+
if serve_lib.ServiceComponent.REPLICA in chosen_components:
|
4955
|
+
if len(replica_ids) != 1:
|
4956
|
+
raise click.UsageError(
|
4957
|
+
'Can only tail logs from a single replica at a time. '
|
4958
|
+
'Provide exactly one REPLICA_ID or use --sync-down '
|
4959
|
+
'to download logs from multiple replicas.')
|
4960
|
+
# If replica is chosen and len is 1, num_targets effectively counts it.
|
4961
|
+
# We need to ensure no other component (controller/LB) is selected.
|
4962
|
+
if num_targets > 1:
|
4963
|
+
raise click.UsageError(
|
4964
|
+
'Can only tail logs from one target at a time (controller, '
|
4965
|
+
'load balancer, or a single replica). Use --sync-down '
|
4966
|
+
'to download logs from multiple sources.')
|
4967
|
+
elif num_targets == 0:
|
4968
|
+
raise click.UsageError(
|
4969
|
+
'Specify a target to tail: --controller, --load-balancer, or '
|
4970
|
+
'a REPLICA_ID.')
|
4971
|
+
elif num_targets > 1:
|
4972
|
+
raise click.UsageError(
|
4973
|
+
'Can only tail logs from one target at a time. Use --sync-down '
|
4974
|
+
'to download logs from multiple sources.')
|
4975
|
+
|
4976
|
+
# At this point, we have exactly one target for tailing.
|
4977
|
+
assert len(chosen_components) == 1
|
4978
|
+
assert len(replica_ids) in [0, 1]
|
4979
|
+
target_component = chosen_components.pop()
|
4980
|
+
target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
|
4981
|
+
|
4913
4982
|
try:
|
4914
4983
|
serve_lib.tail_logs(service_name,
|
4915
4984
|
target=target_component,
|
4916
|
-
replica_id=
|
4985
|
+
replica_id=target_replica_id,
|
4917
4986
|
follow=follow)
|
4918
4987
|
except exceptions.ClusterNotUpError:
|
4919
4988
|
with ux_utils.print_exception_no_traceback():
|
sky/client/common.py
CHANGED
@@ -53,11 +53,18 @@ API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS = 5
|
|
53
53
|
|
54
54
|
|
55
55
|
def download_logs_from_api_server(
|
56
|
-
paths_on_api_server: Iterable[str]
|
56
|
+
paths_on_api_server: Iterable[str],
|
57
|
+
remote_machine_prefix: str = str(
|
58
|
+
server_common.api_server_user_logs_dir_prefix()),
|
59
|
+
local_machine_prefix: str = constants.SKY_LOGS_DIRECTORY
|
60
|
+
) -> Dict[str, str]:
|
57
61
|
"""Downloads the logs from the API server.
|
58
62
|
|
59
63
|
Args:
|
60
64
|
paths_on_api_server: The paths on the API server to download.
|
65
|
+
remote_machine_prefix: The prefix of the remote machine to save the
|
66
|
+
logs.
|
67
|
+
local_machine_prefix: The prefix of the local machine to save the logs.
|
61
68
|
|
62
69
|
Returns:
|
63
70
|
A dictionary mapping the remote path on API server to the local path.
|
@@ -69,8 +76,8 @@ def download_logs_from_api_server(
|
|
69
76
|
# This should be moved to remote API server. A proper way might be
|
70
77
|
# set the returned path to be started with a special prefix, instead
|
71
78
|
# of using the `api_server_user_logs_dir_prefix()`.
|
72
|
-
|
73
|
-
|
79
|
+
remote_machine_prefix,
|
80
|
+
local_machine_prefix) for remote_path in paths_on_api_server
|
74
81
|
}
|
75
82
|
body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
|
76
83
|
response = requests.post(f'{server_common.get_server_url()}/download',
|
sky/client/sdk.py
CHANGED
@@ -1821,6 +1821,9 @@ def api_login(endpoint: Optional[str] = None) -> None:
|
|
1821
1821
|
else:
|
1822
1822
|
config = skypilot_config.get_user_config()
|
1823
1823
|
config.set_nested(('api_server', 'endpoint'), endpoint)
|
1824
|
-
common_utils.dump_yaml(str(config_path), config)
|
1825
|
-
|
1826
|
-
|
1824
|
+
common_utils.dump_yaml(str(config_path), dict(config))
|
1825
|
+
dashboard_msg = f'Dashboard: {endpoint}/dashboard'
|
1826
|
+
click.secho(
|
1827
|
+
f'Logged in to SkyPilot API server at {endpoint}.'
|
1828
|
+
f' {dashboard_msg}',
|
1829
|
+
fg='green')
|
sky/clouds/aws.py
CHANGED
@@ -571,7 +571,7 @@ class AWS(clouds.Cloud):
|
|
571
571
|
return cls._check_credentials()
|
572
572
|
|
573
573
|
@classmethod
|
574
|
-
@annotations.lru_cache(scope='
|
574
|
+
@annotations.lru_cache(scope='request',
|
575
575
|
maxsize=1) # Cache since getting identity is slow.
|
576
576
|
def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
577
577
|
"""Checks if the user has access credentials to AWS."""
|
@@ -710,7 +710,7 @@ class AWS(clouds.Cloud):
|
|
710
710
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
711
711
|
|
712
712
|
@classmethod
|
713
|
-
@annotations.lru_cache(scope='
|
713
|
+
@annotations.lru_cache(scope='request', maxsize=1)
|
714
714
|
def _aws_configure_list(cls) -> Optional[bytes]:
|
715
715
|
proc = subprocess.run('aws configure list',
|
716
716
|
shell=True,
|
@@ -722,7 +722,7 @@ class AWS(clouds.Cloud):
|
|
722
722
|
return proc.stdout
|
723
723
|
|
724
724
|
@classmethod
|
725
|
-
@annotations.lru_cache(scope='
|
725
|
+
@annotations.lru_cache(scope='request',
|
726
726
|
maxsize=1) # Cache since getting identity is slow.
|
727
727
|
def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
|
728
728
|
try:
|
@@ -804,7 +804,7 @@ class AWS(clouds.Cloud):
|
|
804
804
|
return [user_ids]
|
805
805
|
|
806
806
|
@classmethod
|
807
|
-
@annotations.lru_cache(scope='
|
807
|
+
@annotations.lru_cache(scope='request',
|
808
808
|
maxsize=1) # Cache since getting identity is slow.
|
809
809
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
810
810
|
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
@@ -909,7 +909,7 @@ class AWS(clouds.Cloud):
|
|
909
909
|
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
|
910
910
|
}
|
911
911
|
|
912
|
-
@annotations.lru_cache(scope='
|
912
|
+
@annotations.lru_cache(scope='request', maxsize=1)
|
913
913
|
def can_credential_expire(self) -> bool:
|
914
914
|
identity_type = self._current_identity_type()
|
915
915
|
return (identity_type is not None and
|
@@ -60,8 +60,9 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
60
60
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
61
61
|
""")))
|
62
62
|
|
63
|
-
#
|
64
|
-
|
63
|
+
# Maximum price for TPU V6e is $691.2/hour. Here we set a higher price
|
64
|
+
# so the failover will go to the region with precise pricing info first.
|
65
|
+
TPU_V6E_MAX_PRICE = 700
|
65
66
|
|
66
67
|
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
67
68
|
# NOTE(dev): Keep the zones and the df in sync.
|
@@ -699,13 +700,12 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
699
700
|
spot_str = 'spot ' if spot else ''
|
700
701
|
print(f'The {spot_str}price of {tpu_name} in {tpu_region} is '
|
701
702
|
'not found in SKUs or hidden TPU price DF.')
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
HIDDEN_TPU_DF)
|
703
|
+
# GCP's TPU V6e pricing info is not stable and there are some
|
704
|
+
# regions that are missing the pricing info. We set the price to
|
705
|
+
# the maximum price so the failover will go to the region with
|
706
|
+
# precise pricing info first.
|
707
|
+
if tpu_name.startswith('tpu-v6e'):
|
708
|
+
tpu_price = TPU_V6E_MAX_PRICE
|
709
709
|
return tpu_price
|
710
710
|
|
711
711
|
df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
|
@@ -0,0 +1 @@
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/f3538cd90cfca88c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/f3538cd90cfca88c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-3001e84c61acddfb.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/2GsKhI8XKYj9B2969iIDf/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2GsKhI8XKYj9B2969iIDf/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"2GsKhI8XKYj9B2969iIDf","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
|
@@ -0,0 +1 @@
|
|
1
|
+
self.__BUILD_MANIFEST=function(s,c,e,t,a,r){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-f9f039532ca8cbc4.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":[s,e,c,t,a,"static/chunks/pages/clusters-a93b93e10b8b074e.js"],"/clusters/[cluster]":[s,e,c,t,r,a,"static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js"],"/clusters/[cluster]/[job]":[s,c,"static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js"],"/jobs":[s,e,c,t,r,"static/chunks/pages/jobs-a75029b67aab6a2e.js"],"/jobs/[job]":[s,c,"static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/jobs","/jobs/[job]"]}}("static/chunks/678-206dddca808e6d16.js","static/chunks/979-7cd0778078b9cfad.js","static/chunks/312-c3c8845990db8ffc.js","static/chunks/845-2ea1cc63ba1f4067.js","static/chunks/37-72fdc8f71d6e4784.js","static/chunks/236-d437cf66e68a6f64.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
@@ -0,0 +1 @@
|
|
1
|
+
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()
|