skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +20 -1
- sky/backends/cloud_vm_ray_backend.py +42 -6
- sky/check.py +11 -1
- sky/client/cli/command.py +248 -119
- sky/client/sdk.py +146 -66
- sky/client/sdk_async.py +5 -1
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +6 -4
- sky/global_user_state.py +22 -3
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +67 -19
- sky/jobs/controller.py +2 -1
- sky/jobs/server/core.py +48 -1
- sky/jobs/server/server.py +52 -3
- sky/jobs/state.py +5 -1
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/serve/client/impl.py +93 -6
- sky/serve/client/sdk.py +22 -53
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +4 -2
- sky/serve/serve_state.py +444 -324
- sky/serve/serve_utils.py +77 -46
- sky/serve/server/core.py +13 -197
- sky/serve/server/impl.py +239 -2
- sky/serve/service.py +8 -3
- sky/server/common.py +18 -7
- sky/server/constants.py +1 -1
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +19 -0
- sky/setup_files/alembic.ini +4 -0
- sky/task.py +18 -11
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/sky-serve-controller.yaml.j2 +1 -0
- sky/usage/usage_lib.py +8 -6
- sky/utils/annotations.py +8 -3
- sky/utils/cli_utils/status_utils.py +1 -1
- sky/utils/common_utils.py +11 -1
- sky/utils/db/db_utils.py +31 -0
- sky/utils/db/migration_utils.py +6 -2
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
- sky/client/sdk.pyi +0 -301
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
- /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -35,7 +35,7 @@ import sys
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
-                    Union)
+                    TypeVar, Union)
 
 import click
 import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
                               '`sky jobs launch`. `{command}` supports a '
                               'single task only.')
 
+T = TypeVar('T')
+
 
 def _get_cluster_records_and_set_ssh_config(
         clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
     return list(set(glob_storages))
 
 
-def _async_call_or_wait(request_id: str, async_call: bool,
-                        request_name: str) -> Any:
+def _async_call_or_wait(request_id: server_common.RequestId[T],
+                        async_call: bool, request_name: str) -> Any:
     short_request_id = request_id[:8]
     if not async_call:
         try:
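
The hunks above introduce `T = TypeVar('T')` and retype request IDs as `server_common.RequestId[T]`, so each request ID carries the type of the result it resolves to. A minimal sketch of how such a typed request ID can work, assuming it is simply a `str` subclass parameterized by the result type (an illustration only, not the actual `server_common.RequestId` implementation from the package):

    # Hedged sketch of a typed request ID; the real server_common.RequestId
    # definition is not shown in this diff.
    from typing import Any, Dict, Generic, List, TypeVar

    T = TypeVar('T')


    class RequestId(str, Generic[T]):
        """A request-ID string tagged with the type of the result it resolves to."""


    def get(request_id: RequestId[T]) -> T:
        """Stand-in for sdk.get(): block on the request and return its result."""
        raise NotImplementedError('illustration only')


    def handle_jobs_queue(request_id: RequestId[List[Dict[str, Any]]]) -> int:
        jobs = get(request_id)  # type checkers infer List[Dict[str, Any]] here
        return len(jobs)
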
@@ -1411,7 +1413,7 @@ def exec(
 
 
 def _handle_jobs_queue_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         show_all: bool,
         show_user: bool,
         max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(
 
 
 def _handle_services_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         service_names: Optional[List[str]],
         show_all: bool,
         show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             skip_finished=True,
             all_users=all_users)
 
-    def submit_services(
+    def submit_services(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         return serve_lib.status(service_names=None)
 
-    def submit_pools(
+    def submit_pools(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         try:
             return managed_jobs.pool_status(pool_names=None)
         except exceptions.APINotSupportedError as e:
             logger.debug(f'Pools are not supported in the remote server: {e}')
             return None
 
-    def submit_workspace() -> Optional[str]:
+    def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
         try:
             return sdk.workspaces()
         except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
         if not (ip or show_endpoints):
             workspace_request_id = workspace_request_future.result()
 
-            managed_jobs_queue_request_id = (
-
-
+            managed_jobs_queue_request_id = (server_common.RequestId()
+                                             if not managed_jobs_queue_request_id else
+                                             managed_jobs_queue_request_id)
+            service_status_request_id = (server_common.RequestId()
+                                         if not service_status_request_id else
                                          service_status_request_id)
-            pool_status_request_id = (
+            pool_status_request_id = (server_common.RequestId()
+                                      if not pool_status_request_id else
                                       pool_status_request_id)
 
         # Phase 3: Get cluster records and handle special cases
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
         if workspace_request_id is not None:
             all_workspaces = sdk.get(workspace_request_id)
         else:
-            all_workspaces =
+            all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
         active_workspace = skypilot_config.get_active_workspace()
         show_workspace = len(all_workspaces) > 1
         _show_enabled_infra(active_workspace, show_workspace)
@@ -3836,7 +3843,7 @@ def show_gpus(
             yield k8s_messages
             yield '\n\n'
 
-
+        list_accelerator_counts_result = sdk.stream_and_get(
             sdk.list_accelerator_counts(
                 gpus_only=True,
                 clouds=clouds_to_list,
|
 
         # "Common" GPUs
         for gpu in catalog.get_common_gpus():
-            if gpu in
-                gpu_table.add_row([
+            if gpu in list_accelerator_counts_result:
+                gpu_table.add_row([
+                    gpu,
+                    _list_to_str(list_accelerator_counts_result.pop(gpu))
+                ])
         yield from gpu_table.get_string()
 
         # Google TPUs
         for tpu in catalog.get_tpus():
-            if tpu in
-                tpu_table.add_row([
+            if tpu in list_accelerator_counts_result:
+                tpu_table.add_row([
+                    tpu,
+                    _list_to_str(list_accelerator_counts_result.pop(tpu))
+                ])
         if tpu_table.get_string():
             yield '\n\n'
             yield from tpu_table.get_string()
@@ -3868,7 +3881,7 @@ def show_gpus(
         # Other GPUs
         if show_all:
             yield '\n\n'
-            for gpu, qty in sorted(
+            for gpu, qty in sorted(list_accelerator_counts_result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
             yield '\n\n'
@@ -3919,7 +3932,7 @@ def show_gpus(
 
     # For clouds other than Kubernetes, get the accelerator details
     # Case-sensitive
-
+    list_accelerators_result = sdk.stream_and_get(
         sdk.list_accelerators(gpus_only=True,
                               name_filter=name,
                               quantity_filter=quantity,
@@ -3935,8 +3948,8 @@ def show_gpus(
     # - Group by cloud
     # - Sort within each group by prices
     # - Sort groups by each cloud's (min price, min spot price)
-    new_result = {}
-    for i, (gpu, items) in enumerate(
+    new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
+    for i, (gpu, items) in enumerate(list_accelerators_result.items()):
         df = pd.DataFrame([t._asdict() for t in items])
         # Determine the minimum prices for each cloud.
         min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
|
             for row in df.to_records(index=False)
         ]
         new_result[gpu] = sorted_dataclasses
-
+    list_accelerators_result = new_result
 
     if print_section_titles and not show_all:
         yield '\n\n'
         yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
 
-    if not
+    if not list_accelerators_result:
         quantity_str = (f' with requested quantity {quantity}'
                         if quantity else '')
         cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3982,7 @@ def show_gpus(
         yield 'To show available accelerators, run: sky show-gpus --all'
         return
 
-    for i, (gpu, items) in enumerate(
+    for i, (gpu, items) in enumerate(list_accelerators_result.items()):
         accelerator_table_headers = [
             'GPU',
             'QTY',
@@ -4972,6 +4985,205 @@ def jobs_pool_down(
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
 
 
+def _handle_serve_logs(
+    service_name: str,
+    follow: bool,
+    controller: bool,
+    load_balancer: bool,
+    replica_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+    pool: bool,  # pylint: disable=redefined-outer-name
+):
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    repnoun = 'worker' if pool else 'replica'
+    if tail is not None:
+        if tail < 0:
+            raise click.UsageError('--tail must be a non-negative integer.')
+        # TODO(arda): We could add ability to tail and follow logs together.
+        if follow:
+            follow = False
+            logger.warning(
+                f'{colorama.Fore.YELLOW}'
+                '--tail and --follow cannot be used together. '
+                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
+
+    chosen_components: Set[serve_lib.ServiceComponent] = set()
+    if controller:
+        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
+    if load_balancer:
+        assert not pool, 'Load balancer is not supported for pools.'
+        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
+    # replica_ids contains the specific replica IDs provided by the user.
+    # If it's not empty, it implies the user wants replica logs.
+    if replica_ids:
+        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
+
+    if sync_down:
+        # For sync-down, multiple targets are allowed.
+        # If no specific components/replicas are mentioned, sync all.
+        # Note: Multiple replicas or targets can only be specified when
+        # using --sync-down.
+        targets_to_sync = list(chosen_components)
+        if not targets_to_sync and not replica_ids:
+            # Default to all components if nothing specific is requested
+            targets_to_sync = [
+                serve_lib.ServiceComponent.CONTROLLER,
+                serve_lib.ServiceComponent.REPLICA,
+            ]
+            if not pool:
+                targets_to_sync.append(serve_lib.ServiceComponent.LOAD_BALANCER)
+
+        timestamp = sky_logging.get_run_timestamp()
+        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / noun /
+                   f'{service_name}_{timestamp}').expanduser()
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        with rich_utils.client_status(
+                ux_utils.spinner_message(f'Downloading {noun} logs...')):
+            if pool:
+                managed_jobs.pool_sync_down_logs(service_name,
+                                                 str(log_dir),
+                                                 targets=targets_to_sync,
+                                                 worker_ids=list(replica_ids),
+                                                 tail=tail)
+            else:
+                serve_lib.sync_down_logs(service_name,
+                                         str(log_dir),
+                                         targets=targets_to_sync,
+                                         replica_ids=list(replica_ids),
+                                         tail=tail)
+        style = colorama.Style
+        fore = colorama.Fore
+        logger.info(f'{fore.CYAN}{capnoun} {service_name} logs: '
+                    f'{log_dir}{style.RESET_ALL}')
+        return
+
+    # Tailing requires exactly one target.
+    num_targets = len(chosen_components)
+    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
+    if serve_lib.ServiceComponent.REPLICA in chosen_components:
+        if len(replica_ids) != 1:
+            raise click.UsageError(
+                f'Can only tail logs from a single {repnoun} at a time. '
+                f'Provide exactly one {repnoun.upper()}_ID or use --sync-down '
+                f'to download logs from multiple {repnoun}s.')
+        # If replica is chosen and len is 1, num_targets effectively counts it.
+        # We need to ensure no other component (controller/LB) is selected.
+        if num_targets > 1:
+            raise click.UsageError(
+                'Can only tail logs from one target at a time (controller, '
+                f'load balancer, or a single {repnoun}). Use --sync-down '
+                'to download logs from multiple sources.')
+    elif num_targets == 0:
+        raise click.UsageError(
+            'Specify a target to tail: --controller, --load-balancer, or '
+            f'a {repnoun.upper()}_ID.')
+    elif num_targets > 1:
+        raise click.UsageError(
+            'Can only tail logs from one target at a time. Use --sync-down '
+            'to download logs from multiple sources.')
+
+    # At this point, we have exactly one target for tailing.
+    assert len(chosen_components) == 1
+    assert len(replica_ids) in [0, 1]
+    target_component = chosen_components.pop()
+    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
+
+    try:
+        if pool:
+            managed_jobs.pool_tail_logs(service_name,
+                                        target=target_component,
+                                        worker_id=target_replica_id,
+                                        follow=follow,
+                                        tail=tail)
+        else:
+            serve_lib.tail_logs(service_name,
+                                target=target_component,
+                                replica_id=target_replica_id,
+                                follow=follow,
+                                tail=tail)
+    except exceptions.ClusterNotUpError:
+        with ux_utils.print_exception_no_traceback():
+            raise
+
+
+@pool.command('logs', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@click.option(
+    '--follow/--no-follow',
+    is_flag=True,
+    default=True,
+    help=('Follow the logs of the job. [default: --follow] '
+          'If --no-follow is specified, print the log so far and exit.'))
+@click.option('--controller',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Show the controller logs of this pool.')
+@click.option('--sync-down',
+              '-s',
+              is_flag=True,
+              default=False,
+              help='Sync down logs to the local machine. Can be combined with '
+              '--controller or worker ID to narrow scope.')
+@click.option(
+    '--tail',
+    default=None,
+    type=int,
+    help='The number of lines to display from the end of the log file. '
+    'Default is None, which means print all lines.')
+@click.argument('pool_name', required=True, type=str)
+@click.argument('worker_ids', required=False, type=int, nargs=-1)
+@usage_lib.entrypoint
+# TODO(tian): Add default argument for this CLI if none of the flags are
+# specified.
+def pool_logs(
+    pool_name: str,
+    follow: bool,
+    controller: bool,
+    worker_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+):
+    """Tail or sync down logs of a pool.
+
+    Logs can be tailed from one target (controller, or a single worker) or
+    synced down from multiple targets simultaneously.
+
+    Example:
+
+    .. code-block:: bash
+
+      # Tail the controller logs of a pool
+      sky pool logs --controller [POOL_NAME]
+      \b
+      # Print the worker logs so far and exit
+      sky pool logs --no-follow [POOL_NAME]
+      \b
+      # Tail the logs of worker 1
+      sky pool logs [POOL_NAME] 1
+      \b
+      # Show the last 100 lines of the controller logs
+      sky pool logs --controller --tail 100 [POOL_NAME]
+      \b
+      # Sync down all logs of the pool (controller, all workers)
+      sky pool logs [POOL_NAME] --sync-down
+      \b
+      # Sync down controller logs and logs for workers 1 and 3
+      sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+    """
+    _handle_serve_logs(pool_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=False,
+                       replica_ids=worker_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=True)
+
+
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
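
The new `pool logs` command above routes through the shared `_handle_serve_logs` helper, which for pools calls `managed_jobs.pool_tail_logs` and `managed_jobs.pool_sync_down_logs` (both visible in the hunk). A hedged sketch of the equivalent programmatic calls; the import aliases below are assumptions about how these names are exposed, and only the keyword arguments mirror the hunk:

    # Hedged sketch: mirrors the pool branch of _handle_serve_logs above.
    # Import aliases are assumptions; keyword arguments follow the hunk.
    from sky import jobs as managed_jobs  # assumed alias for the managed-jobs SDK
    from sky import serve as serve_lib    # assumed alias providing ServiceComponent

    # Tail the last 100 controller log lines of pool 'my-pool' without following.
    managed_jobs.pool_tail_logs('my-pool',
                                target=serve_lib.ServiceComponent.CONTROLLER,
                                worker_id=None,
                                follow=False,
                                tail=100)

    # Download controller and worker logs for workers 1 and 3 to a local directory.
    managed_jobs.pool_sync_down_logs('my-pool',
                                     '/tmp/my-pool-logs',
                                     targets=[serve_lib.ServiceComponent.CONTROLLER,
                                              serve_lib.ServiceComponent.REPLICA],
                                     worker_ids=[1, 3],
                                     tail=None)
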
@@ -5555,6 +5767,7 @@ def serve_down(
                   show_default=True)
 
     if replica_id_is_defined:
+        assert replica_id is not None
         request_id = serve_lib.terminate_replica(service_names[0], replica_id,
                                                  purge)
     else:
@@ -5635,99 +5848,14 @@ def serve_logs(
         # Sync down controller logs and logs for replicas 1 and 3
         sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
     """
-    if tail is not None:
-        if tail < 0:
-            raise click.UsageError('--tail must be a non-negative integer.')
-        # TODO(arda): We could add ability to tail and follow logs together.
-        if follow:
-            follow = False
-            logger.warning(
-                f'{colorama.Fore.YELLOW}'
-                '--tail and --follow cannot be used together. '
-                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
-
-    chosen_components: Set[serve_lib.ServiceComponent] = set()
-    if controller:
-        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
-    if load_balancer:
-        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
-    # replica_ids contains the specific replica IDs provided by the user.
-    # If it's not empty, it implies the user wants replica logs.
-    if replica_ids:
-        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
-
-    if sync_down:
-        # For sync-down, multiple targets are allowed.
-        # If no specific components/replicas are mentioned, sync all.
-        # Note: Multiple replicas or targets can only be specified when
-        # using --sync-down.
-        targets_to_sync = list(chosen_components)
-        if not targets_to_sync and not replica_ids:
-            # Default to all components if nothing specific is requested
-            targets_to_sync = [
-                serve_lib.ServiceComponent.CONTROLLER,
-                serve_lib.ServiceComponent.LOAD_BALANCER,
-                serve_lib.ServiceComponent.REPLICA,
-            ]
-
-        timestamp = sky_logging.get_run_timestamp()
-        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
-                   f'{service_name}_{timestamp}').expanduser()
-        log_dir.mkdir(parents=True, exist_ok=True)
-
-        with rich_utils.client_status(
-                ux_utils.spinner_message('Downloading service logs...')):
-            serve_lib.sync_down_logs(service_name,
-                                     local_dir=str(log_dir),
-                                     targets=targets_to_sync,
-                                     replica_ids=list(replica_ids),
-                                     tail=tail)
-        style = colorama.Style
-        fore = colorama.Fore
-        logger.info(f'{fore.CYAN}Service {service_name} logs: '
-                    f'{log_dir}{style.RESET_ALL}')
-        return
-
-    # Tailing requires exactly one target.
-    num_targets = len(chosen_components)
-    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
-    if serve_lib.ServiceComponent.REPLICA in chosen_components:
-        if len(replica_ids) != 1:
-            raise click.UsageError(
-                'Can only tail logs from a single replica at a time. '
-                'Provide exactly one REPLICA_ID or use --sync-down '
-                'to download logs from multiple replicas.')
-        # If replica is chosen and len is 1, num_targets effectively counts it.
-        # We need to ensure no other component (controller/LB) is selected.
-        if num_targets > 1:
-            raise click.UsageError(
-                'Can only tail logs from one target at a time (controller, '
-                'load balancer, or a single replica). Use --sync-down '
-                'to download logs from multiple sources.')
-    elif num_targets == 0:
-        raise click.UsageError(
-            'Specify a target to tail: --controller, --load-balancer, or '
-            'a REPLICA_ID.')
-    elif num_targets > 1:
-        raise click.UsageError(
-            'Can only tail logs from one target at a time. Use --sync-down '
-            'to download logs from multiple sources.')
-
-    # At this point, we have exactly one target for tailing.
-    assert len(chosen_components) == 1
-    assert len(replica_ids) in [0, 1]
-    target_component = chosen_components.pop()
-    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
-
-    try:
-        serve_lib.tail_logs(service_name,
-                            target=target_component,
-                            replica_id=target_replica_id,
-                            follow=follow,
-                            tail=tail)
-    except exceptions.ClusterNotUpError:
-        with ux_utils.print_exception_no_traceback():
-            raise
+    _handle_serve_logs(service_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=load_balancer,
+                       replica_ids=replica_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=False)
 
 
 @cli.group(cls=_NaturalOrderGroup, hidden=True)
@@ -5924,7 +6052,8 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     if request_id is not None and log_path is not None:
         raise click.BadParameter(
             'Only one of request ID and log path can be provided.')
-    sdk.stream_and_get(request_id, log_path, tail)
+    sdk.stream_and_get(server_common.RequestId[None](request_id), log_path,
+                       tail)
 
 
 @api.command('cancel', cls=_DocumentedCodeCommand)