skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +20 -1
- sky/backends/cloud_vm_ray_backend.py +42 -6
- sky/check.py +11 -1
- sky/client/cli/command.py +248 -119
- sky/client/sdk.py +146 -66
- sky/client/sdk_async.py +5 -1
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +6 -4
- sky/global_user_state.py +22 -3
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +67 -19
- sky/jobs/controller.py +2 -1
- sky/jobs/server/core.py +48 -1
- sky/jobs/server/server.py +52 -3
- sky/jobs/state.py +5 -1
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/serve/client/impl.py +93 -6
- sky/serve/client/sdk.py +22 -53
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +4 -2
- sky/serve/serve_state.py +444 -324
- sky/serve/serve_utils.py +77 -46
- sky/serve/server/core.py +13 -197
- sky/serve/server/impl.py +239 -2
- sky/serve/service.py +8 -3
- sky/server/common.py +18 -7
- sky/server/constants.py +1 -1
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +19 -0
- sky/setup_files/alembic.ini +4 -0
- sky/task.py +18 -11
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/sky-serve-controller.yaml.j2 +1 -0
- sky/usage/usage_lib.py +8 -6
- sky/utils/annotations.py +8 -3
- sky/utils/cli_utils/status_utils.py +1 -1
- sky/utils/common_utils.py +11 -1
- sky/utils/db/db_utils.py +31 -0
- sky/utils/db/migration_utils.py +6 -2
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
- sky/client/sdk.pyi +0 -301
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
- /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
```diff
@@ -20,6 +20,7 @@ import uuid
 
 import colorama
 import filelock
+import yaml
 
 from sky import backends
 from sky import exceptions
```
```diff
@@ -65,13 +66,12 @@ def get_num_service_threshold():
 
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-# NOTE(dev): We assume log
-#
-#
-
-_SKYPILOT_PROVISION_LOG_PATTERN = (
-
-_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
+# when changing UX as this assumption is used to expand some log files while
+# ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
```
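The rewritten patterns anchor log-path extraction on the `sky api logs -l` hint instead of a hardcoded log-directory prefix. A minimal sketch of how the two regexes behave (the sample log lines are hypothetical; the patterns are copied from the hunk above):

```python
import re

# Patterns as introduced in this release (copied from the hunk above).
_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)'
_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

# Hypothetical line printed with the 'sky api logs -l' hint.
line = 'To stream logs: sky api logs -l sky-2025-08-07/provision.log'

match = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
assert match is not None
assert match.group(1) == 'sky-2025-08-07/provision.log'

# Any other hinted *.log line still matches the generic pattern, so
# non-provision log files can be expanded too.
other = 'To stream logs: sky api logs -l sky-2025-08-07/run.log'
assert re.match(_SKYPILOT_LOG_PATTERN, other) is not None
```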
```diff
@@ -668,12 +668,18 @@ def _get_service_status(
     if record['pool']:
         latest_yaml_path = generate_task_yaml_file_name(service_name,
                                                         record['version'])
-
-        original_config.
-
-
-
-        original_config
+        raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
+        original_config = raw_yaml_config.get('_user_specified_yaml')
+        if original_config is None:
+            # Fall back to old display format.
+            original_config = raw_yaml_config
+            original_config.pop('run', None)
+            svc: Dict[str, Any] = original_config.pop('service')
+            if svc is not None:
+                svc.pop('pool', None)  # Remove pool from service config
+                original_config['pool'] = svc  # Add pool to root config
+        else:
+            original_config = yaml.safe_load(original_config)
         record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
 
         record['target_num_replicas'] = 0
```
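This hunk makes `_get_service_status` prefer a verbatim `_user_specified_yaml` blob stored in the task YAML, falling back to rewriting the raw config for records written by older versions. A standalone sketch of that fallback, assuming plain dicts in place of `common_utils.read_yaml` (the sample field names such as `workers` are hypothetical):

```python
from typing import Any, Dict, Optional

import yaml


def extract_pool_config(raw_yaml_config: Dict[str, Any]) -> Dict[str, Any]:
    """Mirrors the fallback logic in the hunk above, outside of SkyPilot."""
    original_config: Optional[Any] = raw_yaml_config.get(
        '_user_specified_yaml')
    if original_config is None:
        # Old-format record: rebuild a pool-centric view in place.
        original_config = raw_yaml_config
        original_config.pop('run', None)
        svc = original_config.pop('service')
        if svc is not None:
            svc.pop('pool', None)          # Remove pool from service config
            original_config['pool'] = svc  # Add pool to root config
    else:
        # New-format record: the user's YAML is stored as a string.
        original_config = yaml.safe_load(original_config)
    return original_config


# Old format: no '_user_specified_yaml' key.
old = {'run': 'echo hi', 'service': {'pool': True, 'workers': 2}}
assert extract_pool_config(old) == {'pool': {'workers': 2}}

# New format: the original YAML is embedded verbatim.
new = {'_user_specified_yaml': 'pool:\n  workers: 2\n'}
assert extract_pool_config(new) == {'pool': {'workers': 2}}
```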
```diff
@@ -959,8 +965,10 @@ def wait_service_registration(service_name: str, job_id: int,
     """
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-        #
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
        if not is_consolidation_mode(pool):
             job_status = job_lib.get_status(job_id)
             if job_status is None or job_status < job_lib.JobStatus.RUNNING:
```
```diff
@@ -971,7 +979,7 @@ def wait_service_registration(service_name: str, job_id: int,
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         f'Failed to start the controller process for '
-                        f'the
+                        f'the {noun} {service_name!r} within '
                         f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
                         f' seconds.')
             # No need to check the service status as the controller process
```
```diff
@@ -979,22 +987,26 @@ def wait_service_registration(service_name: str, job_id: int,
             time.sleep(1)
             continue
 
-
-
-
-
+        if not setup_completed:
+            setup_completed = True
+            # Reset the start time to wait for the service to be registered.
+            start_time = time.time()
 
-        record =
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
-
-
-
+            if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The
-                        'Please specify a different name for your
-                        'To update an existing
-                        f'{service_name} <new-
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
         lb_port = record['load_balancer_port']
         if lb_port is not None:
             return message_utils.encode_payload(lb_port)
```
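The new `setup_completed` flag resets the clock once setup finishes, so controller setup and service registration each get their own timeout window rather than sharing one. A generic sketch of that two-phase timeout pattern (names and timeout values are illustrative, not SkyPilot's):

```python
import time
from typing import Callable, Optional

SETUP_TIMEOUT = 30.0         # Illustrative values, not SkyPilot constants.
REGISTRATION_TIMEOUT = 10.0


def wait_registered(setup_done: Callable[[], bool],
                    lookup: Callable[[], Optional[int]]) -> int:
    start_time = time.time()
    setup_completed = False
    while True:
        if not setup_completed:
            if not setup_done():
                if time.time() - start_time > SETUP_TIMEOUT:
                    raise RuntimeError('Controller setup timed out.')
                time.sleep(0.1)
                continue
            setup_completed = True
            # Reset the start time: registration gets a fresh window
            # instead of inheriting whatever setup left over.
            start_time = time.time()
        record = lookup()
        if record is not None:
            return record
        if time.time() - start_time > REGISTRATION_TIMEOUT:
            raise RuntimeError('Registration timed out.')
        time.sleep(0.1)


# Setup is already done and the record is immediately visible.
assert wait_registered(lambda: True, lambda: 42) == 42
```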
```diff
@@ -1023,12 +1035,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def
-
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
 
```
```diff
@@ -1067,7 +1083,10 @@ def _process_line(line: str,
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
     if provision_log_prompt is not None:
-
+        log_path = provision_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                log_path).resolve()
 
         try:
             with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
```
```diff
@@ -1159,12 +1178,14 @@ def _capped_follow_logs_with_provision_expanding(
 
 
 def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
-                        tail: Optional[int]) -> str:
-    msg =
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
         if tail is not None:
```
```diff
@@ -1181,7 +1202,7 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
```
```diff
@@ -1231,6 +1252,10 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
             print(line, end='', flush=True)
         return ''
 
+    # For pools, we don't stream the job logs as the run section is ignored.
+    if pool:
+        return ''
+
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
```
```diff
@@ -1245,13 +1270,13 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
     if tail is None:
         returncode = backend.tail_logs(handle, job_id=None, follow=follow)
         if returncode != 0:
-            return (f'{colorama.Fore.RED}Failed to stream logs for
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
                     f'{replica_id}.{colorama.Style.RESET_ALL}')
     elif not follow and tail > 0:
         final = backend.tail_logs(handle,
```
```diff
@@ -1278,8 +1303,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool, tail: Optional[int]
-
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
```
```diff
@@ -1288,7 +1314,9 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
     log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record =
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
```
```diff
@@ -1531,21 +1559,24 @@ class ServeCodeGen:
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool, tail: Optional[int]
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
                                   stream_controller: bool, follow: bool,
-                                  tail: Optional[int]) -> str:
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}, tail={tail})',
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
             'print(msg, flush=True)'
         ]
         return cls._build(code)
```
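Both codegen methods now emit a `kwargs` guard into the generated snippet, so the new `pool` argument is only forwarded when the remote controller's `serve_version` is at least 5; older controllers receive the pre-pool call signature unchanged. A rough rendering of the string the first method assembles (`_build` and `serve_version` live in the surrounding class and remote runtime, and are assumed here):

```python
# Hypothetical standalone rendering; in SkyPilot these lines are joined and
# wrapped by ServeCodeGen._build(), and serve_version is defined remotely.
service_name, replica_id, follow, tail, pool = 'svc', 1, True, None, False
code = [
    f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
    'msg = serve_utils.stream_replica_logs('
    f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
    '**kwargs)', 'print(msg, flush=True)'
]
print('; '.join(code))
# -> kwargs={} if serve_version < 5 else {"pool": False}; msg = serve_utils.
#    stream_replica_logs('svc', 1, follow=True, tail=None, **kwargs); ...
```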
sky/serve/server/core.py
CHANGED
```diff
@@ -1,9 +1,6 @@
 """SkyServe core APIs."""
-import pathlib
-import signal
-import threading
 import typing
-from typing import Any, Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from sky import backends
 from sky import exceptions
```
```diff
@@ -12,11 +9,8 @@ from sky.backends import backend_utils
 from sky.serve import serve_utils
 from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import command_runner
 from sky.utils import controller_utils
-from sky.utils import rich_utils
 from sky.utils import subprocess_utils
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
```
```diff
@@ -24,42 +18,6 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
-def _get_all_replica_targets(
-        service_name: str, backend: backends.CloudVmRayBackend,
-        handle: backends.CloudVmRayResourceHandle
-) -> Set[serve_utils.ServiceComponentTarget]:
-    """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name],
-                                                       pool=False)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    if not service_records:
-        raise ValueError(f'Service {service_name!r} not found.')
-    assert len(service_records) == 1
-    service_record = service_records[0]
-
-    return {
-        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
-                                           replica_info['replica_id'])
-        for replica_info in service_record['replica_info']
-    }
-
-
 @usage_lib.entrypoint
 def up(
     task: 'sky.Task',
```
```diff
@@ -277,59 +235,12 @@ def tail_logs(
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-
-
-
-
-
-
-
-    if target == serve_utils.ServiceComponent.REPLICA:
-        if replica_id is None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`replica_id` must be specified when using target=REPLICA.')
-    else:
-        if replica_id is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('`replica_id` must be None when using '
-                                 'target=CONTROLLER/LOAD_BALANCER.')
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-    if target != serve_utils.ServiceComponent.REPLICA:
-        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
-            service_name,
-            stream_controller=(
-                target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow,
-            tail=tail)
-    else:
-        assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
-                                                            replica_id,
-                                                            follow,
-                                                            tail=tail)
-
-    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
-    # kill the process, so we need to handle it manually here.
-    if threading.current_thread() is threading.main_thread():
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-    # Refer to the notes in
-    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-    backend.run_on_head(handle,
-                        code,
-                        stream_logs=True,
-                        process_stream=False,
-                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+    return impl.tail_logs(service_name,
+                          target=target,
+                          replica_id=replica_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint
```
```diff
@@ -374,104 +285,9 @@ def sync_down_logs(
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-
-
-
-
-
-
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-    backend: backends.CloudVmRayBackend = (
-        backend_utils.get_backend_from_handle(handle))
-
-    requested_components: Set[serve_utils.ServiceComponent] = set()
-    if not targets:
-        # No targets specified -> request all components
-        requested_components = {
-            serve_utils.ServiceComponent.CONTROLLER,
-            serve_utils.ServiceComponent.LOAD_BALANCER,
-            serve_utils.ServiceComponent.REPLICA
-        }
-    else:
-        # Parse provided targets
-        if isinstance(targets, (str, serve_utils.ServiceComponent)):
-            requested_components = {serve_utils.ServiceComponent(targets)}
-        else:  # list
-            requested_components = {
-                serve_utils.ServiceComponent(t) for t in targets
-            }
-
-    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
-    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.CONTROLLER))
-    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.LOAD_BALANCER))
-    if serve_utils.ServiceComponent.REPLICA in requested_components:
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Getting live replica infos...')):
-            replica_targets = _get_all_replica_targets(service_name, backend,
-                                                       handle)
-        if not replica_ids:
-            # Replica target requested but no specific IDs
-            # -> Get all replica logs
-            normalized_targets.update(replica_targets)
-        else:
-            # Replica target requested with specific IDs
-            requested_replica_targets = [
-                serve_utils.ServiceComponentTarget(
-                    serve_utils.ServiceComponent.REPLICA, rid)
-                for rid in replica_ids
-            ]
-            for target in requested_replica_targets:
-                if target not in replica_targets:
-                    logger.warning(f'Replica ID {target.replica_id} not found '
-                                   f'for {service_name}. Skipping...')
-                else:
-                    normalized_targets.add(target)
-
-    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
-        component = target.component
-        # We need to set one side of the pipe to a logs stream, and the other
-        # side to a file.
-        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
-        stream_logs_code: str
-
-        if component == serve_utils.ServiceComponent.CONTROLLER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=True,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=False,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.REPLICA:
-            replica_id = target.replica_id
-            assert replica_id is not None, service_name
-            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False, tail=tail)
-        else:
-            assert False, component
-
-        # Refer to the notes in
-        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-        backend.run_on_head(handle,
-                            stream_logs_code,
-                            stream_logs=False,
-                            process_stream=False,
-                            ssh_mode=command_runner.SshMode.INTERACTIVE,
-                            log_path=log_path)
-
-    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
-                                     list(normalized_targets))
-
-    return local_dir
+    return impl.sync_down_logs(service_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
```
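Both `tail_logs` and `sync_down_logs` keep their public signatures but now delegate to `sky/serve/server/impl.py` with `pool=False` hardwired, so the managed-jobs pool feature can reuse the same implementation with `pool=True`. A minimal sketch of this facade-over-shared-impl pattern (function names here are illustrative, not the actual `impl` API):

```python
from typing import Optional


def _tail_logs_impl(name: str, *, replica_id: Optional[int], follow: bool,
                    tail: Optional[int], pool: bool) -> None:
    # Shared implementation: `pool` only changes wording and a few branches.
    noun = 'pool' if pool else 'service'
    print(f'tailing logs for {noun} {name!r} '
          f'(replica={replica_id}, follow={follow}, tail={tail})')


def serve_tail_logs(name: str, replica_id: Optional[int] = None,
                    follow: bool = True, tail: Optional[int] = None) -> None:
    # Public `sky serve` facade: always a service.
    _tail_logs_impl(name, replica_id=replica_id, follow=follow, tail=tail,
                    pool=False)


def pool_tail_logs(name: str, worker_id: Optional[int] = None,
                   follow: bool = True, tail: Optional[int] = None) -> None:
    # Pools facade: same engine, pool wording.
    _tail_logs_impl(name, replica_id=worker_id, follow=follow, tail=tail,
                    pool=True)


serve_tail_logs('my-svc', replica_id=1)
pool_tail_logs('my-pool', worker_id=2, follow=False)
```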