skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -57,21 +57,16 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
-
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
 # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
 # when changing UX as this assumption is used to expand some log files while
 # ignoring others.
 _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
-
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
 _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.

@@ -524,6 +519,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 

@@ -796,9 +793,13 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
-def 
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
     logger.info(f'Get number of replicas for pool {service_name!r}')
-    return 
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
 
 
 def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:

@@ -823,12 +824,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
         logger.error(f'Service {service_name!r} is not a cluster pool.')
         return None
     with filelock.FileLock(get_service_filelock_path(service_name)):
-
         logger.debug(f'Get next cluster name for pool {service_name!r}')
-        ready_replicas = 
-            info for info in serve_state.get_replica_infos(service_name)
-            if info.status == serve_state.ReplicaStatus.READY
-        ]
+        ready_replicas = get_ready_replicas(service_name)
         idle_replicas: List['replica_managers.ReplicaInfo'] = []
         for replica_info in ready_replicas:
             jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(

@@ -1044,11 +1041,18 @@ def wait_service_registration(service_name: str, job_id: int,
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-            … (5 removed lines not shown in the source diff view)
+            else:
+                controller_log_path = os.path.expanduser(
+                    generate_remote_controller_log_file_name(service_name))
+                if os.path.exists(controller_log_path):
+                    with open(controller_log_path, 'r', encoding='utf-8') as f:
+                        log_content = f.read()
+                    if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                            in log_content):
+                        with ux_utils.print_exception_no_traceback():
+                            raise RuntimeError('Max number of services reached. '
+                                               'To spin up more services, please '
+                                               'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.

@@ -1113,31 +1117,49 @@ def _process_line(line: str,
             return False
         return cluster_record['status'] == status_lib.ClusterStatus.UP
 
-
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
-
-        log_path = provision_log_prompt.group(1)
-        nested_log_path = pathlib.Path(
-            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
-                log_path).resolve()
-
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
         try:
-            with open(
-                #
-                # to avoid any internal bug that causes the launch to fail
-                # while cluster status remains INIT.
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
                 yield from log_utils.follow_logs(f,
                                                  should_stop=cluster_is_up,
                                                  stop_on_eof=stop_on_eof,
                                                  idle_timeout_seconds=10)
         except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
             yield line
-
             yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                   f'Try to expand log file {
-                   f'
-
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
         return
 
     if log_prompt is not None:
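The order of these checks in `_process_line` matters: the two provision-specific hints are tried before the generic `.log` hint, so a provision line is expanded through `_stream_provision_path` rather than the generic expansion. Below is a minimal, standalone sketch of that classification using the regexes from the hunk above; the sample log lines are made up for illustration and are not real SkyPilot output.

import re

# Patterns copied from the diff above.
_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

# Hypothetical hint lines, only to show which branch each one takes.
samples = [
    'Run: sky api logs -l sky-2025-08-14/provision.log',
    'Run: sky logs --provision my-cluster',
    'Run: sky api logs -l sky-2025-08-14/run.log',
]
for line in samples:
    if re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, line):
        kind = 'provision log, resolved under SKY_LOGS_DIRECTORY'
    elif re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, line):
        kind = 'provision log, resolved via the cluster tables'
    elif re.match(_SKYPILOT_LOG_PATTERN, line):
        kind = 'generic log file hint'
    else:
        kind = 'plain line'
    print(f'{line!r} -> {kind}')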
sky/serve/server/impl.py
CHANGED
@@ -11,7 +11,6 @@ import uuid
 import colorama
 import filelock
 
-import sky
 from sky import backends
 from sky import exceptions
 from sky import execution

@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
+from sky.skylet import job_lib
 from sky.utils import admin_policy_utils
 from sky.utils import command_runner
 from sky.utils import common

@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
 
 
 def _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name: str, task: '
+        service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
     """Rewrite the paths of TLS credentials in the task.
 
     Args:

@@ -103,15 +103,11 @@ def _get_service_record(
 
 
 def up(
-    task: '
+    task: 'task_lib.Task',
     service_name: Optional[str] = None,
     pool: bool = False,
 ) -> Tuple[str, str]:
     """Spins up a service or a pool."""
-    if pool and not serve_utils.is_consolidation_mode(pool):
-        raise ValueError(
-            'Pool is only supported in consolidation mode. To fix, set '
-            '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
     task.validate()
     serve_utils.validate_service_task(task, pool=pool)
     assert task.service is not None

@@ -191,8 +187,7 @@ def up(
     controller_log_file = (
         serve_utils.generate_remote_controller_log_file_name(service_name))
     controller_resources = controller_utils.get_controller_resources(
-        controller=
-        task_resources=task.resources)
+        controller=controller, task_resources=task.resources)
     controller_job_id = None
     if serve_utils.is_consolidation_mode(pool):
         # We need a unique integer per sky.serve.up call to avoid name

@@ -228,10 +223,11 @@ def up(
     # balancer port from the controller? So we don't need to open so many
     # ports here. Or, we should have a nginx traffic control to refuse
     # any connection to the unregistered ports.
-    … (4 removed lines not shown in the source diff view)
+    if not pool:
+        controller_resources = {
+            r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
+            for r in controller_resources
+        }
     controller_task.set_resources(controller_resources)
 
     # # Set service_name so the backend will know to modify default ray

@@ -325,7 +321,7 @@ def up(
             [controller_job_id],
             stream_logs=False)
         controller_job_status = list(statuses.values())[0]
-        if controller_job_status ==
+        if controller_job_status == job_lib.JobStatus.PENDING:
             # Max number of services reached due to vCPU constraint.
             # The controller job is pending due to ray job scheduling.
             # We manually cancel the job here.

@@ -350,7 +346,7 @@ def up(
         else:
             lb_port = serve_utils.load_service_initialization_result(
                 lb_port_payload)
-            if not serve_utils.is_consolidation_mode(pool):
+            if not serve_utils.is_consolidation_mode(pool) and not pool:
                 socket_endpoint = backend_utils.get_endpoints(
                     controller_handle.cluster_name,
                     lb_port,

@@ -374,10 +370,10 @@ def up(
             f'\n📋 Useful Commands'
             f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
             f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'<
+            f'<yaml_file>{ux_utils.RESET_BOLD}'
             f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
             f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'--num-jobs 10 <
+            f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
             f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
             f'{ux_utils.BOLD}sky jobs pool status {service_name}'
             f'{ux_utils.RESET_BOLD}'

@@ -421,7 +417,7 @@ def up(
 
 
 def update(
-    task: '
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,

@@ -576,7 +572,7 @@ def update(
 
 
 def apply(
-    task: '
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
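The `if not pool:` guard above means that only services get the load-balancer port range opened on every controller resource candidate; pools skip it. A small self-contained sketch of that guarded set comprehension follows; `FakeResources`, the port-range value, and `maybe_open_lb_ports` are stand-ins for illustration only, not SkyPilot APIs.

from dataclasses import dataclass, replace
from typing import Optional, Tuple

LOAD_BALANCER_PORT_RANGE = '30001-30100'  # illustrative value, not authoritative

@dataclass(frozen=True)
class FakeResources:
    # Minimal stand-in for sky.resources.Resources.
    cloud: str
    ports: Optional[Tuple[str, ...]] = None  # tuple keeps the dataclass hashable

    def copy(self, **overrides) -> 'FakeResources':
        return replace(self, **overrides)

def maybe_open_lb_ports(candidates, pool):
    # Mirrors the guarded set comprehension in the hunk above.
    if not pool:
        return {r.copy(ports=(LOAD_BALANCER_PORT_RANGE,)) for r in candidates}
    return candidates

candidates = {FakeResources('aws'), FakeResources('gcp')}
print(maybe_open_lb_ports(candidates, pool=False))  # ports set on every candidate
print(maybe_open_lb_ports(candidates, pool=True))   # unchanged for a pool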
sky/serve/service.py
CHANGED
@@ -15,11 +15,13 @@ import filelock
 
 from sky import authentication
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer

@@ -28,6 +30,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 

@@ -110,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
     return not failed
 
 
+# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
+# because we killed all the processes (controller & replica manager) before
+# calling this function.
 def _cleanup(service_name: str) -> bool:
     """Clean up all service related resources, i.e. replicas and storage."""
     # Cleanup the HA recovery script first as it is possible that some error

@@ -120,31 +126,71 @@ def _cleanup(service_name: str) -> bool:
     replica_infos = serve_state.get_replica_infos(service_name)
     info2proc: Dict[replica_managers.ReplicaInfo,
                     multiprocessing.Process] = dict()
+    # NOTE(dev): This relies on `sky/serve/serve_utils.py::
+    # generate_replica_cluster_name`. Change it if you change the function.
+    existing_cluster_names = global_user_state.get_cluster_names_start_with(
+        service_name)
     for info in replica_infos:
+        if info.cluster_name not in existing_cluster_names:
+            logger.info(f'Cluster {info.cluster_name} for replica '
+                        f'{info.replica_id} not found. Might be a failed '
+                        'cluster. Skipping.')
+            continue
         p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                     args=(info.cluster_name,))
-        p.start()
         info2proc[info] = p
         # Set replica status to `SHUTTING_DOWN`
         info.status_property.sky_launch_status = (
-            replica_managers.ProcessStatus.SUCCEEDED)
+            replica_managers.common_utils.ProcessStatus.SUCCEEDED)
         info.status_property.sky_down_status = (
-            replica_managers.ProcessStatus.
+            replica_managers.common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(service_name, info.replica_id, info)
-        logger.info(f'
-        … (13 removed lines not shown in the source diff view)
+        logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
+
+    def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
+        nonlocal failed
+        # Set replica status to `FAILED_CLEANUP`
+        info.status_property.sky_down_status = (
+            replica_managers.common_utils.ProcessStatus.FAILED)
+        serve_state.add_or_update_replica(service_name, info.replica_id, info)
+        failed = True
+        logger.error(f'Replica {info.replica_id} failed to terminate.')
+
+    # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
+    # TODO(tian): Refactor to use the same logic and code.
+    while info2proc:
+        snapshot = list(info2proc.items())
+        for info, p in snapshot:
+            if p.is_alive():
+                continue
+            if (info.status_property.sky_down_status ==
+                    replica_managers.common_utils.ProcessStatus.SCHEDULED):
+                if controller_utils.can_terminate():
+                    try:
+                        p.start()
+                    except Exception as e:  # pylint: disable=broad-except
+                        _set_to_failed_cleanup(info)
+                        logger.error(f'Failed to start process for replica '
+                                     f'{info.replica_id}: {e}')
+                        del info2proc[info]
+                    else:
+                        info.status_property.sky_down_status = (
+                            common_utils.ProcessStatus.RUNNING)
+                        serve_state.add_or_update_replica(
+                            service_name, info.replica_id, info)
+            else:
+                logger.info('Terminate process for replica '
+                            f'{info.replica_id} finished.')
+                p.join()
+                del info2proc[info]
+                if p.exitcode == 0:
+                    serve_state.remove_replica(service_name, info.replica_id)
+                    logger.info(
+                        f'Replica {info.replica_id} terminated successfully.')
+                else:
+                    _set_to_failed_cleanup(info)
+        time.sleep(3)
+
     versions = serve_state.get_service_versions(service_name)
     serve_state.remove_service_versions(service_name)
 

@@ -214,22 +260,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
             service_name, version)
 
     if not is_recovery:
-        … (16 removed lines not shown in the source diff view)
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
+            if not controller_utils.can_start_new_process():
+                cleanup_storage(tmp_task_yaml)
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
+            success = serve_state.add_service(
+                service_name,
+                controller_job_id=job_id,
+                policy=service_spec.autoscaling_policy_str(),
+                requested_resources_str=backend_utils.get_task_resources_str(
+                    task),
+                load_balancing_policy=service_spec.load_balancing_policy,
+                status=serve_state.ServiceStatus.CONTROLLER_INIT,
+                tls_encrypted=service_spec.tls_credential is not None,
+                pool=service_spec.pool,
+                controller_pid=os.getpid(),
+                entrypoint=entrypoint)
+            jobs_scheduler.maybe_schedule_next_jobs()
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
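The reworked `_cleanup` above creates one termination process per replica but defers `p.start()` until `controller_utils.can_terminate()` reports spare capacity, then reaps finished processes and marks non-zero exits as `FAILED_CLEANUP`. The snippet below is a standalone sketch of that schedule-then-reap pattern with a plain numeric capacity limit standing in for `can_terminate()`; none of the names are SkyPilot APIs.

import multiprocessing
import time

MAX_PARALLEL = 2  # stand-in for the controller's capacity check

def terminate(name: str) -> None:
    time.sleep(0.2)  # pretend to tear down a replica cluster

def run_all(names):
    # Create the processes up front, but do not start them yet.
    pending = {n: multiprocessing.Process(target=terminate, args=(n,))
               for n in names}
    running, failed = {}, []
    while pending or running:
        # Reap finished processes and record failures (non-zero exit code).
        for n, p in list(running.items()):
            if not p.is_alive():
                p.join()
                if p.exitcode != 0:
                    failed.append(n)
                del running[n]
        # Start more work only while under the capacity limit.
        while pending and len(running) < MAX_PARALLEL:
            n, p = pending.popitem()
            p.start()
            running[n] = p
        time.sleep(0.1)
    return failed

if __name__ == '__main__':
    print('failed:', run_all([f'replica-{i}' for i in range(5)]))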
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 
+API_VERSION = 17
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/payloads.py
CHANGED
@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
     skip_finished: bool = False
     all_users: bool = False
     job_ids: Optional[List[int]] = None
+    user_match: Optional[str] = None
+    workspace_match: Optional[str] = None
+    name_match: Optional[str] = None
+    pool_match: Optional[str] = None
+    page: Optional[int] = None
+    limit: Optional[int] = None
 
 
 class JobsCancelBody(RequestBody):

sky/server/requests/serializers/decoders.py
CHANGED

@@ -102,8 +102,18 @@ def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
 
 
 @register_decoders('jobs.queue')
-def decode_jobs_queue(return_value
-    jobs
+def decode_jobs_queue(return_value):
+    """Decode jobs queue response.
+
+    Supports legacy list, or a dict {jobs, total}.
+    - Returns list[job]
+    """
+    # Case 1: dict shape {jobs, total}
+    if isinstance(return_value, dict) and 'jobs' in return_value:
+        jobs = return_value.get('jobs', [])
+    else:
+        # Case 2: legacy list
+        jobs = return_value
     for job in jobs:
         job['status'] = managed_jobs.ManagedJobStatus(job['status'])
     return jobs

sky/server/requests/serializers/encoders.py
CHANGED

@@ -106,10 +106,18 @@ def encode_status_kubernetes(
 
 
 @register_encoder('jobs.queue')
-def encode_jobs_queue(
+def encode_jobs_queue(jobs_or_tuple):
+    # Support returning either a plain jobs list or a (jobs, total) tuple
+    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
+        jobs, total = jobs_or_tuple
+    else:
+        jobs = jobs_or_tuple
+        total = None
     for job in jobs:
         job['status'] = job['status'].value
-
+    if total is None:
+        return jobs
+    return {'jobs': jobs, 'total': total}
 
 
 def _encode_serve_status(
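Taken together, the encoder and decoder changes keep `jobs.queue` backward compatible: a newer server can return a paginated `{jobs, total}` dict (matching the new `page`/`limit` fields on `JobsQueueBody`) while older peers keep exchanging a plain list. A self-contained sketch of that round-trip; the status-enum conversion is dropped here so the snippet runs without SkyPilot installed.

def encode_jobs_queue(jobs_or_tuple):
    # Accept either a plain jobs list or a (jobs, total) tuple.
    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
        jobs, total = jobs_or_tuple
    else:
        jobs, total = jobs_or_tuple, None
    return jobs if total is None else {'jobs': jobs, 'total': total}

def decode_jobs_queue(return_value):
    # Accept either the new {jobs, total} dict or the legacy list.
    if isinstance(return_value, dict) and 'jobs' in return_value:
        return return_value.get('jobs', [])
    return return_value

# Paginated server response: (jobs, total) -> {'jobs': [...], 'total': N}.
page = encode_jobs_queue(([{'job_id': 1, 'status': 'RUNNING'}], 42))
assert decode_jobs_queue(page) == [{'job_id': 1, 'status': 'RUNNING'}]
# Legacy list passes through unchanged on both sides.
legacy = encode_jobs_queue([{'job_id': 2, 'status': 'SUCCEEDED'}])
assert decode_jobs_queue(legacy) == [{'job_id': 2, 'status': 'SUCCEEDED'}]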
sky/server/server.py
CHANGED
@@ -17,7 +17,7 @@ import resource
 import shutil
 import sys
 import threading
-from typing import 
+from typing import Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
 

@@ -42,6 +42,7 @@ from sky.data import storage_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config

@@ -791,8 +792,6 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
-        # Resolve the volumes before admin policy and validation.
-        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of

@@ -801,6 +800,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         with admin_policy_utils.apply_and_use_config_in_current_request(
                 dag,
                 request_options=validate_body.get_request_options()) as dag:
+            dag.resolve_and_validate_volumes()
             # Skip validating workdir and file_mounts, as those need to be
             # validated after the files are uploaded to the SkyPilot API server
             # with `upload_mounts_to_api_server`.

@@ -1283,6 +1283,46 @@ async def download(download_body: payloads.DownloadBody) -> None:
             detail=f'Error creating zip file: {str(e)}')
 
 
+@app.post('/provision_logs')
+async def provision_logs(cluster_body: payloads.ClusterNameBody,
+                         follow: bool = True,
+                         tail: int = 0) -> fastapi.responses.StreamingResponse:
+    """Streams the provision.log for the latest launch request of a cluster."""
+    # Prefer clusters table first, then cluster_history as fallback.
+    log_path_str = global_user_state.get_cluster_provision_log_path(
+        cluster_body.cluster_name)
+    if not log_path_str:
+        log_path_str = global_user_state.get_cluster_history_provision_log_path(
+            cluster_body.cluster_name)
+    if not log_path_str:
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=('Provision log path is not recorded for this cluster. '
+                    'Please relaunch to generate provisioning logs.'))
+
+    log_path = pathlib.Path(log_path_str).expanduser().resolve()
+    if not log_path.exists():
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=f'Provision log path does not exist: {str(log_path)}')
+
+    # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
+    effective_tail = None if tail is None or tail <= 0 else tail
+
+    return fastapi.responses.StreamingResponse(
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow),
+        media_type='text/plain',
+        headers={
+            'Cache-Control': 'no-cache, no-transform',
+            'X-Accel-Buffering': 'no',
+            'Transfer-Encoding': 'chunked',
+        },
+    )
+
+
 @app.post('/cost_report')
 async def cost_report(request: fastapi.Request,
                       cost_report_body: payloads.CostReportBody) -> None:
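A hedged sketch of how a client might consume the new `/provision_logs` route above: it posts the cluster name in the body (mirroring `cluster_body.cluster_name` in the handler) and reads the `text/plain` stream line by line. The server URL, and treating `{'cluster_name': ...}` as a sufficient request body, are assumptions here; `follow` and `tail` are the query parameters visible in the hunk.

import requests

SERVER = 'http://localhost:46580'  # assumed local API server address
resp = requests.post(
    f'{SERVER}/provision_logs',
    params={'follow': 'false', 'tail': 100},  # tail=0 would mean all lines
    json={'cluster_name': 'my-cluster'},      # hypothetical cluster name
    stream=True,
    timeout=30,
)
resp.raise_for_status()
for line in resp.iter_lines(decode_unicode=True):
    print(line)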
@@ -1531,8 +1571,12 @@ async def api_status(
     return encoded_request_tasks
 
 
-@app.get(
-
+@app.get(
+    '/api/health',
+    # response_model_exclude_unset omits unset fields
+    # in the response JSON.
+    response_model_exclude_unset=True)
+async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     """Checks the health of the API server.
 
     Returns:

@@ -1570,7 +1614,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
         # - There is no harm when an malicious client calls /api/health
         #   without authentication since no sensitive information is
         #   returned.
-        return 
+        return responses.APIHealthResponse(
+            status=common.ApiServerStatus.HEALTHY,)
     # TODO(aylei): remove this after min_compatible_api_version >= 14.
     if client_version < 14:
         # For Client with API version < 14, the NEEDS_AUTH status is not

@@ -1579,19 +1624,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
                 detail='Authentication required')
 
     logger.debug(f'Health endpoint: request.state.auth_user = {user}')
-    return 
-
+    return responses.APIHealthResponse(
+        status=server_status,
         # Kept for backward compatibility, clients before 0.11.0 will read this
         # field to check compatibility and hint the user to upgrade the CLI.
         # TODO(aylei): remove this field after 0.13.0
-        … (8 removed lines not shown in the source diff view)
+        api_version=str(server_constants.API_VERSION),
+        version=sky.__version__,
+        version_on_disk=common.get_skypilot_version_on_disk(),
+        commit=sky.__commit__,
+        basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
+                                          'false').lower() == 'true',
+        user=user if user is not None else None,
+    )
 
 
 @app.websocket('/kubernetes-pod-ssh-proxy')

@@ -1809,6 +1854,9 @@ if __name__ == '__main__':
     global_tasks.append(background.create_task(metrics_server.serve()))
     global_tasks.append(
         background.create_task(requests_lib.requests_gc_daemon()))
+    global_tasks.append(
+        background.create_task(
+            global_user_state.cluster_event_retention_daemon()))
     threading.Thread(target=background.run_forever, daemon=True).start()
 
     queue_server, workers = executor.start(config)
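With the change above, `/api/health` now returns a typed `responses.APIHealthResponse` and omits unset fields (`response_model_exclude_unset=True`), so the minimal unauthenticated-but-healthy reply carries only `status`. A hedged probe of the endpoint, listing only the fields visible in the diff; the server URL is an assumption.

import requests

SERVER = 'http://localhost:46580'  # assumed local API server address
resp = requests.get(f'{SERVER}/api/health', timeout=10)
resp.raise_for_status()
payload = resp.json()
for key in ('status', 'api_version', 'version', 'version_on_disk', 'commit',
            'basic_auth_enabled', 'user'):
    # Fields the server left unset are simply absent from the JSON.
    print(key, '=', payload.get(key))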