skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/common.py
CHANGED
@@ -9,11 +9,13 @@ import json
|
|
9
9
|
import os
|
10
10
|
import pathlib
|
11
11
|
import re
|
12
|
+
import shutil
|
12
13
|
import subprocess
|
13
14
|
import sys
|
15
|
+
import tempfile
|
14
16
|
import time
|
15
17
|
import typing
|
16
|
-
from typing import Any, Dict, Literal, Optional, Tuple
|
18
|
+
from typing import Any, Dict, Literal, Optional, Tuple, Union
|
17
19
|
from urllib import parse
|
18
20
|
import uuid
|
19
21
|
|
@@ -27,6 +29,7 @@ from sky import skypilot_config
|
|
27
29
|
from sky.adaptors import common as adaptors_common
|
28
30
|
from sky.data import data_utils
|
29
31
|
from sky.server import constants as server_constants
|
32
|
+
from sky.server import rest
|
30
33
|
from sky.skylet import constants
|
31
34
|
from sky.usage import usage_lib
|
32
35
|
from sky.utils import annotations
|
@@ -240,9 +243,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
240
243
|
server_url = endpoint if endpoint is not None else get_server_url()
|
241
244
|
while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
|
242
245
|
try:
|
243
|
-
response =
|
244
|
-
|
245
|
-
|
246
|
+
response = rest.get(f'{server_url}/api/health',
|
247
|
+
timeout=2.5,
|
248
|
+
cookies=get_api_cookie_jar())
|
246
249
|
except requests.exceptions.Timeout:
|
247
250
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
248
251
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
@@ -327,6 +330,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
|
|
327
330
|
def _start_api_server(deploy: bool = False,
|
328
331
|
host: str = '127.0.0.1',
|
329
332
|
foreground: bool = False,
|
333
|
+
metrics: bool = False,
|
334
|
+
metrics_port: Optional[int] = None,
|
330
335
|
enable_basic_auth: bool = False):
|
331
336
|
"""Starts a SkyPilot API server locally."""
|
332
337
|
server_url = get_server_url(host)
|
@@ -357,10 +362,13 @@ def _start_api_server(deploy: bool = False,
|
|
357
362
|
args += ['--deploy']
|
358
363
|
if host is not None:
|
359
364
|
args += [f'--host={host}']
|
365
|
+
if metrics_port is not None:
|
366
|
+
args += [f'--metrics-port={metrics_port}']
|
360
367
|
|
361
368
|
if foreground:
|
362
369
|
# Replaces the current process with the API server
|
363
370
|
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
371
|
+
_set_metrics_env_var(os.environ, metrics, deploy)
|
364
372
|
if enable_basic_auth:
|
365
373
|
os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
366
374
|
os.execvp(args[0], args)
|
@@ -368,6 +376,10 @@ def _start_api_server(deploy: bool = False,
|
|
368
376
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
369
377
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
370
378
|
|
379
|
+
# For spawn mode, copy the environ to avoid polluting the SDK process.
|
380
|
+
server_env = os.environ.copy()
|
381
|
+
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
382
|
+
_set_metrics_env_var(server_env, metrics, deploy)
|
371
383
|
# Start the API server process in the background and don't wait for it.
|
372
384
|
# If this is called from a CLI invocation, we need
|
373
385
|
# start_new_session=True so that SIGINT on the CLI will not also kill
|
@@ -437,6 +449,26 @@ def _start_api_server(deploy: bool = False,
|
|
437
449
|
f'SkyPilot API server started. {dashboard_msg}'))
|
438
450
|
|
439
451
|
|
452
|
+
def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
|
453
|
+
deploy: bool):
|
454
|
+
"""Sets the metrics environment variables.
|
455
|
+
|
456
|
+
Args:
|
457
|
+
env: The environment variables to set.
|
458
|
+
metrics: Whether to enable metrics.
|
459
|
+
deploy: Whether the server is running in deploy mode, which means
|
460
|
+
multiple processes might be running.
|
461
|
+
"""
|
462
|
+
if metrics:
|
463
|
+
env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
|
464
|
+
if deploy:
|
465
|
+
metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
|
466
|
+
shutil.rmtree(metrics_dir, ignore_errors=True)
|
467
|
+
os.makedirs(metrics_dir, exist_ok=True)
|
468
|
+
# Refer to https://prometheus.github.io/client_python/multiprocess/
|
469
|
+
env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
|
470
|
+
|
471
|
+
|
440
472
|
def check_server_healthy(
|
441
473
|
endpoint: Optional[str] = None
|
442
474
|
) -> Tuple[Literal[
|
@@ -571,6 +603,8 @@ def get_skypilot_version_on_disk() -> str:
|
|
571
603
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
572
604
|
host: str = '127.0.0.1',
|
573
605
|
foreground: bool = False,
|
606
|
+
metrics: bool = False,
|
607
|
+
metrics_port: Optional[int] = None,
|
574
608
|
enable_basic_auth: bool = False):
|
575
609
|
api_server_status = None
|
576
610
|
try:
|
@@ -592,7 +626,8 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
592
626
|
# have started the server while we were waiting for the lock.
|
593
627
|
api_server_info = get_api_server_status(endpoint)
|
594
628
|
if api_server_info.status == ApiServerStatus.UNHEALTHY:
|
595
|
-
_start_api_server(deploy, host, foreground,
|
629
|
+
_start_api_server(deploy, host, foreground, metrics,
|
630
|
+
metrics_port, enable_basic_auth)
|
596
631
|
|
597
632
|
|
598
633
|
def check_server_healthy_or_start(func):
|
sky/server/constants.py
CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
|
|
7
7
|
# API server version, whenever there is a change in API server that requires a
|
8
8
|
# restart of the local API server or error out when the client does not match
|
9
9
|
# the server version.
|
10
|
-
API_VERSION = '
|
10
|
+
API_VERSION = '10'
|
11
11
|
|
12
12
|
# Prefix for API request names.
|
13
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
@@ -22,6 +22,10 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
|
|
22
22
|
# background.
|
23
23
|
CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
24
24
|
|
25
|
+
# The interval (seconds) for the volume status to be refreshed in the
|
26
|
+
# background.
|
27
|
+
VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
28
|
+
|
25
29
|
# Environment variable for a file path to the API cookie file.
|
26
30
|
# Keep in sync with websocket_proxy.py
|
27
31
|
API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
|
sky/server/metrics.py
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
"""Instrumentation for the API server."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import os
|
5
|
+
import time
|
6
|
+
|
7
|
+
import fastapi
|
8
|
+
from prometheus_client import generate_latest
|
9
|
+
from prometheus_client import multiprocess
|
10
|
+
import prometheus_client as prom
|
11
|
+
import starlette.middleware.base
|
12
|
+
import uvicorn
|
13
|
+
|
14
|
+
from sky import sky_logging
|
15
|
+
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
17
|
+
|
18
|
+
# Total number of API server requests, grouped by path, method, and status.
|
19
|
+
sky_apiserver_requests_total = prom.Counter(
|
20
|
+
'sky_apiserver_requests_total',
|
21
|
+
'Total number of API server requests',
|
22
|
+
['path', 'method', 'status'],
|
23
|
+
)
|
24
|
+
|
25
|
+
# Time spent processing API server requests, grouped by path, method, and
|
26
|
+
# status.
|
27
|
+
sky_apiserver_request_duration_seconds = prom.Histogram(
|
28
|
+
'sky_apiserver_request_duration_seconds',
|
29
|
+
'Time spent processing API server requests',
|
30
|
+
['path', 'method', 'status'],
|
31
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
|
32
|
+
float('inf')),
|
33
|
+
)
|
34
|
+
|
35
|
+
metrics_app = fastapi.FastAPI()
|
36
|
+
|
37
|
+
|
38
|
+
@metrics_app.get('/metrics')
|
39
|
+
async def metrics() -> fastapi.Response:
|
40
|
+
"""Expose aggregated Prometheus metrics from all worker processes."""
|
41
|
+
if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
|
42
|
+
# In multiprocess mode, we need to collect metrics from all processes.
|
43
|
+
registry = prom.CollectorRegistry()
|
44
|
+
multiprocess.MultiProcessCollector(registry)
|
45
|
+
data = generate_latest(registry)
|
46
|
+
else:
|
47
|
+
data = generate_latest()
|
48
|
+
return fastapi.Response(content=data,
|
49
|
+
media_type=prom.CONTENT_TYPE_LATEST,
|
50
|
+
headers={'Cache-Control': 'no-cache'})
|
51
|
+
|
52
|
+
|
53
|
+
def run_metrics_server(host: str, port: int):
|
54
|
+
metrics_config = uvicorn.Config(
|
55
|
+
'sky.server.metrics:metrics_app',
|
56
|
+
host=host,
|
57
|
+
port=port,
|
58
|
+
workers=1,
|
59
|
+
)
|
60
|
+
metrics_server_instance = uvicorn.Server(metrics_config)
|
61
|
+
asyncio.run(metrics_server_instance.serve())
|
62
|
+
|
63
|
+
|
64
|
+
def _get_status_code_group(status_code: int) -> str:
|
65
|
+
"""Group status codes into classes (2xx, 5xx) to reduce cardinality."""
|
66
|
+
return f'{status_code // 100}xx'
|
67
|
+
|
68
|
+
|
69
|
+
def _is_streaming_api(path: str) -> bool:
|
70
|
+
"""Check if the path is a streaming API."""
|
71
|
+
path = path.rstrip('/')
|
72
|
+
return path.endswith('/logs') or path.endswith('/api/stream')
|
73
|
+
|
74
|
+
|
75
|
+
class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
76
|
+
"""Middleware to collect Prometheus metrics for HTTP requests."""
|
77
|
+
|
78
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
79
|
+
path = request.url.path
|
80
|
+
logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
|
81
|
+
streaming = _is_streaming_api(path)
|
82
|
+
if not streaming:
|
83
|
+
# Exclude streaming APIs, the duration is not meaningful.
|
84
|
+
# TODO(aylei): measure the duration of async execution instead.
|
85
|
+
start_time = time.time()
|
86
|
+
method = request.method
|
87
|
+
status_code_group = ''
|
88
|
+
|
89
|
+
try:
|
90
|
+
response = await call_next(request)
|
91
|
+
status_code_group = _get_status_code_group(response.status_code)
|
92
|
+
except Exception: # pylint: disable=broad-except
|
93
|
+
status_code_group = '5xx'
|
94
|
+
raise
|
95
|
+
finally:
|
96
|
+
sky_apiserver_requests_total.labels(path=path,
|
97
|
+
method=method,
|
98
|
+
status=status_code_group).inc()
|
99
|
+
if not streaming:
|
100
|
+
duration = time.time() - start_time
|
101
|
+
sky_apiserver_request_duration_seconds.labels(
|
102
|
+
path=path, method=method,
|
103
|
+
status=status_code_group).observe(duration)
|
104
|
+
|
105
|
+
return response
|
sky/server/requests/executor.py
CHANGED
@@ -149,10 +149,25 @@ class RequestWorker:
|
|
149
149
|
self.schedule_type = schedule_type
|
150
150
|
self.garanteed_parallelism = config.garanteed_parallelism
|
151
151
|
self.burstable_parallelism = config.burstable_parallelism
|
152
|
+
self._thread: Optional[threading.Thread] = None
|
153
|
+
self._cancel_event = threading.Event()
|
152
154
|
|
153
155
|
def __str__(self) -> str:
|
154
156
|
return f'Worker(schedule_type={self.schedule_type.value})'
|
155
157
|
|
158
|
+
def run_in_background(self) -> None:
|
159
|
+
# Thread dispatcher is sufficient for current scale, refer to
|
160
|
+
# tests/load_tests/test_queue_dispatcher.py for more details.
|
161
|
+
# Use daemon thread for automatic cleanup.
|
162
|
+
thread = threading.Thread(target=self.run, daemon=True)
|
163
|
+
thread.start()
|
164
|
+
self._thread = thread
|
165
|
+
|
166
|
+
def cancel(self) -> None:
|
167
|
+
if self._thread is not None:
|
168
|
+
self._cancel_event.set()
|
169
|
+
self._thread.join()
|
170
|
+
|
156
171
|
def process_request(self, executor: process.BurstableExecutor,
|
157
172
|
queue: RequestQueue) -> None:
|
158
173
|
try:
|
@@ -219,7 +234,7 @@ class RequestWorker:
|
|
219
234
|
burst_workers=self.burstable_parallelism,
|
220
235
|
initializer=executor_initializer,
|
221
236
|
initargs=(proc_group,))
|
222
|
-
while True:
|
237
|
+
while not self._cancel_event.is_set():
|
223
238
|
self.process_request(executor, queue)
|
224
239
|
# TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
|
225
240
|
except KeyboardInterrupt:
|
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
|
|
539
554
|
enqueue()
|
540
555
|
|
541
556
|
|
542
|
-
def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
|
557
|
+
def start(
|
558
|
+
config: server_config.ServerConfig
|
559
|
+
) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
|
543
560
|
"""Start the request workers.
|
544
561
|
|
545
562
|
Request workers run in background, schedule the requests and delegate the
|
546
563
|
request execution to executor processes.
|
564
|
+
|
565
|
+
Returns:
|
566
|
+
A tuple of the queue server process and the list of request worker
|
567
|
+
threads.
|
547
568
|
"""
|
548
569
|
global queue_backend
|
549
570
|
queue_backend = config.queue_backend
|
550
|
-
|
571
|
+
queue_server = None
|
551
572
|
# Setup the queues.
|
552
573
|
if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
|
553
574
|
logger.info('Creating shared request queues')
|
@@ -564,7 +585,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
|
|
564
585
|
queue_server = multiprocessing.Process(
|
565
586
|
target=mp_queue.start_queue_manager, args=(queue_names, port))
|
566
587
|
queue_server.start()
|
567
|
-
sub_procs.append(queue_server)
|
568
588
|
mp_queue.wait_for_queues_to_be_ready(queue_names,
|
569
589
|
queue_server,
|
570
590
|
port=port)
|
@@ -577,20 +597,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
|
|
577
597
|
|
578
598
|
logger.info('Request queues created')
|
579
599
|
|
580
|
-
|
581
|
-
# Thread dispatcher is sufficient for current scale, refer to
|
582
|
-
# tests/load_tests/test_queue_dispatcher.py for more details.
|
583
|
-
# Use daemon thread for automatic cleanup.
|
584
|
-
thread = threading.Thread(target=worker.run, daemon=True)
|
585
|
-
thread.start()
|
586
|
-
|
600
|
+
workers = []
|
587
601
|
# Start a worker for long requests.
|
588
602
|
long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
|
589
603
|
config=config.long_worker_config)
|
590
|
-
|
604
|
+
long_worker.run_in_background()
|
605
|
+
workers.append(long_worker)
|
591
606
|
|
592
607
|
# Start a worker for short requests.
|
593
608
|
short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
|
594
609
|
config=config.short_worker_config)
|
595
|
-
|
596
|
-
|
610
|
+
short_worker.run_in_background()
|
611
|
+
workers.append(short_worker)
|
612
|
+
return queue_server, workers
|
sky/server/requests/payloads.py
CHANGED
@@ -5,7 +5,6 @@ kwargs for the payloads, otherwise, we have to keep the default values the sync
|
|
5
5
|
with the backend functions. The benefit of having the default values in the
|
6
6
|
payloads is that a user can find the default values in the Restful API docs.
|
7
7
|
"""
|
8
|
-
import getpass
|
9
8
|
import os
|
10
9
|
import typing
|
11
10
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
@@ -58,8 +57,7 @@ def request_body_env_vars() -> dict:
|
|
58
57
|
if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
|
59
58
|
env_vars[env_var] = os.environ[env_var]
|
60
59
|
env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
|
61
|
-
env_vars[constants.USER_ENV_VAR] =
|
62
|
-
getpass.getuser())
|
60
|
+
env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
|
63
61
|
env_vars[
|
64
62
|
usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
|
65
63
|
# Remove the path to config file, as the config content is included in the
|
@@ -370,6 +368,22 @@ class StorageBody(RequestBody):
|
|
370
368
|
name: str
|
371
369
|
|
372
370
|
|
371
|
+
class VolumeApplyBody(RequestBody):
|
372
|
+
"""The request body for the volume apply endpoint."""
|
373
|
+
name: str
|
374
|
+
volume_type: str
|
375
|
+
cloud: str
|
376
|
+
region: Optional[str] = None
|
377
|
+
zone: Optional[str] = None
|
378
|
+
size: Optional[str] = None
|
379
|
+
config: Optional[Dict[str, Any]] = None
|
380
|
+
|
381
|
+
|
382
|
+
class VolumeDeleteBody(RequestBody):
|
383
|
+
"""The request body for the volume delete endpoint."""
|
384
|
+
names: List[str]
|
385
|
+
|
386
|
+
|
373
387
|
class EndpointsBody(RequestBody):
|
374
388
|
"""The request body for the endpoint."""
|
375
389
|
cluster: str
|
@@ -613,3 +627,8 @@ class UpdateConfigBody(RequestBody):
|
|
613
627
|
class GetConfigBody(RequestBody):
|
614
628
|
"""The request body for getting the entire SkyPilot configuration."""
|
615
629
|
pass
|
630
|
+
|
631
|
+
|
632
|
+
class CostReportBody(RequestBody):
|
633
|
+
"""The request body for the cost report endpoint."""
|
634
|
+
days: Optional[int] = 30
|
sky/server/requests/requests.py
CHANGED
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
|
|
38
38
|
COL_CLUSTER_NAME = 'cluster_name'
|
39
39
|
COL_USER_ID = 'user_id'
|
40
40
|
COL_STATUS_MSG = 'status_msg'
|
41
|
+
COL_SHOULD_RETRY = 'should_retry'
|
41
42
|
REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
|
42
43
|
|
43
44
|
# TODO(zhwu): For scalability, there are several TODOs:
|
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
|
|
86
87
|
'schedule_type',
|
87
88
|
COL_USER_ID,
|
88
89
|
COL_STATUS_MSG,
|
90
|
+
COL_SHOULD_RETRY,
|
89
91
|
]
|
90
92
|
|
91
93
|
|
@@ -115,6 +117,7 @@ class RequestPayload:
|
|
115
117
|
# Resources the request operates on.
|
116
118
|
cluster_name: Optional[str] = None
|
117
119
|
status_msg: Optional[str] = None
|
120
|
+
should_retry: bool = False
|
118
121
|
|
119
122
|
|
120
123
|
@dataclasses.dataclass
|
@@ -137,6 +140,8 @@ class Request:
|
|
137
140
|
cluster_name: Optional[str] = None
|
138
141
|
# Status message of the request, indicates the reason of current status.
|
139
142
|
status_msg: Optional[str] = None
|
143
|
+
# Whether the request should be retried.
|
144
|
+
should_retry: bool = False
|
140
145
|
|
141
146
|
@property
|
142
147
|
def log_path(self) -> pathlib.Path:
|
@@ -222,6 +227,7 @@ class Request:
|
|
222
227
|
user_name=user_name,
|
223
228
|
cluster_name=self.cluster_name,
|
224
229
|
status_msg=self.status_msg,
|
230
|
+
should_retry=self.should_retry,
|
225
231
|
)
|
226
232
|
|
227
233
|
def encode(self) -> RequestPayload:
|
@@ -243,6 +249,7 @@ class Request:
|
|
243
249
|
user_id=self.user_id,
|
244
250
|
cluster_name=self.cluster_name,
|
245
251
|
status_msg=self.status_msg,
|
252
|
+
should_retry=self.should_retry,
|
246
253
|
)
|
247
254
|
except (TypeError, ValueError) as e:
|
248
255
|
# The error is unexpected, so we don't suppress the stack trace.
|
@@ -274,6 +281,7 @@ class Request:
|
|
274
281
|
user_id=payload.user_id,
|
275
282
|
cluster_name=payload.cluster_name,
|
276
283
|
status_msg=payload.status_msg,
|
284
|
+
should_retry=payload.should_retry,
|
277
285
|
)
|
278
286
|
except (TypeError, ValueError) as e:
|
279
287
|
logger.error(
|
@@ -327,6 +335,44 @@ def refresh_cluster_status_event():
|
|
327
335
|
time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
|
328
336
|
|
329
337
|
|
338
|
+
def refresh_volume_status_event():
|
339
|
+
"""Periodically refresh the volume status."""
|
340
|
+
# pylint: disable=import-outside-toplevel
|
341
|
+
from sky.volumes.server import core
|
342
|
+
|
343
|
+
# Disable logging for periodic refresh to avoid the usage message being
|
344
|
+
# sent multiple times.
|
345
|
+
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
346
|
+
|
347
|
+
while True:
|
348
|
+
logger.info('=== Refreshing volume status ===')
|
349
|
+
core.volume_refresh()
|
350
|
+
logger.info('Volume status refreshed. Sleeping '
|
351
|
+
f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
352
|
+
' seconds for the next refresh...\n')
|
353
|
+
time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
|
354
|
+
|
355
|
+
|
356
|
+
def managed_job_status_refresh_event():
|
357
|
+
"""Refresh the managed job status for controller consolidation mode."""
|
358
|
+
# pylint: disable=import-outside-toplevel
|
359
|
+
from sky.jobs import utils as managed_job_utils
|
360
|
+
if not managed_job_utils.is_consolidation_mode():
|
361
|
+
return
|
362
|
+
# We run the recovery logic before starting the event loop as those two are
|
363
|
+
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
364
|
+
from sky.utils import controller_utils
|
365
|
+
if controller_utils.high_availability_specified(
|
366
|
+
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
|
367
|
+
managed_job_utils.ha_recovery_for_consolidation_mode()
|
368
|
+
# After recovery, we start the event loop.
|
369
|
+
from sky.skylet import events
|
370
|
+
event = events.ManagedJobEvent()
|
371
|
+
while True:
|
372
|
+
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
373
|
+
event.run()
|
374
|
+
|
375
|
+
|
330
376
|
@dataclasses.dataclass
|
331
377
|
class InternalRequestDaemon:
|
332
378
|
id: str
|
@@ -341,7 +387,14 @@ INTERNAL_REQUEST_DAEMONS = [
|
|
341
387
|
# cluster being stopped or down when `sky status -r` is called.
|
342
388
|
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
343
389
|
name='status',
|
344
|
-
event_fn=refresh_cluster_status_event)
|
390
|
+
event_fn=refresh_cluster_status_event),
|
391
|
+
# Volume status refresh daemon to update the volume status periodically.
|
392
|
+
InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
|
393
|
+
name='volume',
|
394
|
+
event_fn=refresh_volume_status_event),
|
395
|
+
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
396
|
+
name='managed-job-status',
|
397
|
+
event_fn=managed_job_status_refresh_event),
|
345
398
|
]
|
346
399
|
|
347
400
|
|
@@ -423,10 +476,14 @@ def create_table(cursor, conn):
|
|
423
476
|
{COL_CLUSTER_NAME} TEXT,
|
424
477
|
schedule_type TEXT,
|
425
478
|
{COL_USER_ID} TEXT,
|
426
|
-
{COL_STATUS_MSG} TEXT
|
479
|
+
{COL_STATUS_MSG} TEXT,
|
480
|
+
{COL_SHOULD_RETRY} INTEGER
|
481
|
+
)""")
|
427
482
|
|
428
483
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
|
429
484
|
'TEXT')
|
485
|
+
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
|
486
|
+
'INTEGER')
|
430
487
|
|
431
488
|
|
432
489
|
_DB = None
|
sky/server/rest.py
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
"""REST API client of SkyPilot API server"""
|
2
|
+
|
3
|
+
import contextlib
|
4
|
+
import contextvars
|
5
|
+
import functools
|
6
|
+
import time
|
7
|
+
import typing
|
8
|
+
from typing import Any, Callable, cast, Optional, TypeVar
|
9
|
+
|
10
|
+
import colorama
|
11
|
+
|
12
|
+
from sky import exceptions
|
13
|
+
from sky import sky_logging
|
14
|
+
from sky.adaptors import common as adaptors_common
|
15
|
+
from sky.utils import common_utils
|
16
|
+
from sky.utils import rich_utils
|
17
|
+
from sky.utils import ux_utils
|
18
|
+
|
19
|
+
# Module-level logger for the REST client helpers in this file.
logger = sky_logging.init_logger(__name__)

# `requests` is imported for real only while type checking; at runtime the
# name is bound to a LazyImport wrapper (presumably deferring the actual
# import until first use -- confirm in sky.adaptors.common).
if typing.TYPE_CHECKING:
    import requests

else:
    requests = adaptors_common.LazyImport('requests')

# Type variable used by decorators in this module so that the decorated
# callable keeps its own type.
F = TypeVar('F', bound=Callable[..., Any])

# Context variable holding the RetryContext installed by _retry_in_context();
# its default value is None.
_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
|
30
|
+
|
31
|
+
|
32
|
+
class RetryContext:
    """State carried by a retry context.

    An instance is created by _retry_in_context() and exposed to callees
    through get_retry_context().
    """

    def __init__(self) -> None:
        # Counter that starts at zero for every new context.
        # NOTE(review): presumably the number of streamed lines already
        # handled, so that a retried call can resume instead of repeating
        # output -- confirm against the code that updates it.
        self.line_processed: int = 0
|
36
|
+
|
37
|
+
|
38
|
+
def retry_on_server_unavailable(max_wait_seconds: int = 600,
                                initial_backoff: float = 5.0,
                                max_backoff_factor: int = 5):
    """Decorator that retries a function when ServerTemporarilyUnavailableError
    is caught.

    All attempts of one decorated call run inside a single retry context
    (installed via _retry_in_context()), which the decorated function can
    read through get_retry_context().

    Args:
        max_wait_seconds: Maximum number of seconds to wait for the server to
            be healthy. The deadline is only checked after a failed attempt
            and before sleeping, so the total time spent can exceed this
            value by the duration of the last backoff and attempt.
        initial_backoff: Initial backoff time in seconds.
        max_backoff_factor: Maximum backoff factor for exponential backoff.

    Returns:
        A decorator; the wrapped function returns whatever the first
        successful attempt of the decorated function returns.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the decorated
            function still raises this error once more than max_wait_seconds
            have elapsed since the first attempt started. The last error is
            chained as the cause. Any other exception raised by the decorated
            function propagates immediately without retry.

    Notes(dev):
        This decorator is mainly used in two scenarios:
        1. Decorate a RESTful API call to make the API call wait for server
           recovery when server is temporarily unavailable. APIs like /api/get
           and /api/stream should not be retried since sending them to a new
           replica of API server will not work.
        2. Decorate a SDK function to make the entire SDK function call get
           retried when /api/get or /logs raises a retryable error. This
           is typically triggered by a graceful upgrade of the API server,
           where the pending requests and logs requests will be interrupted.
    """

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            # Spinner text shown to the user while waiting between attempts.
            msg = (f'{colorama.Fore.YELLOW}API server is temporarily '
                   'unavailable: upgrade in progress. Waiting to resume...'
                   f'{colorama.Style.RESET_ALL}')
            backoff = common_utils.Backoff(
                initial_backoff=initial_backoff,
                max_backoff_factor=max_backoff_factor)
            start_time = time.time()
            attempt = 0

            # One retry context spans every attempt of this call.
            with _retry_in_context():
                while True:
                    attempt += 1
                    try:
                        return func(*args, **kwargs)
                    except exceptions.ServerTemporarilyUnavailableError as e:
                        # This will cause the status spinner being stopped and
                        # restarted in every retry loop. But it is necessary to
                        # stop the status spinner before retrying func() to
                        # avoid the status spinner get stuck if the func() runs
                        # for a long time without update status, e.g. sky logs.
                        with rich_utils.client_status(msg):
                            if time.time() - start_time > max_wait_seconds:
                                # Give up, keeping the last failure as cause.
                                # pylint: disable=line-too-long
                                raise exceptions.ServerTemporarilyUnavailableError(
                                    'Timeout waiting for the API server to be '
                                    f'available after {max_wait_seconds}s.') \
                                    from e

                            sleep_time = backoff.current_backoff()
                            time.sleep(sleep_time)
                            logger.debug('The API server is unavailable. '
                                         f'Retrying {func.__name__} '
                                         f'(attempt {attempt}, '
                                         f'backoff {sleep_time}s).')

        return cast(F, wrapper)

    return decorator
|
104
|
+
|
105
|
+
|
106
|
+
@contextlib.contextmanager
def _retry_in_context():
    """Install a fresh RetryContext for the duration of the with-block.

    The previous value of the context variable is restored on exit, whether
    the block finishes normally or raises.
    """
    reset_token = _RETRY_CONTEXT.set(RetryContext())
    try:
        yield
    finally:
        # Undo exactly the set() above.
        _RETRY_CONTEXT.reset(reset_token)
|
113
|
+
|
114
|
+
|
115
|
+
def get_retry_context() -> Optional[RetryContext]:
    """Return the current value of the retry context variable.

    This is the RetryContext installed by _retry_in_context() when called
    inside such a block; the variable's default value is None.
    """
    current = _RETRY_CONTEXT.get()
    return current
|
117
|
+
|
118
|
+
|
119
|
+
def handle_server_unavailable(response: 'requests.Response') -> None:
    """Raise if the response reports that the API server is unavailable.

    Args:
        response: The HTTP response to inspect; only its status code is read.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the status code is
            503. Raised without a traceback being printed.
    """
    if response.status_code != 503:
        return
    # TODO(aylei): Hacky, depends on how nginx controller handles backends
    # with no ready endpoints. Should use self-defined status code or header
    # to distinguish retryable server error from general 503 errors.
    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(
            'SkyPilot API server is temporarily unavailable. '
            'Please try again later.')
|
128
|
+
|
129
|
+
|
130
|
+
@retry_on_server_unavailable()
def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
    """POST to the API server, retrying while it is temporarily unavailable.

    Arguments are forwarded unchanged to requests.post(). A 503 response is
    turned into ServerTemporarilyUnavailableError by
    handle_server_unavailable(), which the decorator retries.
    """
    resp = requests.post(url, data=data, json=json, **kwargs)
    handle_server_unavailable(resp)
    return resp
|
137
|
+
|
138
|
+
|
139
|
+
@retry_on_server_unavailable()
def get(url, params=None, **kwargs) -> 'requests.Response':
    """GET from the API server, retrying while it is temporarily unavailable.

    Arguments are forwarded unchanged to requests.get(). A 503 response is
    turned into ServerTemporarilyUnavailableError by
    handle_server_unavailable(), which the decorator retries.
    """
    resp = requests.get(url, params=params, **kwargs)
    handle_server_unavailable(resp)
    return resp
|
146
|
+
|
147
|
+
|
148
|
+
def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
    """GET from the API server once, without any retry.

    Arguments are forwarded unchanged to requests.get(). A 503 response still
    raises ServerTemporarilyUnavailableError via handle_server_unavailable();
    it is left to the caller to handle.
    """
    resp = requests.get(url, params=params, **kwargs)
    handle_server_unavailable(resp)
    return resp
|