skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/common.py
CHANGED

@@ -539,19 +539,27 @@ def _start_api_server(deploy: bool = False,
                 'is not a local URL')
 
         # Check available memory before starting the server.
-        … (13 removed lines, truncated in the source rendering)
+        # Skip this warning if postgres is used, as:
+        # 1) that's almost certainly a remote API server;
+        # 2) the actual consolidation mode config is stashed in the database,
+        #    and the value of `job_utils.is_consolidation_mode` will not be
+        #    the actual value in the db, but only None as in this case, the
+        #    whole YAML config is really just `db: <URI>`.
+        if skypilot_config.get_nested(('db',), None) is None:
+            avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+            # pylint: disable=import-outside-toplevel
+            import sky.jobs.utils as job_utils
+            max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                          if job_utils.is_consolidation_mode(
+                              on_api_restart=True) else
+                          server_constants.MIN_AVAIL_MEM_GB)
+            if avail_mem_size_gb <= max_memory:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                    f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                    f'At least {max_memory}GB is recommended to support higher '
+                    'load with better performance.'
+                    f'{colorama.Style.RESET_ALL}')
 
         args = [sys.executable, *API_SERVER_CMD.split()]
         if deploy:

@@ -560,8 +568,6 @@ def _start_api_server(deploy: bool = False,
         args += [f'--host={host}']
         if metrics_port is not None:
             args += [f'--metrics-port={metrics_port}']
-        # Use this argument to disable the internal signal file check.
-        args += ['--start-with-python']
 
         if foreground:
             # Replaces the current process with the API server

sky/server/constants.py
CHANGED

@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = …
+API_VERSION = 22
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):

sky/server/daemons.py
CHANGED

@@ -7,6 +7,7 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import env_options

@@ -26,7 +27,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name: …
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip

@@ -195,26 +196,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(…
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(…
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name=…
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(…
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
 
 

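The `name` field of `InternalRequestDaemon` now takes a `request_names.RequestName` member instead of a bare string. A rough, self-contained sketch of that pattern; the dataclass and enum here are illustrative stand-ins mirroring the field names in the diff, not the actual SkyPilot classes:

    import dataclasses
    import enum
    from typing import Callable


    class RequestName(str, enum.Enum):
        # Two members copied from the new request_names module.
        REQUEST_DAEMON_STATUS_REFRESH = 'status-refresh'
        REQUEST_DAEMON_VOLUME_REFRESH = 'volume-refresh'


    @dataclasses.dataclass
    class InternalRequestDaemon:
        """Illustrative stand-in for sky.server.daemons.InternalRequestDaemon."""
        id: str
        name: RequestName
        event_fn: Callable[[], None]
        default_log_level: str = 'INFO'


    daemon = InternalRequestDaemon(
        id='skypilot-status-refresh-daemon',
        name=RequestName.REQUEST_DAEMON_STATUS_REFRESH,
        event_fn=lambda: None,
        default_log_level='DEBUG')

    # Because RequestName subclasses str, the enum member still behaves like the
    # old plain-string name wherever string comparison is involved.
    assert daemon.name == 'status-refresh'
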
sky/server/requests/executor.py
CHANGED

@@ -47,6 +47,7 @@ from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.server.requests import threads
 from sky.server.requests.queues import local_queue

@@ -395,7 +396,10 @@ def _request_execution_wrapper(request_id: str,
     rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
-    signal.signal(…
+    # Only set up signal handlers in the main thread, as signal.signal() raises
+    # ValueError if called from a non-main thread (e.g., in tests).
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGTERM, _sigterm_handler)
 
     logger.info(f'Running request {request_id} with pid {pid}')
 

@@ -688,7 +692,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
 
 async def prepare_request_async(
     request_id: str,
-    request_name: …
+    request_name: request_names.RequestName,
     request_body: payloads.RequestBody,
     func: Callable[P, Any],
     request_cluster_name: Optional[str] = None,

@@ -721,7 +725,7 @@ async def prepare_request_async(
 
 
 async def schedule_request_async(request_id: str,
-                                 request_name: …
+                                 request_name: request_names.RequestName,
                                  request_body: payloads.RequestBody,
                                  func: Callable[P, Any],
                                  request_cluster_name: Optional[str] = None,

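The guard added in `_request_execution_wrapper` exists because CPython only allows `signal.signal()` to be called from the main thread of the main interpreter; calling it from any other thread raises `ValueError`. A small sketch of the same guard pattern, with an illustrative handler body:

    import signal
    import threading


    def _sigterm_handler(signum, frame):  # illustrative handler
        raise SystemExit('request aborted')


    def install_sigterm_handler() -> bool:
        """Install the handler only when running on the main thread."""
        if threading.current_thread() is threading.main_thread():
            signal.signal(signal.SIGTERM, _sigterm_handler)
            return True
        # e.g. when the wrapper is invoked from a worker thread in tests.
        return False
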
sky/server/requests/payloads.py
CHANGED

@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
     # Only return fields that are needed for the
     # dashboard / CLI summary response
     summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True
 
 
 class StartBody(RequestBody):

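`StatusBody` gains an `include_handle` field defaulting to `True`, so older clients that never send the field keep the previous behavior. A minimal sketch of why a defaulted field is backward compatible, assuming pydantic v2; the model below is a stand-in, not the real `RequestBody` hierarchy:

    import pydantic


    class StatusBody(pydantic.BaseModel):
        """Illustrative subset of the status request body."""
        summary_response: bool = False
        include_handle: bool = True  # new field; the default preserves old behavior


    # A payload serialized by an older client that has never heard of the new
    # field still validates, and the server simply sees the default value.
    old_payload = '{"summary_response": true}'
    body = StatusBody.model_validate_json(old_payload)
    assert body.include_handle is True
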
sky/server/requests/request_names.py
ADDED

@@ -0,0 +1,80 @@
+"""Request names."""
+import enum
+
+
+class RequestName(str, enum.Enum):
+    """Enum of all the request names."""
+    # General requests
+    CHECK = 'check'
+    ENABLED_CLOUDS = 'enabled_clouds'
+    REALTIME_KUBERNETES_GPU_AVAILABILITY = (
+        'realtime_kubernetes_gpu_availability')
+    KUBERNETES_NODE_INFO = 'kubernetes_node_info'
+    STATUS_KUBERNETES = 'status_kubernetes'
+    LIST_ACCELERATORS = 'list_accelerators'
+    LIST_ACCELERATOR_COUNTS = 'list_accelerator_counts'
+    OPTIMIZE = 'optimize'
+    # Cluster requests
+    CLUSTER_LAUNCH = 'launch'
+    CLUSTER_EXEC = 'exec'
+    CLUSTER_STOP = 'stop'
+    CLUSTER_STATUS = 'status'
+    CLUSTER_ENDPOINTS = 'endpoints'
+    CLUSTER_DOWN = 'down'
+    CLUSTER_START = 'start'
+    CLUSTER_AUTOSTOP = 'autostop'
+    CLUSTER_QUEUE = 'queue'
+    CLUSTER_JOB_STATUS = 'job_status'
+    CLUSTER_JOB_CANCEL = 'cancel'
+    CLUSTER_JOB_LOGS = 'logs'
+    CLUSTER_JOB_DOWNLOAD_LOGS = 'download_logs'
+    CLUSTER_COST_REPORT = 'cost_report'
+    # Storage requests
+    STORAGE_LS = 'storage_ls'
+    STORAGE_DELETE = 'storage_delete'
+    # Local requests
+    LOCAL_UP = 'local_up'
+    LOCAL_DOWN = 'local_down'
+    # API requests
+    API_CANCEL = 'api_cancel'
+    ALL_CONTEXTS = 'all_contexts'
+    # Managed jobs requests
+    JOBS_LAUNCH = 'jobs.launch'
+    JOBS_QUEUE = 'jobs.queue'
+    JOBS_QUEUE_V2 = 'jobs.queue_v2'
+    JOBS_CANCEL = 'jobs.cancel'
+    JOBS_LOGS = 'jobs.logs'
+    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
+    JOBS_POOL_APPLY = 'jobs.pool_apply'
+    JOBS_POOL_DOWN = 'jobs.pool_down'
+    JOBS_POOL_STATUS = 'jobs.pool_status'
+    JOBS_POOL_LOGS = 'jobs.pool_logs'
+    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'
+    # Serve requests
+    SERVE_UP = 'serve.up'
+    SERVE_UPDATE = 'serve.update'
+    SERVE_DOWN = 'serve.down'
+    SERVE_TERMINATE_REPLICA = 'serve.terminate_replica'
+    SERVE_STATUS = 'serve.status'
+    SERVE_LOGS = 'serve.logs'
+    SERVE_SYNC_DOWN_LOGS = 'serve.sync_down_logs'
+    # Volumes requests
+    VOLUME_LIST = 'volume_list'
+    VOLUME_DELETE = 'volume_delete'
+    VOLUME_APPLY = 'volume_apply'
+    # Workspaces requests
+    WORKSPACES_GET = 'workspaces.get'
+    WORKSPACES_UPDATE = 'workspaces.update'
+    WORKSPACES_CREATE = 'workspaces.create'
+    WORKSPACES_DELETE = 'workspaces.delete'
+    WORKSPACES_GET_CONFIG = 'workspaces.get_config'
+    WORKSPACES_UPDATE_CONFIG = 'workspaces.update_config'
+    # SSH node pools requests
+    SSH_NODE_POOLS_UP = 'ssh_node_pools.up'
+    SSH_NODE_POOLS_DOWN = 'ssh_node_pools.down'
+    # Internal request daemons
+    REQUEST_DAEMON_STATUS_REFRESH = 'status-refresh'
+    REQUEST_DAEMON_VOLUME_REFRESH = 'volume-refresh'
+    REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH = 'managed-job-status-refresh'
+    REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH = 'sky-serve-status-refresh'
+    REQUEST_DAEMON_POOL_STATUS_REFRESH = 'pool-status-refresh'

sky/server/requests/requests.py
CHANGED

@@ -5,7 +5,6 @@ import contextlib
 import dataclasses
 import enum
 import functools
-import json
 import os
 import pathlib
 import shutil

@@ -21,6 +20,7 @@ import uuid
 import anyio
 import colorama
 import filelock
+import orjson
 
 from sky import exceptions
 from sky import global_user_state

@@ -213,8 +213,8 @@ class Request:
             entrypoint=self.entrypoint.__name__,
             request_body=self.request_body.model_dump_json(),
             status=self.status.value,
-            return_value=…
-            error=…
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,

@@ -237,8 +237,8 @@ class Request:
             entrypoint=encoders.pickle_and_encode(self.entrypoint),
             request_body=encoders.pickle_and_encode(self.request_body),
             status=self.status.value,
-            return_value=…
-            error=…
+            return_value=orjson.dumps(self.return_value).decode('utf-8'),
+            error=orjson.dumps(self.error).decode('utf-8'),
             pid=self.pid,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,

@@ -270,8 +270,8 @@ class Request:
             entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
             request_body=decoders.decode_and_unpickle(payload.request_body),
             status=RequestStatus(payload.status),
-            return_value=…
-            error=…
+            return_value=orjson.loads(payload.return_value),
+            error=orjson.loads(payload.error),
             pid=payload.pid,
             created_at=payload.created_at,
             schedule_type=ScheduleType(payload.schedule_type),

@@ -328,10 +328,11 @@ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
             entrypoint=request.entrypoint.__name__
             if request.entrypoint is not None else '',
             request_body=request.request_body.model_dump_json()
-            if request.request_body is not None else …
+            if request.request_body is not None else
+            orjson.dumps(None).decode('utf-8'),
             status=request.status.value,
-            return_value=…
-            error=…
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=request.created_at,
             schedule_type=request.schedule_type.value,

@@ -372,9 +373,9 @@ def _update_request_row_fields(
     if 'user_id' not in fields:
         content['user_id'] = ''
     if 'return_value' not in fields:
-        content['return_value'] = …
+        content['return_value'] = orjson.dumps(None).decode('utf-8')
     if 'error' not in fields:
-        content['error'] = …
+        content['error'] = orjson.dumps(None).decode('utf-8')
     if 'schedule_type' not in fields:
         content['schedule_type'] = ScheduleType.SHORT.value
     # Optional fields in RequestPayload

@@ -393,94 +394,6 @@ def _update_request_row_fields(
     return tuple(content[col] for col in REQUEST_COLUMNS)
 
 
-def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
-    """Kill all pending and running requests for a cluster.
-
-    Args:
-        cluster_name: the name of the cluster.
-        exclude_request_names: exclude requests with these names. This is to
-            prevent killing the caller request.
-    """
-    request_ids = [
-        request_task.request_id
-        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name],
-            fields=['request_id']))
-    ]
-    kill_requests(request_ids)
-
-
-def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
-                              user_id: Optional[str] = None) -> List[str]:
-    """Kill requests with a given request ID prefix."""
-    expanded_request_ids: Optional[List[str]] = None
-    if request_ids is not None:
-        expanded_request_ids = []
-        for request_id in request_ids:
-            request_tasks = get_requests_with_prefix(request_id,
-                                                     fields=['request_id'])
-            if request_tasks is None or len(request_tasks) == 0:
-                continue
-            if len(request_tasks) > 1:
-                raise ValueError(f'Multiple requests found for '
-                                 f'request ID prefix: {request_id}')
-            expanded_request_ids.append(request_tasks[0].request_id)
-    return kill_requests(request_ids=expanded_request_ids, user_id=user_id)
-
-
-def kill_requests(request_ids: Optional[List[str]] = None,
-                  user_id: Optional[str] = None) -> List[str]:
-    """Kill a SkyPilot API request and set its status to cancelled.
-
-    Args:
-        request_ids: The request IDs to kill. If None, all requests for the
-            user are killed.
-        user_id: The user ID to kill requests for. If None, all users are
-            killed.
-
-    Returns:
-        A list of request IDs that were cancelled.
-    """
-    if request_ids is None:
-        request_ids = [
-            request_task.request_id
-            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-                # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'],
-                user_id=user_id,
-                fields=['request_id']))
-        ]
-    cancelled_request_ids = []
-    for request_id in request_ids:
-        with update_request(request_id) as request_record:
-            if request_record is None:
-                logger.debug(f'No request ID {request_id}')
-                continue
-            # Skip internal requests. The internal requests are scheduled with
-            # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
-            if request_record.request_id in set(
-                    event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
-                continue
-            if request_record.status > RequestStatus.RUNNING:
-                logger.debug(f'Request {request_id} already finished')
-                continue
-            if request_record.pid is not None:
-                logger.debug(f'Killing request process {request_record.pid}')
-                # Use SIGTERM instead of SIGKILL:
-                # - The executor can handle SIGTERM gracefully
-                # - After SIGTERM, the executor can reuse the request process
-                #   for other requests, avoiding the overhead of forking a new
-                #   process for each request.
-                os.kill(request_record.pid, signal.SIGTERM)
-            request_record.status = RequestStatus.CANCELLED
-            request_record.finished_at = time.time()
-            cancelled_request_ids.append(request_id)
-    return cancelled_request_ids
-
-
 def create_table(cursor, conn):
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509

@@ -624,6 +537,128 @@ def request_lock_path(request_id: str) -> str:
     return os.path.join(lock_path, f'.{request_id}.lock')
 
 
+def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
+    """Kill all pending and running requests for a cluster.
+
+    Args:
+        cluster_name: the name of the cluster.
+        exclude_request_names: exclude requests with these names. This is to
+            prevent killing the caller request.
+    """
+    request_ids = [
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+            exclude_request_names=[exclude_request_name],
+            cluster_names=[cluster_name],
+            fields=['request_id']))
+    ]
+    _kill_requests(request_ids)
+
+
+def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
+                              user_id: Optional[str] = None) -> List[str]:
+    """Kill requests with a given request ID prefix."""
+    expanded_request_ids: Optional[List[str]] = None
+    if request_ids is not None:
+        expanded_request_ids = []
+        for request_id in request_ids:
+            request_tasks = get_requests_with_prefix(request_id,
+                                                     fields=['request_id'])
+            if request_tasks is None or len(request_tasks) == 0:
+                continue
+            if len(request_tasks) > 1:
+                raise ValueError(f'Multiple requests found for '
+                                 f'request ID prefix: {request_id}')
+            expanded_request_ids.append(request_tasks[0].request_id)
+    return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
+
+
+def _should_kill_request(request_id: str,
+                         request_record: Optional[Request]) -> bool:
+    if request_record is None:
+        logger.debug(f'No request ID {request_id}')
+        return False
+    # Skip internal requests. The internal requests are scheduled with
+    # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
+    if request_record.request_id in set(
+            event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
+        return False
+    if request_record.status > RequestStatus.RUNNING:
+        logger.debug(f'Request {request_id} already finished')
+        return False
+    return True
+
+
+def _kill_requests(request_ids: Optional[List[str]] = None,
+                   user_id: Optional[str] = None) -> List[str]:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Args:
+        request_ids: The request IDs to kill. If None, all requests for the
+            user are killed.
+        user_id: The user ID to kill requests for. If None, all users are
+            killed.
+
+    Returns:
+        A list of request IDs that were cancelled.
+    """
+    if request_ids is None:
+        request_ids = [
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+                # Avoid cancelling the cancel request itself.
+                exclude_request_names=['sky.api_cancel'],
+                user_id=user_id,
+                fields=['request_id']))
+        ]
+    cancelled_request_ids = []
+    for request_id in request_ids:
+        with update_request(request_id) as request_record:
+            if not _should_kill_request(request_id, request_record):
+                continue
+            if request_record.pid is not None:
+                logger.debug(f'Killing request process {request_record.pid}')
+                # Use SIGTERM instead of SIGKILL:
+                # - The executor can handle SIGTERM gracefully
+                # - After SIGTERM, the executor can reuse the request process
+                #   for other requests, avoiding the overhead of forking a new
+                #   process for each request.
+                os.kill(request_record.pid, signal.SIGTERM)
+            request_record.status = RequestStatus.CANCELLED
+            request_record.finished_at = time.time()
+            cancelled_request_ids.append(request_id)
+    return cancelled_request_ids
+
+
+@init_db_async
+@asyncio_utils.shield
+async def kill_request_async(request_id: str) -> bool:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Returns:
+        True if the request was killed, False otherwise.
+    """
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if not _should_kill_request(request_id, request):
+            return False
+        assert request is not None
+        if request.pid is not None:
+            logger.debug(f'Killing request process {request.pid}')
+            # Use SIGTERM instead of SIGKILL:
+            # - The executor can handle SIGTERM gracefully
+            # - After SIGTERM, the executor can reuse the request process
+            #   for other requests, avoiding the overhead of forking a new
+            #   process for each request.
+            os.kill(request.pid, signal.SIGTERM)
+        request.status = RequestStatus.CANCELLED
+        request.finished_at = time.time()
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @contextlib.contextmanager
 @init_db
 @metrics_lib.time_me

@@ -638,7 +673,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
     _add_or_update_request_no_lock(request)
 
 
-@…
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_async(request_id: str, status: RequestStatus) -> None:

@@ -650,7 +685,7 @@ async def update_status_async(request_id: str, status: RequestStatus) -> None:
     await _add_or_update_request_no_lock_async(request)
 
 
-@…
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_msg_async(request_id: str, status_msg: str) -> None:

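Several columns in this file switch from the stdlib `json` to `orjson`. Unlike `json.dumps`, `orjson.dumps` returns `bytes`, which is why every call site in the hunks above appends `.decode('utf-8')` before the value is stored; `orjson.loads` accepts either `str` or `bytes`. A small round-trip sketch:

    import orjson  # third-party dependency

    # Serialize: orjson returns bytes, so decode before storing as text.
    stored = orjson.dumps(None).decode('utf-8')
    assert stored == 'null'

    payload = orjson.dumps({'request_id': 'abc', 'pid': 123}).decode('utf-8')

    # Deserialize: loads accepts str or bytes.
    assert orjson.loads(stored) is None
    assert orjson.loads(payload) == {'request_id': 'abc', 'pid': 123}
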
sky/server/requests/serializers/decoders.py
CHANGED

@@ -60,12 +60,6 @@ def decode_status(
     if 'handle' in cluster and cluster['handle'] is not None:
         cluster['handle'] = decode_and_unpickle(cluster['handle'])
     cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-    # this field is to be deprecated in the future.
-    # do not decode this field if it is not present.
-    if ('storage_mounts_metadata' in cluster and
-            cluster['storage_mounts_metadata'] is not None):
-        cluster['storage_mounts_metadata'] = decode_and_unpickle(
-            cluster['storage_mounts_metadata'])
     if 'is_managed' not in cluster:
         cluster['is_managed'] = False
     response.append(responses.StatusResponse.model_validate(cluster))

sky/server/requests/serializers/encoders.py
CHANGED

@@ -60,13 +60,23 @@ def encode_status(
         clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
     response = []
     for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These default setting is needed because last_use and status_updated_at
+        # used to be not optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
         response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            …
+            None)  # Always returns None.
         response.append(response_cluster)
     return response
 

@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
         return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value …
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return …
+    return response_list
 
 
 @register_encoder('volume_list')

@@ -219,11 +230,11 @@ def encode_volume_list(
 
 
 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[…
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
     for job_id in return_value.keys():
         if return_value[job_id] is not None:
             return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}
 
 
 @register_encoder('kubernetes_node_info')

@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
     return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str,
+                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    result = []
+    for context, gpu_list in return_value:
+        gpu_availability_list = []
+        for gpu in gpu_list:
+            gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
+            gpu_availability_list.append(gpu_list_item)
+        result.append((context, gpu_availability_list))
+    return result
