skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -539,19 +539,27 @@ def _start_api_server(deploy: bool = False,
                          'is not a local URL')
 
     # Check available memory before starting the server.
-    avail_mem_size_gb: float = common_utils.get_mem_size_gb()
-    # pylint: disable=import-outside-toplevel
-    import sky.jobs.utils as job_utils
-    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
-                  if job_utils.is_consolidation_mode(on_api_restart=True)
-                  else server_constants.MIN_AVAIL_MEM_GB)
-    if avail_mem_size_gb <= max_memory:
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
-            f'has {avail_mem_size_gb:.1f}GB memory available. '
-            f'At least {max_memory}GB is recommended to support higher '
-            'load with better performance.'
-            f'{colorama.Style.RESET_ALL}')
+    # Skip this warning if postgres is used, as:
+    # 1) that's almost certainly a remote API server;
+    # 2) the actual consolidation mode config is stashed in the database,
+    #    and the value of `job_utils.is_consolidation_mode` will not be
+    #    the actual value in the db but None, since in this case the
+    #    whole YAML config is really just `db: <URI>`.
+    if skypilot_config.get_nested(('db',), None) is None:
+        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode(
+                          on_api_restart=True) else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
+                f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
     if deploy:
@@ -560,8 +568,6 @@ def _start_api_server(deploy: bool = False,
         args += [f'--host={host}']
     if metrics_port is not None:
         args += [f'--metrics-port={metrics_port}']
-    # Use this argument to disable the internal signal file check.
-    args += ['--start-with-python']
 
     if foreground:
         # Replaces the current process with the API server
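
The new guard above is just a presence check on the top-level `db` key of the loaded SkyPilot config. A rough sketch of the decision logic, using a hypothetical config value and a simplified stand-in for `skypilot_config.get_nested` (not the real implementation):

# Hypothetical: a postgres-backed (and thus almost certainly remote) API
# server whose entire YAML config is a single `db` entry.
config = {'db': 'postgresql://user:pass@host:5432/skypilot'}

def get_nested(keys, default):
    # Simplified stand-in for skypilot_config.get_nested.
    node = config
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

if get_nested(('db',), None) is None:
    print('no db configured: run the available-memory check')
else:
    print('postgres-backed server: skip the memory warning')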
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 21
+API_VERSION = 22
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py CHANGED
@@ -7,6 +7,7 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import env_options
@@ -26,7 +27,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name: str
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip
@@ -195,26 +196,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status-refresh',
-                          event_fn=refresh_cluster_status_event,
-                          default_log_level='DEBUG'),
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume-refresh',
-                          event_fn=refresh_volume_status_event),
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status-refresh',
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
-                          name='sky-serve-status-refresh',
-                          event_fn=sky_serve_status_refresh_event,
-                          should_skip=should_skip_sky_serve_status_refresh),
-    InternalRequestDaemon(id='pool-status-refresh-daemon',
-                          name='pool-status-refresh',
-                          event_fn=pool_status_refresh_event,
-                          should_skip=should_skip_pool_status_refresh),
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
sky/server/requests/executor.py CHANGED
@@ -47,6 +47,7 @@ from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.server.requests import threads
 from sky.server.requests.queues import local_queue
@@ -395,7 +396,10 @@ def _request_execution_wrapper(request_id: str,
     rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
+    # Only set up signal handlers in the main thread, as signal.signal() raises
+    # ValueError if called from a non-main thread (e.g., in tests).
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGTERM, _sigterm_handler)
 
     logger.info(f'Running request {request_id} with pid {pid}')
 
@@ -688,7 +692,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
 
 async def prepare_request_async(
     request_id: str,
-    request_name: str,
+    request_name: request_names.RequestName,
     request_body: payloads.RequestBody,
     func: Callable[P, Any],
     request_cluster_name: Optional[str] = None,
@@ -721,7 +725,7 @@ async def prepare_request_async(
 
 
 async def schedule_request_async(request_id: str,
-                                 request_name: str,
+                                 request_name: request_names.RequestName,
                                  request_body: payloads.RequestBody,
                                  func: Callable[P, Any],
                                  request_cluster_name: Optional[str] = None,
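
For context on the guard added in `_request_execution_wrapper` above: `signal.signal()` may only be called from the main thread of the main interpreter and raises ValueError anywhere else. A self-contained sketch of the pattern (the handler body here is a placeholder, not SkyPilot's actual `_sigterm_handler`):

import signal
import threading

def _sigterm_handler(signum, frame):
    # Placeholder handler for illustration only.
    raise SystemExit('request aborted by SIGTERM')

def install_handler():
    # Same guard as the diff: skip installation off the main thread,
    # where signal.signal() would raise ValueError.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _sigterm_handler)

t = threading.Thread(target=install_handler)
t.start()          # worker thread: the guard makes this a no-op
t.join()
install_handler()  # main thread: the handler is actually installed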
sky/server/requests/payloads.py CHANGED
@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
     # Only return fields that are needed for the
     # dashboard / CLI summary response
     summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True
 
 
 class StartBody(RequestBody):
sky/server/requests/request_names.py ADDED
@@ -0,0 +1,80 @@
+"""Request names."""
+import enum
+
+
+class RequestName(str, enum.Enum):
+    """Enum of all the request names."""
+    # General requests
+    CHECK = 'check'
+    ENABLED_CLOUDS = 'enabled_clouds'
+    REALTIME_KUBERNETES_GPU_AVAILABILITY = (
+        'realtime_kubernetes_gpu_availability')
+    KUBERNETES_NODE_INFO = 'kubernetes_node_info'
+    STATUS_KUBERNETES = 'status_kubernetes'
+    LIST_ACCELERATORS = 'list_accelerators'
+    LIST_ACCELERATOR_COUNTS = 'list_accelerator_counts'
+    OPTIMIZE = 'optimize'
+    # Cluster requests
+    CLUSTER_LAUNCH = 'launch'
+    CLUSTER_EXEC = 'exec'
+    CLUSTER_STOP = 'stop'
+    CLUSTER_STATUS = 'status'
+    CLUSTER_ENDPOINTS = 'endpoints'
+    CLUSTER_DOWN = 'down'
+    CLUSTER_START = 'start'
+    CLUSTER_AUTOSTOP = 'autostop'
+    CLUSTER_QUEUE = 'queue'
+    CLUSTER_JOB_STATUS = 'job_status'
+    CLUSTER_JOB_CANCEL = 'cancel'
+    CLUSTER_JOB_LOGS = 'logs'
+    CLUSTER_JOB_DOWNLOAD_LOGS = 'download_logs'
+    CLUSTER_COST_REPORT = 'cost_report'
+    # Storage requests
+    STORAGE_LS = 'storage_ls'
+    STORAGE_DELETE = 'storage_delete'
+    # Local requests
+    LOCAL_UP = 'local_up'
+    LOCAL_DOWN = 'local_down'
+    # API requests
+    API_CANCEL = 'api_cancel'
+    ALL_CONTEXTS = 'all_contexts'
+    # Managed jobs requests
+    JOBS_LAUNCH = 'jobs.launch'
+    JOBS_QUEUE = 'jobs.queue'
+    JOBS_QUEUE_V2 = 'jobs.queue_v2'
+    JOBS_CANCEL = 'jobs.cancel'
+    JOBS_LOGS = 'jobs.logs'
+    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
+    JOBS_POOL_APPLY = 'jobs.pool_apply'
+    JOBS_POOL_DOWN = 'jobs.pool_down'
+    JOBS_POOL_STATUS = 'jobs.pool_status'
+    JOBS_POOL_LOGS = 'jobs.pool_logs'
+    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'
+    # Serve requests
+    SERVE_UP = 'serve.up'
+    SERVE_UPDATE = 'serve.update'
+    SERVE_DOWN = 'serve.down'
+    SERVE_TERMINATE_REPLICA = 'serve.terminate_replica'
+    SERVE_STATUS = 'serve.status'
+    SERVE_LOGS = 'serve.logs'
+    SERVE_SYNC_DOWN_LOGS = 'serve.sync_down_logs'
+    # Volumes requests
+    VOLUME_LIST = 'volume_list'
+    VOLUME_DELETE = 'volume_delete'
+    VOLUME_APPLY = 'volume_apply'
+    # Workspaces requests
+    WORKSPACES_GET = 'workspaces.get'
+    WORKSPACES_UPDATE = 'workspaces.update'
+    WORKSPACES_CREATE = 'workspaces.create'
+    WORKSPACES_DELETE = 'workspaces.delete'
+    WORKSPACES_GET_CONFIG = 'workspaces.get_config'
+    WORKSPACES_UPDATE_CONFIG = 'workspaces.update_config'
+    # SSH node pools requests
+    SSH_NODE_POOLS_UP = 'ssh_node_pools.up'
+    SSH_NODE_POOLS_DOWN = 'ssh_node_pools.down'
+    # Internal request daemons
+    REQUEST_DAEMON_STATUS_REFRESH = 'status-refresh'
+    REQUEST_DAEMON_VOLUME_REFRESH = 'volume-refresh'
+    REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH = 'managed-job-status-refresh'
+    REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH = 'sky-serve-status-refresh'
+    REQUEST_DAEMON_POOL_STATUS_REFRESH = 'pool-status-refresh'
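
Because `RequestName` subclasses both `str` and `enum.Enum`, its members behave as plain strings, so code paths and database rows that still carry raw request-name strings keep working. A quick illustration of that standard str-enum behavior (not code from this release):

from sky.server.requests.request_names import RequestName

name = RequestName.CLUSTER_LAUNCH
assert name == 'launch'       # members compare equal to their string value
assert isinstance(name, str)  # and can be passed wherever a str is expected
assert RequestName('status') is RequestName.CLUSTER_STATUS  # lookup by value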
sky/server/requests/requests.py CHANGED
@@ -5,7 +5,6 @@ import contextlib
 import dataclasses
 import enum
 import functools
-import json
 import os
 import pathlib
 import shutil
@@ -21,6 +20,7 @@ import uuid
 import anyio
 import colorama
 import filelock
+import orjson
 
 from sky import exceptions
 from sky import global_user_state
@@ -213,8 +213,8 @@ class Request:
             entrypoint=self.entrypoint.__name__,
             request_body=self.request_body.model_dump_json(),
             status=self.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -237,8 +237,8 @@ class Request:
             entrypoint=encoders.pickle_and_encode(self.entrypoint),
             request_body=encoders.pickle_and_encode(self.request_body),
             status=self.status.value,
-            return_value=json.dumps(self.return_value),
-            error=json.dumps(self.error),
+            return_value=orjson.dumps(self.return_value).decode('utf-8'),
+            error=orjson.dumps(self.error).decode('utf-8'),
             pid=self.pid,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -270,8 +270,8 @@ class Request:
             entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
             request_body=decoders.decode_and_unpickle(payload.request_body),
             status=RequestStatus(payload.status),
-            return_value=json.loads(payload.return_value),
-            error=json.loads(payload.error),
+            return_value=orjson.loads(payload.return_value),
+            error=orjson.loads(payload.error),
             pid=payload.pid,
             created_at=payload.created_at,
             schedule_type=ScheduleType(payload.schedule_type),
@@ -328,10 +328,11 @@ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
             entrypoint=request.entrypoint.__name__
             if request.entrypoint is not None else '',
             request_body=request.request_body.model_dump_json()
-            if request.request_body is not None else json.dumps(None),
+            if request.request_body is not None else
+            orjson.dumps(None).decode('utf-8'),
             status=request.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=request.created_at,
             schedule_type=request.schedule_type.value,
@@ -372,9 +373,9 @@ def _update_request_row_fields(
     if 'user_id' not in fields:
         content['user_id'] = ''
     if 'return_value' not in fields:
-        content['return_value'] = json.dumps(None)
+        content['return_value'] = orjson.dumps(None).decode('utf-8')
     if 'error' not in fields:
-        content['error'] = json.dumps(None)
+        content['error'] = orjson.dumps(None).decode('utf-8')
     if 'schedule_type' not in fields:
         content['schedule_type'] = ScheduleType.SHORT.value
     # Optional fields in RequestPayload
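
The recurring `.decode('utf-8')` in the hunks above is there because `orjson.dumps` returns `bytes`, whereas `json.dumps` returns `str`; decoding keeps the values written to the requests table identical to what the old code produced. A minimal demonstration:

import json
import orjson

assert json.dumps(None) == 'null'     # str
assert orjson.dumps(None) == b'null'  # bytes
assert orjson.dumps(None).decode('utf-8') == json.dumps(None)
assert orjson.loads('null') is None   # orjson.loads accepts str or bytes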
@@ -393,94 +394,6 @@
     return tuple(content[col] for col in REQUEST_COLUMNS)
 
 
-def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
-    """Kill all pending and running requests for a cluster.
-
-    Args:
-        cluster_name: the name of the cluster.
-        exclude_request_names: exclude requests with these names. This is to
-            prevent killing the caller request.
-    """
-    request_ids = [
-        request_task.request_id
-        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name],
-            fields=['request_id']))
-    ]
-    kill_requests(request_ids)
-
-
-def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
-                              user_id: Optional[str] = None) -> List[str]:
-    """Kill requests with a given request ID prefix."""
-    expanded_request_ids: Optional[List[str]] = None
-    if request_ids is not None:
-        expanded_request_ids = []
-        for request_id in request_ids:
-            request_tasks = get_requests_with_prefix(request_id,
-                                                     fields=['request_id'])
-            if request_tasks is None or len(request_tasks) == 0:
-                continue
-            if len(request_tasks) > 1:
-                raise ValueError(f'Multiple requests found for '
-                                 f'request ID prefix: {request_id}')
-            expanded_request_ids.append(request_tasks[0].request_id)
-    return kill_requests(request_ids=expanded_request_ids, user_id=user_id)
-
-
-def kill_requests(request_ids: Optional[List[str]] = None,
-                  user_id: Optional[str] = None) -> List[str]:
-    """Kill a SkyPilot API request and set its status to cancelled.
-
-    Args:
-        request_ids: The request IDs to kill. If None, all requests for the
-            user are killed.
-        user_id: The user ID to kill requests for. If None, all users are
-            killed.
-
-    Returns:
-        A list of request IDs that were cancelled.
-    """
-    if request_ids is None:
-        request_ids = [
-            request_task.request_id
-            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-                # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'],
-                user_id=user_id,
-                fields=['request_id']))
-        ]
-    cancelled_request_ids = []
-    for request_id in request_ids:
-        with update_request(request_id) as request_record:
-            if request_record is None:
-                logger.debug(f'No request ID {request_id}')
-                continue
-            # Skip internal requests. The internal requests are scheduled with
-            # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
-            if request_record.request_id in set(
-                    event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
-                continue
-            if request_record.status > RequestStatus.RUNNING:
-                logger.debug(f'Request {request_id} already finished')
-                continue
-            if request_record.pid is not None:
-                logger.debug(f'Killing request process {request_record.pid}')
-                # Use SIGTERM instead of SIGKILL:
-                # - The executor can handle SIGTERM gracefully
-                # - After SIGTERM, the executor can reuse the request process
-                #   for other requests, avoiding the overhead of forking a new
-                #   process for each request.
-                os.kill(request_record.pid, signal.SIGTERM)
-            request_record.status = RequestStatus.CANCELLED
-            request_record.finished_at = time.time()
-            cancelled_request_ids.append(request_id)
-    return cancelled_request_ids
-
-
 def create_table(cursor, conn):
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509
@@ -624,6 +537,128 @@ def request_lock_path(request_id: str) -> str:
     return os.path.join(lock_path, f'.{request_id}.lock')
 
 
+def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
+    """Kill all pending and running requests for a cluster.
+
+    Args:
+        cluster_name: the name of the cluster.
+        exclude_request_names: exclude requests with these names. This is to
+            prevent killing the caller request.
+    """
+    request_ids = [
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+            exclude_request_names=[exclude_request_name],
+            cluster_names=[cluster_name],
+            fields=['request_id']))
+    ]
+    _kill_requests(request_ids)
+
+
+def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
+                              user_id: Optional[str] = None) -> List[str]:
+    """Kill requests with a given request ID prefix."""
+    expanded_request_ids: Optional[List[str]] = None
+    if request_ids is not None:
+        expanded_request_ids = []
+        for request_id in request_ids:
+            request_tasks = get_requests_with_prefix(request_id,
+                                                     fields=['request_id'])
+            if request_tasks is None or len(request_tasks) == 0:
+                continue
+            if len(request_tasks) > 1:
+                raise ValueError(f'Multiple requests found for '
+                                 f'request ID prefix: {request_id}')
+            expanded_request_ids.append(request_tasks[0].request_id)
+    return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
+
+
+def _should_kill_request(request_id: str,
+                         request_record: Optional[Request]) -> bool:
+    if request_record is None:
+        logger.debug(f'No request ID {request_id}')
+        return False
+    # Skip internal requests. The internal requests are scheduled with
+    # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
+    if request_record.request_id in set(
+            event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
+        return False
+    if request_record.status > RequestStatus.RUNNING:
+        logger.debug(f'Request {request_id} already finished')
+        return False
+    return True
+
+
+def _kill_requests(request_ids: Optional[List[str]] = None,
+                   user_id: Optional[str] = None) -> List[str]:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Args:
+        request_ids: The request IDs to kill. If None, all requests for the
+            user are killed.
+        user_id: The user ID to kill requests for. If None, all users are
+            killed.
+
+    Returns:
+        A list of request IDs that were cancelled.
+    """
+    if request_ids is None:
+        request_ids = [
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+                # Avoid cancelling the cancel request itself.
+                exclude_request_names=['sky.api_cancel'],
+                user_id=user_id,
+                fields=['request_id']))
+        ]
+    cancelled_request_ids = []
+    for request_id in request_ids:
+        with update_request(request_id) as request_record:
+            if not _should_kill_request(request_id, request_record):
+                continue
+            if request_record.pid is not None:
+                logger.debug(f'Killing request process {request_record.pid}')
+                # Use SIGTERM instead of SIGKILL:
+                # - The executor can handle SIGTERM gracefully
+                # - After SIGTERM, the executor can reuse the request process
+                #   for other requests, avoiding the overhead of forking a new
+                #   process for each request.
+                os.kill(request_record.pid, signal.SIGTERM)
+            request_record.status = RequestStatus.CANCELLED
+            request_record.finished_at = time.time()
+            cancelled_request_ids.append(request_id)
+    return cancelled_request_ids
+
+
+@init_db_async
+@asyncio_utils.shield
+async def kill_request_async(request_id: str) -> bool:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Returns:
+        True if the request was killed, False otherwise.
+    """
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if not _should_kill_request(request_id, request):
+            return False
+        assert request is not None
+        if request.pid is not None:
+            logger.debug(f'Killing request process {request.pid}')
+            # Use SIGTERM instead of SIGKILL:
+            # - The executor can handle SIGTERM gracefully
+            # - After SIGTERM, the executor can reuse the request process
+            #   for other requests, avoiding the overhead of forking a new
+            #   process for each request.
+            os.kill(request.pid, signal.SIGTERM)
+        request.status = RequestStatus.CANCELLED
+        request.finished_at = time.time()
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @contextlib.contextmanager
 @init_db
 @metrics_lib.time_me
@@ -638,7 +673,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
         _add_or_update_request_no_lock(request)
 
 
-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_async(request_id: str, status: RequestStatus) -> None:
@@ -650,7 +685,7 @@ async def update_status_async(request_id: str, status: RequestStatus) -> None:
     await _add_or_update_request_no_lock_async(request)
 
 
-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_msg_async(request_id: str, status_msg: str) -> None:
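
Note that `kill_request_async` above takes a per-request file lock asynchronously before the read-modify-write of the request record. A minimal sketch of that pattern, assuming a filelock version that ships `AsyncFileLock` (3.10+):

import asyncio
import filelock

async def update_record(lock_path: str) -> None:
    # Serialize the read-modify-write across processes without blocking
    # the event loop while waiting on the lock.
    async with filelock.AsyncFileLock(lock_path):
        ...  # read the record, mutate it, write it back

asyncio.run(update_record('/tmp/.example-request.lock'))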
sky/server/requests/serializers/decoders.py CHANGED
@@ -60,12 +60,6 @@ def decode_status(
     if 'handle' in cluster and cluster['handle'] is not None:
         cluster['handle'] = decode_and_unpickle(cluster['handle'])
     cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-    # this field is to be deprecated in the future.
-    # do not decode this field if it is not present.
-    if ('storage_mounts_metadata' in cluster and
-            cluster['storage_mounts_metadata'] is not None):
-        cluster['storage_mounts_metadata'] = decode_and_unpickle(
-            cluster['storage_mounts_metadata'])
     if 'is_managed' not in cluster:
         cluster['is_managed'] = False
     response.append(responses.StatusResponse.model_validate(cluster))
sky/server/requests/serializers/encoders.py CHANGED
@@ -60,13 +60,23 @@ def encode_status(
         clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
     response = []
     for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These defaults are needed because last_use and status_updated_at
+        # used to be non-optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
         response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            response_cluster['storage_mounts_metadata'])
+            None)  # Always returns None.
         response.append(response_cluster)
     return response
 
@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
         return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value:
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return [storage_info.model_dump() for storage_info in return_value]
+    return response_list
 
 
 @register_encoder('volume_list')
@@ -219,11 +230,11 @@ def encode_volume_list(
 
 
 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
     for job_id in return_value.keys():
         if return_value[job_id] is not None:
             return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}
 
 
 @register_encoder('kubernetes_node_info')
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
     return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str, List[Any]]]
+) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    result = []
+    for context, gpu_list in return_value:
+        gpu_availability_list = []
+        for gpu in gpu_list:
+            gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
+            gpu_availability_list.append(gpu_list_item)
+        result.append((context, gpu_availability_list))
+    return result
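
A plausible motivation for `encode_job_status` now stringifying its keys (an inference from the orjson migration above, not stated in this release): JSON objects only permit string keys, and orjson rejects non-str dict keys by default, whereas `json.dumps` silently coerces them:

import json
import orjson

statuses = {1: 'RUNNING', 8: 'SUCCEEDED'}
print(json.dumps(statuses))  # {"1": "RUNNING", "8": "SUCCEEDED"} (keys coerced)
try:
    orjson.dumps(statuses)   # raises: Dict key must be str
except TypeError as exc:
    print(exc)
print(orjson.dumps({str(k): v for k, v in statuses.items()}))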