skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/server/plugins.py ADDED
@@ -0,0 +1,238 @@
1
+ """Load plugins for the SkyPilot API server."""
2
+ import abc
3
+ import dataclasses
4
+ import importlib
5
+ import os
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ from fastapi import FastAPI
9
+
10
+ from sky import sky_logging
11
+ from sky.skylet import constants as skylet_constants
12
+ from sky.utils import common_utils
13
+ from sky.utils import config_utils
14
+ from sky.utils import yaml_utils
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ _DEFAULT_PLUGINS_CONFIG_PATH = '~/.sky/plugins.yaml'
19
+ _PLUGINS_CONFIG_ENV_VAR = (
20
+ f'{skylet_constants.SKYPILOT_SERVER_ENV_VAR_PREFIX}PLUGINS_CONFIG')
21
+
22
+
23
class ExtensionContext:
    """State handed to each plugin while it installs itself.

    Attributes:
        app: The FastAPI application instance, or None when no app was
            supplied to the constructor.
        rbac_rules: (role, RBACRule) pairs accumulated through
            register_rbac_rule().
            Example:
                [
                    ('user', RBACRule(path='/plugins/api/xx/*', method='POST')),
                    ('user', RBACRule(path='/plugins/api/xx/*', method='DELETE'))
                ]
    """

    def __init__(self, app: Optional[FastAPI] = None):
        self.rbac_rules: List[Tuple[str, RBACRule]] = []
        self.app = app

    def register_rbac_rule(self,
                           path: str,
                           method: str,
                           description: Optional[str] = None,
                           role: str = 'user') -> None:
        """Record an RBAC rule on behalf of the calling plugin.

        Plugins call this from their install() hook to declare endpoints
        that should be restricted to admin users.

        Args:
            path: Path pattern to restrict (supports wildcards with
                keyMatch2). Example: '/plugins/api/credentials/*'
            method: HTTP method to restrict. Example: 'POST', 'DELETE'
            description: Optional note on what the rule protects; it is
                stored on the rule and appended to the debug log line.
            role: Role the rule is attached to (default: 'user'). Rules
                added to the 'user' role block regular users but allow
                admins.

        Example:
            def install(self, ctx: ExtensionContext):
                # Only admin can upload credentials
                ctx.register_rbac_rule(
                    path='/plugins/api/credentials/*',
                    method='POST',
                    description='Only admin can upload credentials'
                )
        """
        self.rbac_rules.append(
            (role, RBACRule(path=path, method=method,
                            description=description)))
        # The description is only appended to the log line when it is truthy.
        suffix = f' - {description}' if description else ''
        logger.debug(
            f'Registered RBAC rule for {role}: {method} {path}{suffix}')
73
+
74
+
75
@dataclasses.dataclass
class RBACRule:
    """A single access-control rule guarding a plugin endpoint.

    Attributes:
        path: Path pattern the rule matches (supports wildcards with
            keyMatch2). Example: '/plugins/api/credentials/*'
        method: HTTP method the rule restricts. Example: 'POST', 'DELETE'
        description: Optional free-form note on what the rule protects.
    """
    path: str
    method: str
    description: Optional[str] = dataclasses.field(default=None)
88
+
89
+
90
class BasePlugin(abc.ABC):
    """Abstract parent that every SkyPilot server plugin derives from.

    Only install() must be implemented; the metadata properties and
    shutdown() have defaults that subclasses may override.
    """

    @property
    def name(self) -> Optional[str]:
        """Display name of the plugin; None unless overridden."""
        return None

    @property
    def js_extension_path(self) -> Optional[str]:
        """API route of a JavaScript extension to load; None by default."""
        return None

    @property
    def version(self) -> Optional[str]:
        """Version string of the plugin; None unless overridden."""
        return None

    @property
    def commit(self) -> Optional[str]:
        """Git commit hash of the plugin; None unless overridden."""
        return None

    @abc.abstractmethod
    def install(self, extension_context: ExtensionContext):
        """Install the plugin; invoked by the API server.

        Args:
            extension_context: Context object the plugin installs against.
        """
        raise NotImplementedError

    def shutdown(self):
        """Shut the plugin down; invoked by the API server. No-op here."""
121
+
122
+
123
def _config_schema():
    """Build the JSON schema that the plugins config file must satisfy.

    The top level accepts only an optional 'plugins' array. Each entry must
    carry a string 'class' and may carry a free-form 'parameters' object.
    A fresh dictionary is built on every call.
    """
    # Free-form keyword arguments for one plugin.
    parameters_schema = {
        'type': 'object',
        'required': [],
        'additionalProperties': True,
    }
    # One element of the 'plugins' array.
    entry_schema = {
        'type': 'object',
        'required': ['class'],
        'additionalProperties': False,
        'properties': {
            'class': {
                'type': 'string',
            },
            'parameters': parameters_schema,
        },
    }
    plugins_schema = {
        'type': 'array',
        'items': entry_schema,
        'default': [],
    }
    return {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {
            'plugins': plugins_schema,
        },
    }
151
+
152
+
153
def _load_plugin_config() -> Optional[config_utils.Config]:
    """Read and validate the plugins config file.

    The path comes from the environment variable named by
    _PLUGINS_CONFIG_ENV_VAR, falling back to _DEFAULT_PLUGINS_CONFIG_PATH;
    a leading '~' is expanded.

    Returns:
        The parsed config, or None when the file does not exist. An empty
        YAML file is treated as an empty config.
    """
    raw_path = os.environ.get(_PLUGINS_CONFIG_ENV_VAR,
                              _DEFAULT_PLUGINS_CONFIG_PATH)
    resolved_path = os.path.expanduser(raw_path)
    if not os.path.exists(resolved_path):
        return None

    parsed = yaml_utils.read_yaml(resolved_path)
    if not parsed:
        parsed = {}
    common_utils.validate_schema(parsed,
                                 _config_schema(),
                                 err_msg_prefix='Invalid plugins config: ')
    return config_utils.Config.from_dict(parsed)
165
+
166
+
167
+ _PLUGINS: Dict[str, BasePlugin] = {}
168
+ _EXTENSION_CONTEXT: Optional[ExtensionContext] = None
169
+
170
+
171
def load_plugins(extension_context: ExtensionContext):
    """Load, instantiate and install every plugin listed in the config.

    The given context is stored in the module-level _EXTENSION_CONTEXT
    before anything else, so it is recorded even when no config file exists.
    Each configured plugin class is imported, constructed with its
    'parameters' as keyword arguments, installed against the context, and
    registered in _PLUGINS under its class path.

    Args:
        extension_context: Context passed to every plugin's install() hook.

    Raises:
        ValueError: If a configured class path has no module component
            (it is not of the form '<module>.<ClassName>').
        ImportError: If the plugin module cannot be imported.
        AttributeError: If the module does not define the named class.
        TypeError: If the named attribute is not a BasePlugin subclass.
    """
    global _EXTENSION_CONTEXT
    _EXTENSION_CONTEXT = extension_context

    config = _load_plugin_config()
    if not config:
        return

    for plugin_config in config.get('plugins', []):
        class_path = plugin_config['class']
        logger.debug(f'Loading plugins: {class_path}')
        # rpartition never fails to unpack, so a dotless class path gets an
        # explicit error instead of "not enough values to unpack".
        module_path, _, class_name = class_path.rpartition('.')
        if not module_path:
            raise ValueError(
                f'Invalid plugin class path: {class_path!r}. Expected a '
                'fully qualified name in the form "<module>.<ClassName>".')
        try:
            module = importlib.import_module(module_path)
        except ImportError as e:
            raise ImportError(
                f'Failed to import plugin module: {module_path}. '
                'Please check if the module is installed in your Python '
                'environment.') from e
        try:
            plugin_cls = getattr(module, class_name)
        except AttributeError as e:
            raise AttributeError(
                f'Could not find plugin {class_name} class in module '
                f'{module_path}. ') from e
        # issubclass() itself raises a generic TypeError for non-class
        # objects (e.g. a function); check for a class first so the error
        # always names the offending plugin.
        if not (isinstance(plugin_cls, type) and
                issubclass(plugin_cls, BasePlugin)):
            raise TypeError(
                f'Plugin {class_path} must inherit from BasePlugin.')
        parameters = plugin_config.get('parameters') or {}
        plugin = plugin_cls(**parameters)
        plugin.install(extension_context)
        _PLUGINS[class_path] = plugin
204
+
205
+
206
def get_plugins() -> List[BasePlugin]:
    """Return the registered plugin instances in a newly built list.

    The list itself is new on every call; its elements are the same plugin
    objects held in _PLUGINS, not copies of them.
    """
    return [*_PLUGINS.values()]
209
+
210
+
211
def get_plugin_rbac_rules() -> Dict[str, List[Dict[str, str]]]:
    """Group the RBAC rules registered on the stored ExtensionContext by role.

    Only each rule's path and method are emitted; the description is not
    included.

    Returns:
        Dictionary mapping role names to lists of blocklist rules. Empty
        when no extension context has been stored.
        Example:
            {
                'user': [
                    {'path': '/plugins/api/credentials/*', 'method': 'POST'},
                    {'path': '/plugins/api/credentials/*', 'method': 'DELETE'}
                ]
            }
    """
    grouped: Dict[str, List[Dict[str, str]]] = {}
    if not _EXTENSION_CONTEXT:
        return grouped

    for role_name, rbac_rule in _EXTENSION_CONTEXT.rbac_rules:
        entry = {
            'path': rbac_rule.path,
            'method': rbac_rule.method,
        }
        grouped.setdefault(role_name, []).append(entry)
    return grouped
@@ -44,6 +44,7 @@ from sky.server import common as server_common
44
44
  from sky.server import config as server_config
45
45
  from sky.server import constants as server_constants
46
46
  from sky.server import metrics as metrics_lib
47
+ from sky.server import plugins
47
48
  from sky.server.requests import payloads
48
49
  from sky.server.requests import preconditions
49
50
  from sky.server.requests import process
@@ -159,6 +160,8 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
159
160
  def executor_initializer(proc_group: str):
160
161
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
161
162
  f'{multiprocessing.current_process().pid}')
163
+ # Load plugins for executor process.
164
+ plugins.load_plugins(plugins.ExtensionContext())
162
165
  # Executor never stops, unless the whole process is killed.
163
166
  threading.Thread(target=metrics_lib.process_monitor,
164
167
  args=(f'worker:{proc_group}', threading.Event()),
@@ -533,8 +536,8 @@ def _request_execution_wrapper(request_id: str,
533
536
  # so that the "Request xxxx failed due to ..." log message will be
534
537
  # written to the original stdout and stderr file descriptors.
535
538
  _restore_output()
536
- logger.info(f'Request {request_id} failed due to '
537
- f'{common_utils.format_exception(e)}')
539
+ logger.error(f'Request {request_id} failed due to '
540
+ f'{common_utils.format_exception(e)}')
538
541
  return
539
542
  else:
540
543
  api_requests.set_request_succeeded(
@@ -82,7 +82,7 @@ def request_body_env_vars() -> dict:
82
82
  if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
83
83
  env_vars[env_var] = os.environ[env_var]
84
84
  env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
85
- env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
85
+ env_vars[constants.USER_ENV_VAR] = common_utils.get_local_user_name()
86
86
  env_vars[
87
87
  usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
88
88
  if not common.is_api_server_local():
@@ -482,6 +482,7 @@ class VolumeApplyBody(RequestBody):
482
482
  class VolumeDeleteBody(RequestBody):
483
483
  """The request body for the volume delete endpoint."""
484
484
  names: List[str]
485
+ purge: bool = False
485
486
 
486
487
 
487
488
  class VolumeListBody(RequestBody):
@@ -670,6 +671,11 @@ class KubernetesNodeInfoRequestBody(RequestBody):
670
671
  context: Optional[str] = None
671
672
 
672
673
 
674
class SlurmNodeInfoRequestBody(RequestBody):
    """Payload accepted by the slurm node info endpoint."""
    # Optional; defaults to None when the client does not send it.
    slurm_cluster_name: Optional[str] = None
677
+
678
+
673
679
  class ListAcceleratorsBody(RequestBody):
674
680
  """The request body for the list accelerators endpoint."""
675
681
  gpus_only: bool = True
@@ -854,3 +860,26 @@ class RequestPayload(BasePayload):
854
860
  status_msg: Optional[str] = None
855
861
  should_retry: bool = False
856
862
  finished_at: Optional[float] = None
863
+
864
+
865
class SlurmGpuAvailabilityRequestBody(RequestBody):
    """Payload for querying Slurm real-time GPU availability."""
    # Both filters are optional and default to None.
    name_filter: Optional[str] = None
    quantity_filter: Optional[int] = None
869
+
870
+
871
class ClusterEventsBody(RequestBody):
    """Payload accepted by the cluster events endpoint."""
    cluster_name: Optional[str] = None
    cluster_hash: Optional[str] = None
    # Either 'STATUS_CHANGE' or 'DEBUG'.
    event_type: str
    include_timestamps: bool = False
    # If specified, returns at most this many events.
    limit: Optional[int] = None
879
+
880
+
881
class GetJobEventsBody(RequestBody):
    """Payload accepted by the get job task events endpoint."""
    job_id: int
    task_id: Optional[int] = None
    # Defaults to the 10 most recent task events.
    limit: Optional[int] = 10
@@ -10,6 +10,8 @@ class RequestName(str, enum.Enum):
10
10
  REALTIME_KUBERNETES_GPU_AVAILABILITY = (
11
11
  'realtime_kubernetes_gpu_availability')
12
12
  KUBERNETES_NODE_INFO = 'kubernetes_node_info'
13
+ REALTIME_SLURM_GPU_AVAILABILITY = 'realtime_slurm_gpu_availability'
14
+ SLURM_NODE_INFO = 'slurm_node_info'
13
15
  STATUS_KUBERNETES = 'status_kubernetes'
14
16
  LIST_ACCELERATORS = 'list_accelerators'
15
17
  LIST_ACCELERATOR_COUNTS = 'list_accelerator_counts'
@@ -29,6 +31,7 @@ class RequestName(str, enum.Enum):
29
31
  CLUSTER_JOB_LOGS = 'logs'
30
32
  CLUSTER_JOB_DOWNLOAD_LOGS = 'download_logs'
31
33
  CLUSTER_COST_REPORT = 'cost_report'
34
+ CLUSTER_EVENTS = 'cluster_events'
32
35
  # Storage requests
33
36
  STORAGE_LS = 'storage_ls'
34
37
  STORAGE_DELETE = 'storage_delete'
@@ -50,6 +53,7 @@ class RequestName(str, enum.Enum):
50
53
  JOBS_POOL_STATUS = 'jobs.pool_status'
51
54
  JOBS_POOL_LOGS = 'jobs.pool_logs'
52
55
  JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'
56
+ JOBS_EVENTS = 'jobs.events'
53
57
  # Serve requests
54
58
  SERVE_UP = 'serve.up'
55
59
  SERVE_UPDATE = 'serve.update'
@@ -33,6 +33,7 @@ from sky.server import daemons
33
33
  from sky.server.requests import payloads
34
34
  from sky.server.requests.serializers import decoders
35
35
  from sky.server.requests.serializers import encoders
36
+ from sky.server.requests.serializers import return_value_serializers
36
37
  from sky.utils import asyncio_utils
37
38
  from sky.utils import common_utils
38
39
  from sky.utils import ux_utils
@@ -231,13 +232,16 @@ class Request:
231
232
  assert isinstance(self.request_body,
232
233
  payloads.RequestBody), (self.name, self.request_body)
233
234
  try:
235
+ # Use version-aware serializer to handle backward compatibility
236
+ # for old clients that don't recognize new fields.
237
+ serializer = return_value_serializers.get_serializer(self.name)
234
238
  return payloads.RequestPayload(
235
239
  request_id=self.request_id,
236
240
  name=self.name,
237
241
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
238
242
  request_body=encoders.pickle_and_encode(self.request_body),
239
243
  status=self.status.value,
240
- return_value=orjson.dumps(self.return_value).decode('utf-8'),
244
+ return_value=serializer(self.return_value),
241
245
  error=orjson.dumps(self.error).decode('utf-8'),
242
246
  pid=self.pid,
243
247
  created_at=self.created_at,
@@ -869,11 +873,17 @@ async def create_if_not_exists_async(request: Request) -> bool:
869
873
  f'({request_columns}) VALUES '
870
874
  f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
871
875
  request_row = request.to_row()
872
- # Execute the SQL statement without getting the request lock.
873
- # The request lock is used to prevent racing with cancellation codepath,
874
- # but a request cannot be cancelled before it is created.
875
- row = await _DB.execute_get_returning_value_async(sql_statement,
876
- request_row)
876
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
877
+ logger.debug(f'Start creating request {request.request_id}')
878
+ try:
879
+ # Execute the SQL statement without getting the request lock.
880
+ # The request lock is used to prevent racing with cancellation codepath,
881
+ # but a request cannot be cancelled before it is created.
882
+ row = await _DB.execute_get_returning_value_async(
883
+ sql_statement, request_row)
884
+ finally:
885
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
886
+ logger.debug(f'End creating request {request.request_id}')
877
887
  return True if row else False
878
888
 
879
889
 
@@ -1030,9 +1040,15 @@ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
1030
1040
  def _add_or_update_request_no_lock(request: Request):
1031
1041
  """Add or update a REST request into the database."""
1032
1042
  assert _DB is not None
1033
- with _DB.conn:
1034
- cursor = _DB.conn.cursor()
1035
- cursor.execute(_add_or_update_request_sql, request.to_row())
1043
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1044
+ logger.debug(f'Start adding or updating request {request.request_id}')
1045
+ try:
1046
+ with _DB.conn:
1047
+ cursor = _DB.conn.cursor()
1048
+ cursor.execute(_add_or_update_request_sql, request.to_row())
1049
+ finally:
1050
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1051
+ logger.debug(f'End adding or updating request {request.request_id}')
1036
1052
 
1037
1053
 
1038
1054
  async def _add_or_update_request_no_lock_async(request: Request):
@@ -1121,8 +1137,14 @@ async def _delete_requests(request_ids: List[str]):
1121
1137
  """Clean up requests by their IDs."""
1122
1138
  id_list_str = ','.join(repr(request_id) for request_id in request_ids)
1123
1139
  assert _DB is not None
1124
- await _DB.execute_and_commit_async(
1125
- f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
1140
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1141
+ logger.debug(f'Start deleting requests {request_ids}')
1142
+ try:
1143
+ await _DB.execute_and_commit_async(
1144
+ f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
1145
+ finally:
1146
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1147
+ logger.debug(f'End deleting requests {request_ids}')
1126
1148
 
1127
1149
 
1128
1150
  async def clean_finished_requests_with_retention(retention_seconds: int,
@@ -69,6 +69,11 @@ def encode_status(
69
69
  response_cluster['last_use'] = ''
70
70
  if 'status_updated_at' not in response_cluster:
71
71
  response_cluster['status_updated_at'] = 0
72
+ # Ensure labels is always included, defaulting to empty dict if None
73
+ # This is needed because exclude_none=True would exclude None labels
74
+ if 'labels' not in response_cluster or response_cluster.get(
75
+ 'labels') is None:
76
+ response_cluster['labels'] = {}
72
77
  response_cluster['status'] = cluster['status'].value
73
78
  handle = serialize_utils.prepare_handle_for_backwards_compatibility(
74
79
  cluster['handle'])
@@ -266,6 +271,23 @@ def encode_realtime_gpu_availability(
266
271
  return encoded
267
272
 
268
273
 
274
@register_encoder('realtime_slurm_gpu_availability')
def encode_realtime_slurm_gpu_availability(
    return_value: List[Tuple[str,
                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
    """Turn RealtimeGpuAvailability namedtuples into plain lists.

    Args:
        return_value: (context, gpu_list) pairs whose gpu_list entries are
            models.RealtimeGpuAvailability instances.

    Returns:
        The same pairs, with every GPU entry converted to a list so the
        result can be JSON-serialized.
    """

    def _gpu_as_list(entry: Any) -> List[Any]:
        # Entries of any other type trip the assertion.
        assert isinstance(entry, models.RealtimeGpuAvailability), (
            f'Expected RealtimeGpuAvailability, got {type(entry)}')
        return list(entry)

    return [(context, [_gpu_as_list(gpu)
                       for gpu in gpu_list])
            for context, gpu_list in return_value]
289
+
290
+
269
291
  @register_encoder('list_accelerators')
270
292
  def encode_list_accelerators(
271
293
  return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
@@ -0,0 +1,70 @@
1
+ """Version-aware serializers for request return values.
2
+
3
+ These serializers run at encode() time when remote_api_version is available,
4
+ to handle backward compatibility for old clients.
5
+
6
+ The existing encoders.py handles object -> dict conversion at set_return_value()
7
+ time. This module handles dict -> JSON string serialization at encode() time,
8
+ with version-aware field filtering for backward compatibility.
9
+ """
10
+ from typing import Any, Callable, Dict
11
+
12
+ import orjson
13
+
14
+ from sky.server import constants as server_constants
15
+ from sky.server import versions
16
+
17
+ handlers: Dict[str, Callable[[Any], str]] = {}
18
+
19
+
20
def register_serializer(*names: str):
    """Create a decorator that registers a version-aware serializer.

    Every name except server_constants.DEFAULT_HANDLER_NAME is prefixed with
    server_constants.REQUEST_NAME_PREFIX before being used as the key in
    `handlers`. The decorated function is returned unchanged.

    Raises:
        ValueError: When decorating, if a key is already registered.
    """

    def decorator(func):
        for raw_name in names:
            is_default = raw_name == server_constants.DEFAULT_HANDLER_NAME
            key = (raw_name if is_default else
                   server_constants.REQUEST_NAME_PREFIX + raw_name)
            if key in handlers:
                raise ValueError(f'Serializer {key} already registered: '
                                 f'{handlers[key]}')
            handlers[key] = func
        return func

    return decorator
34
+
35
+
36
def get_serializer(name: str) -> Callable[[Any], str]:
    """Look up the serializer for a request name, else the default one.

    The default handler is fetched unconditionally, so a KeyError is raised
    if it has not been registered, even when `name` itself is registered.
    """
    fallback = handlers[server_constants.DEFAULT_HANDLER_NAME]
    return handlers.get(name, fallback)
39
+
40
+
41
@register_serializer(server_constants.DEFAULT_HANDLER_NAME)
def default_serializer(return_value: Any) -> str:
    """Serialize the return value to a JSON string with orjson, unfiltered."""
    encoded = orjson.dumps(return_value)
    return encoded.decode('utf-8')
45
+
46
+
47
@register_serializer('kubernetes_node_info')
def serialize_kubernetes_node_info(return_value: Dict[str, Any]) -> str:
    """Serialize kubernetes node info with version compatibility.

    The is_ready field was added in API version 25. Remove it for old clients
    that don't recognize it.
    The cpu_count, memory_gb, cpu_free, and memory_free_gb fields were added
    in API version 26. Remove them for old clients that don't recognize them.

    The filtering is applied to a copy: `return_value` and the per-node
    dictionaries it holds are never modified, so serializing for an old
    client does not strip fields from the caller's data.

    Args:
        return_value: The node info payload; per-node dictionaries are read
            from its 'node_info_dict' entry when present.

    Returns:
        The JSON-encoded payload as a string.
    """
    remote_api_version = versions.get_remote_api_version()
    if return_value and remote_api_version is not None:
        # Fields the remote client is too old to recognize.
        hidden_fields = set()
        if remote_api_version < 25:
            hidden_fields.add('is_ready')
        if remote_api_version < 26:
            hidden_fields.update(
                ('cpu_count', 'memory_gb', 'cpu_free', 'memory_free_gb'))
        # A missing or None 'node_info_dict' leaves the payload untouched.
        node_info_dict = return_value.get('node_info_dict')
        if hidden_fields and node_info_dict:
            # Rebuild only the affected levels; key order is preserved so
            # the encoded output matches what in-place removal produced.
            return_value = dict(return_value)
            return_value['node_info_dict'] = {
                node_name: {
                    field: value
                    for field, value in node_info.items()
                    if field not in hidden_fields
                } for node_name, node_info in node_info_dict.items()
            }
    return orjson.dumps(return_value).decode('utf-8')