skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/users/permission.py CHANGED
@@ -3,6 +3,7 @@ import contextlib
3
3
  import hashlib
4
4
  import logging
5
5
  import os
6
+ import threading
6
7
  from typing import Generator, List, Optional
7
8
 
8
9
  import casbin
@@ -36,16 +37,23 @@ class PermissionService:
36
37
 
37
38
  def __init__(self):
38
39
  self.enforcer: Optional[casbin.Enforcer] = None
40
+ self._lock = threading.Lock()
39
41
 
40
- def _lazy_initialize(self):
42
+ def initialize(self):
43
+ self._lazy_initialize(full_initialize=True)
44
+
45
+ def _lazy_initialize(self, full_initialize: bool = False):
41
46
  if self.enforcer is not None:
42
47
  return
43
- with _policy_lock():
48
+ with self._lock:
49
+ if self.enforcer is not None:
50
+ return
44
51
  global _enforcer_instance
45
52
  if _enforcer_instance is None:
46
53
  engine = global_user_state.initialize_and_get_db()
47
- db_utils.add_all_tables_to_db_sqlalchemy(
48
- sqlalchemy_adapter.Base.metadata, engine)
54
+ if full_initialize:
55
+ db_utils.add_all_tables_to_db_sqlalchemy(
56
+ sqlalchemy_adapter.Base.metadata, engine)
49
57
  adapter = sqlalchemy_adapter.Adapter(
50
58
  engine, db_class=sqlalchemy_adapter.CasbinRule)
51
59
  model_path = os.path.join(os.path.dirname(__file__),
@@ -56,8 +64,10 @@ class PermissionService:
56
64
  # is successfully initialized, if we change it and then fail
57
65
  # we will set it to None and all subsequent calls will fail.
58
66
  _enforcer_instance = self
59
- self._maybe_initialize_policies()
60
- self._maybe_initialize_basic_auth_user()
67
+ if full_initialize:
68
+ with _policy_lock():
69
+ self._maybe_initialize_policies()
70
+ self._maybe_initialize_basic_auth_user()
61
71
  else:
62
72
  assert _enforcer_instance is not None
63
73
  self.enforcer = _enforcer_instance.enforcer
@@ -69,6 +79,26 @@ class PermissionService:
69
79
  'Enforcer should be initialized after _lazy_initialize()')
70
80
  return self.enforcer
71
81
 
82
+ def _get_plugin_rbac_rules(self):
83
+ """Get RBAC rules from loaded plugins.
84
+
85
+ Returns:
86
+ Dictionary of plugin RBAC rules, or empty dict if plugins module
87
+ is not available or no rules are defined.
88
+ """
89
+ try:
90
+ # pylint: disable=import-outside-toplevel
91
+ from sky.server import plugins as server_plugins
92
+ return server_plugins.get_plugin_rbac_rules()
93
+ except ImportError:
94
+ # Plugin module not available (e.g., not running as server)
95
+ logger.debug(
96
+ 'Plugin module not available, skipping plugin RBAC rules')
97
+ return {}
98
+ except Exception as e: # pylint: disable=broad-except
99
+ logger.warning(f'Failed to get plugin RBAC rules: {e}')
100
+ return {}
101
+
72
102
  def _maybe_initialize_basic_auth_user(self) -> None:
73
103
  """Initialize basic auth user if it is enabled."""
74
104
  basic_auth = os.environ.get(constants.SKYPILOT_INITIAL_BASIC_AUTH)
@@ -92,26 +122,29 @@ class PermissionService:
92
122
  def _maybe_initialize_policies(self) -> None:
93
123
  """Initialize policies if they don't already exist."""
94
124
  logger.debug(f'Initializing policies in process: {os.getpid()}')
95
- self._load_policy_no_lock()
96
125
 
97
126
  policy_updated = False
98
127
 
99
128
  # Check if policies are already initialized by looking for existing
100
129
  # permission policies in the enforcer
101
130
  enforcer = self._ensure_enforcer()
102
- existing_policies = enforcer.get_policy()
131
+ # Convert existing policies to set of tuples for O(1) lookups
132
+ existing_policies = {tuple(p) for p in enforcer.get_policy()}
133
+
134
+ # Get plugin RBAC rules dynamically
135
+ plugin_rules = self._get_plugin_rbac_rules()
103
136
 
104
137
  # If we already have policies for the expected roles, skip
105
138
  # initialization
106
- role_permissions = rbac.get_role_permissions()
139
+ role_permissions = rbac.get_role_permissions(plugin_rules=plugin_rules)
107
140
  expected_policies = []
108
141
  for role, permissions in role_permissions.items():
109
- if permissions['permissions'] and 'blocklist' in permissions[
110
- 'permissions']:
142
+ if permissions.get('permissions'
143
+ ) and 'blocklist' in permissions['permissions']:
111
144
  blocklist = permissions['permissions']['blocklist']
112
145
  for item in blocklist:
113
146
  expected_policies.append(
114
- [role, item['path'], item['method']])
147
+ (role, item['path'], item['method']))
115
148
 
116
149
  # Add workspace policy
117
150
  workspace_policy_permissions = rbac.get_workspace_policy_permissions()
@@ -120,50 +153,50 @@ class PermissionService:
120
153
 
121
154
  for workspace_name, users in workspace_policy_permissions.items():
122
155
  for user in users:
123
- expected_policies.append([user, workspace_name, '*'])
124
- logger.debug(f'Expected workspace policy: user={user}, '
125
- f'workspace={workspace_name}')
126
-
127
- # Check if all expected policies already exist
128
- policies_exist = all(
129
- any(policy == expected
130
- for policy in existing_policies)
131
- for expected in expected_policies)
132
-
133
- if not policies_exist:
134
- # Only clear and reinitialize if policies don't exist or are
135
- # incomplete
136
- logger.debug('Policies not found or incomplete, initializing...')
137
- # Only clear p policies (permission policies),
138
- # keep g policies (role policies)
139
- enforcer.remove_filtered_policy(0)
140
- for role, permissions in role_permissions.items():
141
- if permissions['permissions'] and 'blocklist' in permissions[
142
- 'permissions']:
143
- blocklist = permissions['permissions']['blocklist']
144
- for item in blocklist:
145
- path = item['path']
146
- method = item['method']
147
- logger.debug(f'Adding role policy: role={role}, '
148
- f'path={path}, method={method}')
149
- enforcer.add_policy(role, path, method)
150
- policy_updated = True
151
-
152
- for workspace_name, users in workspace_policy_permissions.items():
153
- for user in users:
154
- logger.debug(f'Initializing workspace policy: user={user}, '
155
- f'workspace={workspace_name}')
156
- enforcer.add_policy(user, workspace_name, '*')
157
- policy_updated = True
158
- logger.debug('Policies initialized successfully')
159
- else:
160
- logger.debug('Policies already exist, skipping initialization')
156
+ expected_policies.append((user, workspace_name, '*'))
157
+ # Check if all expected policies already exist and find missing ones
158
+ missing_policies = [
159
+ p for p in expected_policies if p not in existing_policies
160
+ ]
161
+ # Find policies to remove
162
+ expected_policies_set = set(expected_policies)
163
+ redundant_policies = [
164
+ p for p in existing_policies if p not in expected_policies_set
165
+ ]
166
+ if missing_policies:
167
+ # Add missing policies
168
+ logger.debug(f'Found {len(missing_policies)} missing policies, '
169
+ 'initializing...')
170
+ for p in missing_policies:
171
+ logger.debug(f'Adding policy: {p}')
172
+ enforcer.add_policy(*p)
173
+ policy_updated = True
174
+ logger.debug('Missing policies added successfully')
175
+
176
+ if redundant_policies:
177
+ # Remove redundant policies
178
+ logger.debug(f'Found {len(redundant_policies)} redundant policies, '
179
+ 'cleaning up...')
180
+ for p in redundant_policies:
181
+ logger.debug(f'Removing policy: {p}')
182
+ enforcer.remove_policy(*p)
183
+ policy_updated = True
184
+ logger.debug('Redundant policies removed successfully')
185
+
186
+ if not missing_policies and not redundant_policies:
187
+ logger.debug('Policies already in sync, skipping initialization')
161
188
 
162
189
  # Always ensure users have default roles (this is idempotent)
190
+ # Get users who already have roles (g policies) to avoid redundant calls
191
+ users_with_roles = {tuple(g)[0] for g in enforcer.get_grouping_policy()}
163
192
  all_users = global_user_state.get_all_users()
164
193
  for existing_user in all_users:
165
- user_added = self._add_user_if_not_exists_no_lock(existing_user.id)
166
- policy_updated = policy_updated or user_added
194
+ if str(existing_user.id) not in users_with_roles:
195
+ logger.debug(f'Adding role for user: {existing_user.name}'
196
+ f'({existing_user.id})')
197
+ user_added = self._add_user_if_not_exists_no_lock(
198
+ existing_user.id)
199
+ policy_updated = policy_updated or user_added
167
200
 
168
201
  if policy_updated:
169
202
  enforcer.save_policy()
sky/users/rbac.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """RBAC (Role-Based Access Control) functionality for SkyPilot API Server."""
2
2
 
3
3
  import enum
4
- from typing import Dict, List
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky import skypilot_config
@@ -55,8 +55,13 @@ def get_default_role() -> str:
55
55
 
56
56
 
57
57
  def get_role_permissions(
58
+ plugin_rules: Optional[Dict[str, List[Dict[str, str]]]] = None
58
59
  ) -> Dict[str, Dict[str, Dict[str, List[Dict[str, str]]]]]:
59
- """Get all role permissions from config.
60
+ """Get all role permissions from config and plugins.
61
+
62
+ Args:
63
+ plugin_rules: Optional dictionary of plugin RBAC rules to merge.
64
+ Format: {'user': [{'path': '...', 'method': '...'}]}
60
65
 
61
66
  Returns:
62
67
  Dictionary containing all roles and their permissions configuration.
@@ -91,9 +96,32 @@ def get_role_permissions(
91
96
  if 'user' not in config_permissions:
92
97
  config_permissions['user'] = {
93
98
  'permissions': {
94
- 'blocklist': _DEFAULT_USER_BLOCKLIST
99
+ 'blocklist': _DEFAULT_USER_BLOCKLIST.copy()
95
100
  }
96
101
  }
102
+
103
+ # Merge plugin rules into the appropriate roles
104
+ if plugin_rules:
105
+ for role, rules in plugin_rules.items():
106
+ if role not in supported_roles:
107
+ logger.warning(f'Plugin specified invalid role: {role}')
108
+ continue
109
+ if role not in config_permissions:
110
+ config_permissions[role] = {'permissions': {'blocklist': []}}
111
+ if 'permissions' not in config_permissions[role]:
112
+ config_permissions[role]['permissions'] = {'blocklist': []}
113
+ if 'blocklist' not in config_permissions[role]['permissions']:
114
+ config_permissions[role]['permissions']['blocklist'] = []
115
+
116
+ # Merge plugin rules, avoiding duplicates
117
+ existing_rules = config_permissions[role]['permissions'][
118
+ 'blocklist']
119
+ for rule in rules:
120
+ if rule not in existing_rules:
121
+ existing_rules.append(rule)
122
+ logger.debug(f'Added plugin RBAC rule for {role}: '
123
+ f'{rule["method"]} {rule["path"]}')
124
+
97
125
  return config_permissions
98
126
 
99
127
 
sky/utils/annotations.py CHANGED
@@ -1,14 +1,20 @@
1
1
  """Annotations for public APIs."""
2
2
 
3
3
  import functools
4
- from typing import Callable, Literal, TypeVar
4
+ import threading
5
+ import time
6
+ from typing import Callable, List, Literal, TypeVar
7
+ import weakref
5
8
 
6
9
  import cachetools
7
10
  from typing_extensions import ParamSpec
8
11
 
9
12
  # Whether the current process is a SkyPilot API server process.
10
13
  is_on_api_server = True
11
- _FUNCTIONS_NEED_RELOAD_CACHE = []
14
+ _FUNCTIONS_NEED_RELOAD_CACHE_LOCK = threading.Lock()
15
+ # Caches can be thread-local, use weakref to avoid blocking the GC when the
16
+ # thread is destroyed.
17
+ _FUNCTIONS_NEED_RELOAD_CACHE: List[weakref.ReferenceType] = []
12
18
 
13
19
  T = TypeVar('T')
14
20
  P = ParamSpec('P')
@@ -30,6 +36,94 @@ def client_api(func: Callable[P, T]) -> Callable[P, T]:
30
36
  return wrapper
31
37
 
32
38
 
39
+ def _register_functions_need_reload_cache(func: Callable) -> Callable:
40
+ """Register a cachefunction that needs to be reloaded for a new request.
41
+
42
+ The function will be registered as a weak reference to avoid blocking GC.
43
+ """
44
+ assert hasattr(func, 'cache_clear'), f'{func.__name__} is not cacheable'
45
+ wrapped_fn = func
46
+ try:
47
+ func_ref = weakref.ref(func)
48
+ except TypeError:
49
+ # The function might be not weakrefable (e.g. functools.lru_cache),
50
+ # wrap it in this case.
51
+ @functools.wraps(func)
52
+ def wrapper(*args, **kwargs):
53
+ return func(*args, **kwargs)
54
+
55
+ wrapper.cache_clear = func.cache_clear # type: ignore[attr-defined]
56
+ func_ref = weakref.ref(wrapper)
57
+ wrapped_fn = wrapper
58
+ with _FUNCTIONS_NEED_RELOAD_CACHE_LOCK:
59
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(func_ref)
60
+ return wrapped_fn
61
+
62
+
63
+ class ThreadLocalTTLCache(threading.local):
64
+ """Thread-local storage for _thread_local_lru_cache decorator."""
65
+
66
+ def __init__(self, func, maxsize: int, ttl: int):
67
+ super().__init__()
68
+ self.func = func
69
+ self.maxsize = maxsize
70
+ self.ttl = ttl
71
+
72
+ def get_cache(self):
73
+ if not hasattr(self, 'cache'):
74
+ self.cache = ttl_cache(scope='request',
75
+ maxsize=self.maxsize,
76
+ ttl=self.ttl,
77
+ timer=time.time)(self.func)
78
+ return self.cache
79
+
80
+ def __del__(self):
81
+ if hasattr(self, 'cache'):
82
+ self.cache.cache_clear()
83
+ self.cache = None
84
+
85
+
86
+ def thread_local_ttl_cache(maxsize=32, ttl=60 * 55):
87
+ """Thread-local TTL cache decorator.
88
+
89
+ Args:
90
+ maxsize: Maximum size of the cache.
91
+ ttl: Time to live for the cache in seconds.
92
+ Default is 55 minutes, a bit less than 1 hour
93
+ default lifetime of an STS token.
94
+ """
95
+
96
+ def decorator(func):
97
+ # Create thread-local storage for the LRU cache
98
+ local_cache = ThreadLocalTTLCache(func, maxsize, ttl)
99
+
100
+ # We can't apply the lru_cache here, because this runs at import time
101
+ # so we will always have the main thread's cache.
102
+
103
+ @functools.wraps(func)
104
+ def wrapper(*args, **kwargs):
105
+ # We are within the actual function call, which may be on a thread,
106
+ # so local_cache.cache will return the correct thread-local cache,
107
+ # which we can now apply and immediately call.
108
+ return local_cache.get_cache()(*args, **kwargs)
109
+
110
+ def cache_info():
111
+ # Note that this will only give the cache info for the current
112
+ # thread's cache.
113
+ return local_cache.get_cache().cache_info()
114
+
115
+ def cache_clear():
116
+ # Note that this will only clear the cache for the current thread.
117
+ local_cache.get_cache().cache_clear()
118
+
119
+ wrapper.cache_info = cache_info # type: ignore[attr-defined]
120
+ wrapper.cache_clear = cache_clear # type: ignore[attr-defined]
121
+
122
+ return wrapper
123
+
124
+ return decorator
125
+
126
+
33
127
  def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
34
128
  **lru_cache_kwargs) -> Callable:
35
129
  """LRU cache decorator for functions.
@@ -51,8 +145,7 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
51
145
  else:
52
146
  cached_func = functools.lru_cache(*lru_cache_args,
53
147
  **lru_cache_kwargs)(func)
54
- _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
55
- return cached_func
148
+ return _register_functions_need_reload_cache(cached_func)
56
149
 
57
150
  return decorator
58
151
 
@@ -72,13 +165,20 @@ def ttl_cache(scope: Literal['global', 'request'], *ttl_cache_args,
72
165
  else:
73
166
  cached_func = cachetools.cached(
74
167
  cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs))(func)
75
- _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
76
- return cached_func
168
+ return _register_functions_need_reload_cache(cached_func)
77
169
 
78
170
  return decorator
79
171
 
80
172
 
81
173
  def clear_request_level_cache():
82
174
  """Clear the request-level cache."""
83
- for func in _FUNCTIONS_NEED_RELOAD_CACHE:
84
- func.cache_clear()
175
+ alive_entries = []
176
+ with _FUNCTIONS_NEED_RELOAD_CACHE_LOCK:
177
+ for entry in _FUNCTIONS_NEED_RELOAD_CACHE:
178
+ func = entry()
179
+ if func is None:
180
+ # Has been GC'ed, drop the reference.
181
+ continue
182
+ func.cache_clear()
183
+ alive_entries.append(entry)
184
+ _FUNCTIONS_NEED_RELOAD_CACHE[:] = alive_entries
sky/utils/auth_utils.py CHANGED
@@ -58,6 +58,34 @@ def _generate_rsa_key_pair() -> Tuple[str, str]:
58
58
  return public_key, private_key
59
59
 
60
60
 
61
+ def _ensure_key_permissions(private_key_path: str,
62
+ public_key_path: str) -> None:
63
+ """Ensure SSH key files and parent directory have correct permissions.
64
+
65
+ This is necessary because external factors (e.g., Kubernetes fsGroup,
66
+ volume mounts, umask) can modify file permissions after creation.
67
+ SSH requires private keys to have strict permissions (0600) and the
68
+ parent directory to not be group/world writable (0700).
69
+
70
+ This function is best-effort and will not raise exceptions if permission
71
+ changes fail (e.g., due to permission denied or read-only filesystem).
72
+ """
73
+
74
+ def _safe_chmod(path: str, mode: int) -> None:
75
+ """Attempt to chmod, logging warning on failure."""
76
+ try:
77
+ if os.path.exists(path):
78
+ os.chmod(path, mode)
79
+ except OSError as e:
80
+ logger.warning(f'Failed to set permissions on {path}: {e}')
81
+
82
+ # Ensure parent directory has correct permissions (0700)
83
+ key_dir = os.path.dirname(private_key_path)
84
+ _safe_chmod(key_dir, 0o700)
85
+ _safe_chmod(private_key_path, 0o600)
86
+ _safe_chmod(public_key_path, 0o644)
87
+
88
+
61
89
  def _save_key_pair(private_key_path: str, public_key_path: str,
62
90
  private_key: str, public_key: str) -> None:
63
91
  key_dir = os.path.dirname(private_key_path)
@@ -77,6 +105,11 @@ def _save_key_pair(private_key_path: str, public_key_path: str,
77
105
  opener=functools.partial(os.open, mode=0o644)) as f:
78
106
  f.write(public_key)
79
107
 
108
+ # Explicitly set permissions to ensure they are correct regardless of
109
+ # umask or pre-existing file permissions. The opener's mode parameter
110
+ # only applies when creating new files, and is still subject to umask.
111
+ _ensure_key_permissions(private_key_path, public_key_path)
112
+
80
113
 
81
114
  def get_or_generate_keys() -> Tuple[str, str]:
82
115
  """Returns the absolute private and public key paths."""
@@ -105,6 +138,9 @@ def get_or_generate_keys() -> Tuple[str, str]:
105
138
  assert os.path.exists(public_key_path), (
106
139
  'Private key found, but associated public key '
107
140
  f'{public_key_path} does not exist.')
141
+ # Ensure correct permissions every time, as external factors (e.g.,
142
+ # Kubernetes fsGroup) can modify them after creation.
143
+ _ensure_key_permissions(private_key_path, public_key_path)
108
144
  return private_key_path, public_key_path
109
145
 
110
146
 
@@ -133,6 +169,9 @@ def create_ssh_key_files_from_db(private_key_path: str) -> bool:
133
169
  lock_dir = os.path.dirname(lock_path)
134
170
 
135
171
  if os.path.exists(private_key_path) and os.path.exists(public_key_path):
172
+ # Ensure correct permissions every time, as external factors (e.g.,
173
+ # Kubernetes fsGroup) can modify them after creation.
174
+ _ensure_key_permissions(private_key_path, public_key_path)
136
175
  return True
137
176
  # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
138
177
  # as the ssh configs will be written to this folder as well in
@@ -150,4 +189,7 @@ def create_ssh_key_files_from_db(private_key_path: str) -> bool:
150
189
  assert os.path.exists(public_key_path), (
151
190
  'Private key found, but associated public key '
152
191
  f'{public_key_path} does not exist.')
192
+ # Ensure correct permissions every time, as external factors (e.g.,
193
+ # Kubernetes fsGroup) can modify them after creation.
194
+ _ensure_key_permissions(private_key_path, public_key_path)
153
195
  return True
@@ -13,9 +13,6 @@ from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
14
  from sky.utils import ux_utils
15
15
 
16
- if typing.TYPE_CHECKING:
17
- from sky.provision.kubernetes import utils as kubernetes_utils
18
-
19
16
  if typing.TYPE_CHECKING:
20
17
  from sky.provision.kubernetes import utils as kubernetes_utils
21
18
 
@@ -225,8 +222,25 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
225
222
  # exist in those cases.
226
223
  _get_name = (lambda cluster_record, _: cluster_record['name'])
227
224
  _get_user_hash = (lambda cluster_record, _: cluster_record['user_hash'])
228
- _get_user_name = (
229
- lambda cluster_record, _: cluster_record.get('user_name', '-'))
225
+
226
+
227
def get_user_display_name(user_name: str,
                          user_id: Optional[str] = None) -> str:
    """Return the name to display for a user, tagging service accounts.

    Args:
        user_name: The user's name.
        user_id: Optional identifier of the user. An id that starts with
            'sa-' (compared case-insensitively) marks a service account.

    Returns:
        ``'<user_name> (SA)'`` for a service account, otherwise
        ``user_name`` unchanged.
    """
    is_service_account = bool(user_id) and user_id.lower().startswith('sa-')
    return f'{user_name} (SA)' if is_service_account else user_name
232
+
233
+
234
def _get_user_name(cluster_record: _ClusterRecord,
                   truncate: bool = True) -> str:
    """Return the display name of the user recorded on a cluster record.

    Args:
        cluster_record: Record read via ``.get('user_name')`` and
            ``.get('user_hash')``.
        truncate: Ignored; the neighbouring ``_get_*`` getters take the same
            second positional argument.

    Returns:
        ``'-'`` when the record has no ``'user_name'`` key (or the name is
        literally ``'-'``); otherwise the result of
        ``get_user_display_name`` for the record's name and user hash.
    """
    del truncate
    name = cluster_record.get('user_name', '-')
    if name != '-':
        return get_user_display_name(name, cluster_record.get('user_hash'))
    return name
242
+
243
+
230
244
  _get_launched = (lambda cluster_record, _: log_utils.readable_time_duration(
231
245
  cluster_record['launched_at']))
232
246
  _get_duration = (lambda cluster_record, _: log_utils.readable_time_duration(
@@ -46,7 +46,8 @@ class SSHConfigHelper(object):
46
46
  ssh_cluster_key_path = constants.SKY_USER_FILE_PATH + '/ssh-keys/{}.key'
47
47
 
48
48
  @classmethod
49
- def _get_generated_config(cls, autogen_comment: str, host_name: str,
49
+ def _get_generated_config(cls, autogen_comment: str,
50
+ cluster_name_on_cloud: str, host_name: str,
50
51
  ip: str, username: str, ssh_key_path: str,
51
52
  proxy_command: Optional[str], port: int,
52
53
  docker_proxy_command: Optional[str]):
@@ -79,6 +80,7 @@ class SSHConfigHelper(object):
79
80
  UserKnownHostsFile=/dev/null
80
81
  GlobalKnownHostsFile=/dev/null
81
82
  Port {port}
83
+ SetEnv {constants.SKY_CLUSTER_NAME_ENV_VAR_KEY}={cluster_name_on_cloud}
82
84
  {proxy}
83
85
  """.rstrip())
84
86
  codegen = codegen + '\n'
@@ -111,6 +113,7 @@ class SSHConfigHelper(object):
111
113
  def add_cluster(
112
114
  cls,
113
115
  cluster_name: str,
116
+ cluster_name_on_cloud: str,
114
117
  ips: List[str],
115
118
  auth_config: Dict[str, str],
116
119
  ports: List[int],
@@ -135,6 +138,7 @@ class SSHConfigHelper(object):
135
138
  ports: List of port numbers for SSH corresponding to ips
136
139
  docker_user: If not None, use this user to ssh into the docker
137
140
  ssh_user: Override the ssh_user in auth_config
141
+ cluster_name_on_cloud: The cluster name as it appears in the cloud.
138
142
  """
139
143
  if ssh_user is None:
140
144
  username = auth_config['ssh_user']
@@ -227,10 +231,13 @@ class SSHConfigHelper(object):
227
231
  ip = 'localhost'
228
232
  port = constants.DEFAULT_DOCKER_PORT
229
233
  node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
234
+ node_proxy_command = proxy_command_for_nodes
235
+ if node_proxy_command is not None:
236
+ node_proxy_command = node_proxy_command.replace('%w', str(i))
230
237
  # TODO(romilb): Update port number when k8s supports multinode
231
238
  codegen += cls._get_generated_config(
232
- sky_autogen_comment, node_name, ip, username,
233
- key_path_for_config, proxy_command_for_nodes, port,
239
+ sky_autogen_comment, cluster_name_on_cloud, node_name, ip,
240
+ username, key_path_for_config, node_proxy_command, port,
234
241
  docker_proxy_command) + '\n'
235
242
 
236
243
  cluster_config_path = os.path.expanduser(