skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/users/permission.py CHANGED
@@ -3,6 +3,7 @@ import contextlib
3
3
  import hashlib
4
4
  import logging
5
5
  import os
6
+ import threading
6
7
  from typing import Generator, List, Optional
7
8
 
8
9
  import casbin
@@ -36,16 +37,23 @@ class PermissionService:
36
37
 
37
38
  def __init__(self):
38
39
  self.enforcer: Optional[casbin.Enforcer] = None
40
+ self._lock = threading.Lock()
39
41
 
40
- def _lazy_initialize(self):
42
+ def initialize(self):
43
+ self._lazy_initialize(full_initialize=True)
44
+
45
+ def _lazy_initialize(self, full_initialize: bool = False):
41
46
  if self.enforcer is not None:
42
47
  return
43
- with _policy_lock():
48
+ with self._lock:
49
+ if self.enforcer is not None:
50
+ return
44
51
  global _enforcer_instance
45
52
  if _enforcer_instance is None:
46
53
  engine = global_user_state.initialize_and_get_db()
47
- db_utils.add_all_tables_to_db_sqlalchemy(
48
- sqlalchemy_adapter.Base.metadata, engine)
54
+ if full_initialize:
55
+ db_utils.add_all_tables_to_db_sqlalchemy(
56
+ sqlalchemy_adapter.Base.metadata, engine)
49
57
  adapter = sqlalchemy_adapter.Adapter(
50
58
  engine, db_class=sqlalchemy_adapter.CasbinRule)
51
59
  model_path = os.path.join(os.path.dirname(__file__),
@@ -56,8 +64,10 @@ class PermissionService:
56
64
  # is successfully initialized, if we change it and then fail
57
65
  # we will set it to None and all subsequent calls will fail.
58
66
  _enforcer_instance = self
59
- self._maybe_initialize_policies()
60
- self._maybe_initialize_basic_auth_user()
67
+ if full_initialize:
68
+ with _policy_lock():
69
+ self._maybe_initialize_policies()
70
+ self._maybe_initialize_basic_auth_user()
61
71
  else:
62
72
  assert _enforcer_instance is not None
63
73
  self.enforcer = _enforcer_instance.enforcer
@@ -112,14 +122,14 @@ class PermissionService:
112
122
  def _maybe_initialize_policies(self) -> None:
113
123
  """Initialize policies if they don't already exist."""
114
124
  logger.debug(f'Initializing policies in process: {os.getpid()}')
115
- self._load_policy_no_lock()
116
125
 
117
126
  policy_updated = False
118
127
 
119
128
  # Check if policies are already initialized by looking for existing
120
129
  # permission policies in the enforcer
121
130
  enforcer = self._ensure_enforcer()
122
- existing_policies = enforcer.get_policy()
131
+ # Convert existing policies to set of tuples for O(1) lookups
132
+ existing_policies = {tuple(p) for p in enforcer.get_policy()}
123
133
 
124
134
  # Get plugin RBAC rules dynamically
125
135
  plugin_rules = self._get_plugin_rbac_rules()
@@ -129,12 +139,12 @@ class PermissionService:
129
139
  role_permissions = rbac.get_role_permissions(plugin_rules=plugin_rules)
130
140
  expected_policies = []
131
141
  for role, permissions in role_permissions.items():
132
- if permissions['permissions'] and 'blocklist' in permissions[
133
- 'permissions']:
142
+ if permissions.get('permissions'
143
+ ) and 'blocklist' in permissions['permissions']:
134
144
  blocklist = permissions['permissions']['blocklist']
135
145
  for item in blocklist:
136
146
  expected_policies.append(
137
- [role, item['path'], item['method']])
147
+ (role, item['path'], item['method']))
138
148
 
139
149
  # Add workspace policy
140
150
  workspace_policy_permissions = rbac.get_workspace_policy_permissions()
@@ -143,50 +153,50 @@ class PermissionService:
143
153
 
144
154
  for workspace_name, users in workspace_policy_permissions.items():
145
155
  for user in users:
146
- expected_policies.append([user, workspace_name, '*'])
147
- logger.debug(f'Expected workspace policy: user={user}, '
148
- f'workspace={workspace_name}')
149
-
150
- # Check if all expected policies already exist
151
- policies_exist = all(
152
- any(policy == expected
153
- for policy in existing_policies)
154
- for expected in expected_policies)
155
-
156
- if not policies_exist:
157
- # Only clear and reinitialize if policies don't exist or are
158
- # incomplete
159
- logger.debug('Policies not found or incomplete, initializing...')
160
- # Only clear p policies (permission policies),
161
- # keep g policies (role policies)
162
- enforcer.remove_filtered_policy(0)
163
- for role, permissions in role_permissions.items():
164
- if permissions['permissions'] and 'blocklist' in permissions[
165
- 'permissions']:
166
- blocklist = permissions['permissions']['blocklist']
167
- for item in blocklist:
168
- path = item['path']
169
- method = item['method']
170
- logger.debug(f'Adding role policy: role={role}, '
171
- f'path={path}, method={method}')
172
- enforcer.add_policy(role, path, method)
173
- policy_updated = True
174
-
175
- for workspace_name, users in workspace_policy_permissions.items():
176
- for user in users:
177
- logger.debug(f'Initializing workspace policy: user={user}, '
178
- f'workspace={workspace_name}')
179
- enforcer.add_policy(user, workspace_name, '*')
180
- policy_updated = True
181
- logger.debug('Policies initialized successfully')
182
- else:
183
- logger.debug('Policies already exist, skipping initialization')
156
+ expected_policies.append((user, workspace_name, '*'))
157
+ # Check if all expected policies already exist and find missing ones
158
+ missing_policies = [
159
+ p for p in expected_policies if p not in existing_policies
160
+ ]
161
+ # Find policies to remove
162
+ expected_policies_set = set(expected_policies)
163
+ redundant_policies = [
164
+ p for p in existing_policies if p not in expected_policies_set
165
+ ]
166
+ if missing_policies:
167
+ # Add missing policies
168
+ logger.debug(f'Found {len(missing_policies)} missing policies, '
169
+ 'initializing...')
170
+ for p in missing_policies:
171
+ logger.debug(f'Adding policy: {p}')
172
+ enforcer.add_policy(*p)
173
+ policy_updated = True
174
+ logger.debug('Missing policies added successfully')
175
+
176
+ if redundant_policies:
177
+ # Remove redundant policies
178
+ logger.debug(f'Found {len(redundant_policies)} redundant policies, '
179
+ 'cleaning up...')
180
+ for p in redundant_policies:
181
+ logger.debug(f'Removing policy: {p}')
182
+ enforcer.remove_policy(*p)
183
+ policy_updated = True
184
+ logger.debug('Redundant policies removed successfully')
185
+
186
+ if not missing_policies and not redundant_policies:
187
+ logger.debug('Policies already in sync, skipping initialization')
184
188
 
185
189
  # Always ensure users have default roles (this is idempotent)
190
+ # Get users who already have roles (g policies) to avoid redundant calls
191
+ users_with_roles = {tuple(g)[0] for g in enforcer.get_grouping_policy()}
186
192
  all_users = global_user_state.get_all_users()
187
193
  for existing_user in all_users:
188
- user_added = self._add_user_if_not_exists_no_lock(existing_user.id)
189
- policy_updated = policy_updated or user_added
194
+ if str(existing_user.id) not in users_with_roles:
195
+ logger.debug(f'Adding role for user: {existing_user.name}'
196
+ f'({existing_user.id})')
197
+ user_added = self._add_user_if_not_exists_no_lock(
198
+ existing_user.id)
199
+ policy_updated = policy_updated or user_added
190
200
 
191
201
  if policy_updated:
192
202
  enforcer.save_policy()
sky/utils/auth_utils.py CHANGED
@@ -58,6 +58,34 @@ def _generate_rsa_key_pair() -> Tuple[str, str]:
58
58
  return public_key, private_key
59
59
 
60
60
 
61
+ def _ensure_key_permissions(private_key_path: str,
62
+ public_key_path: str) -> None:
63
+ """Ensure SSH key files and parent directory have correct permissions.
64
+
65
+ This is necessary because external factors (e.g., Kubernetes fsGroup,
66
+ volume mounts, umask) can modify file permissions after creation.
67
+ SSH requires private keys to have strict permissions (0600) and the
68
+ parent directory to not be group/world writable (0700).
69
+
70
+ This function is best-effort and will not raise exceptions if permission
71
+ changes fail (e.g., due to permission denied or read-only filesystem).
72
+ """
73
+
74
+ def _safe_chmod(path: str, mode: int) -> None:
75
+ """Attempt to chmod, logging warning on failure."""
76
+ try:
77
+ if os.path.exists(path):
78
+ os.chmod(path, mode)
79
+ except OSError as e:
80
+ logger.warning(f'Failed to set permissions on {path}: {e}')
81
+
82
+ # Ensure parent directory has correct permissions (0700)
83
+ key_dir = os.path.dirname(private_key_path)
84
+ _safe_chmod(key_dir, 0o700)
85
+ _safe_chmod(private_key_path, 0o600)
86
+ _safe_chmod(public_key_path, 0o644)
87
+
88
+
61
89
  def _save_key_pair(private_key_path: str, public_key_path: str,
62
90
  private_key: str, public_key: str) -> None:
63
91
  key_dir = os.path.dirname(private_key_path)
@@ -77,6 +105,11 @@ def _save_key_pair(private_key_path: str, public_key_path: str,
77
105
  opener=functools.partial(os.open, mode=0o644)) as f:
78
106
  f.write(public_key)
79
107
 
108
+ # Explicitly set permissions to ensure they are correct regardless of
109
+ # umask or pre-existing file permissions. The opener's mode parameter
110
+ # only applies when creating new files, and is still subject to umask.
111
+ _ensure_key_permissions(private_key_path, public_key_path)
112
+
80
113
 
81
114
  def get_or_generate_keys() -> Tuple[str, str]:
82
115
  """Returns the absolute private and public key paths."""
@@ -105,6 +138,9 @@ def get_or_generate_keys() -> Tuple[str, str]:
105
138
  assert os.path.exists(public_key_path), (
106
139
  'Private key found, but associated public key '
107
140
  f'{public_key_path} does not exist.')
141
+ # Ensure correct permissions every time, as external factors (e.g.,
142
+ # Kubernetes fsGroup) can modify them after creation.
143
+ _ensure_key_permissions(private_key_path, public_key_path)
108
144
  return private_key_path, public_key_path
109
145
 
110
146
 
@@ -133,6 +169,9 @@ def create_ssh_key_files_from_db(private_key_path: str) -> bool:
133
169
  lock_dir = os.path.dirname(lock_path)
134
170
 
135
171
  if os.path.exists(private_key_path) and os.path.exists(public_key_path):
172
+ # Ensure correct permissions every time, as external factors (e.g.,
173
+ # Kubernetes fsGroup) can modify them after creation.
174
+ _ensure_key_permissions(private_key_path, public_key_path)
136
175
  return True
137
176
  # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
138
177
  # as the ssh configs will be written to this folder as well in
@@ -150,4 +189,7 @@ def create_ssh_key_files_from_db(private_key_path: str) -> bool:
150
189
  assert os.path.exists(public_key_path), (
151
190
  'Private key found, but associated public key '
152
191
  f'{public_key_path} does not exist.')
192
+ # Ensure correct permissions every time, as external factors (e.g.,
193
+ # Kubernetes fsGroup) can modify them after creation.
194
+ _ensure_key_permissions(private_key_path, public_key_path)
153
195
  return True
sky/utils/cli_utils/status_utils.py CHANGED
@@ -13,9 +13,6 @@ from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
14
  from sky.utils import ux_utils
15
15
 
16
- if typing.TYPE_CHECKING:
17
- from sky.provision.kubernetes import utils as kubernetes_utils
18
-
19
16
  if typing.TYPE_CHECKING:
20
17
  from sky.provision.kubernetes import utils as kubernetes_utils
21
18
 
@@ -225,8 +222,25 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
225
222
  # exist in those cases.
226
223
  _get_name = (lambda cluster_record, _: cluster_record['name'])
227
224
  _get_user_hash = (lambda cluster_record, _: cluster_record['user_hash'])
228
- _get_user_name = (
229
- lambda cluster_record, _: cluster_record.get('user_name', '-'))
225
+
226
+
227
+ def get_user_display_name(user_name: str, user_id: Optional[str] = None) -> str:
228
+ """ Appends SA to the user name if the user is a service account. """
229
+ if user_id and user_id.lower().startswith('sa-'):
230
+ return f'{user_name} (SA)'
231
+ return user_name
232
+
233
+
234
+ def _get_user_name(cluster_record: _ClusterRecord,
235
+ truncate: bool = True) -> str:
236
+ del truncate
237
+ user_name = cluster_record.get('user_name', '-')
238
+ if user_name == '-':
239
+ return user_name
240
+ user_hash = cluster_record.get('user_hash')
241
+ return get_user_display_name(user_name, user_hash)
242
+
243
+
230
244
  _get_launched = (lambda cluster_record, _: log_utils.readable_time_duration(
231
245
  cluster_record['launched_at']))
232
246
  _get_duration = (lambda cluster_record, _: log_utils.readable_time_duration(
sky/utils/cluster_utils.py CHANGED
@@ -46,7 +46,8 @@ class SSHConfigHelper(object):
46
46
  ssh_cluster_key_path = constants.SKY_USER_FILE_PATH + '/ssh-keys/{}.key'
47
47
 
48
48
  @classmethod
49
- def _get_generated_config(cls, autogen_comment: str, host_name: str,
49
+ def _get_generated_config(cls, autogen_comment: str,
50
+ cluster_name_on_cloud: str, host_name: str,
50
51
  ip: str, username: str, ssh_key_path: str,
51
52
  proxy_command: Optional[str], port: int,
52
53
  docker_proxy_command: Optional[str]):
@@ -79,6 +80,7 @@ class SSHConfigHelper(object):
79
80
  UserKnownHostsFile=/dev/null
80
81
  GlobalKnownHostsFile=/dev/null
81
82
  Port {port}
83
+ SetEnv {constants.SKY_CLUSTER_NAME_ENV_VAR_KEY}={cluster_name_on_cloud}
82
84
  {proxy}
83
85
  """.rstrip())
84
86
  codegen = codegen + '\n'
@@ -111,6 +113,7 @@ class SSHConfigHelper(object):
111
113
  def add_cluster(
112
114
  cls,
113
115
  cluster_name: str,
116
+ cluster_name_on_cloud: str,
114
117
  ips: List[str],
115
118
  auth_config: Dict[str, str],
116
119
  ports: List[int],
@@ -135,6 +138,7 @@ class SSHConfigHelper(object):
135
138
  ports: List of port numbers for SSH corresponding to ips
136
139
  docker_user: If not None, use this user to ssh into the docker
137
140
  ssh_user: Override the ssh_user in auth_config
141
+ cluster_name_on_cloud: The cluster name as it appears in the cloud.
138
142
  """
139
143
  if ssh_user is None:
140
144
  username = auth_config['ssh_user']
@@ -227,10 +231,13 @@ class SSHConfigHelper(object):
227
231
  ip = 'localhost'
228
232
  port = constants.DEFAULT_DOCKER_PORT
229
233
  node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
234
+ node_proxy_command = proxy_command_for_nodes
235
+ if node_proxy_command is not None:
236
+ node_proxy_command = node_proxy_command.replace('%w', str(i))
230
237
  # TODO(romilb): Update port number when k8s supports multinode
231
238
  codegen += cls._get_generated_config(
232
- sky_autogen_comment, node_name, ip, username,
233
- key_path_for_config, proxy_command_for_nodes, port,
239
+ sky_autogen_comment, cluster_name_on_cloud, node_name, ip,
240
+ username, key_path_for_config, node_proxy_command, port,
234
241
  docker_proxy_command) + '\n'
235
242
 
236
243
  cluster_config_path = os.path.expanduser(