skypilot-nightly 1.0.0.dev20250710__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. sky/__init__.py +2 -2
  2. sky/clouds/kubernetes.py +137 -23
  3. sky/core.py +3 -1
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
  6. sky/dashboard/out/_next/static/chunks/{webpack-fd62f17bd9ce1fcc.js → webpack-60070a62f55486a6.js} +1 -1
  7. sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
  8. sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/global_user_state.py +10 -11
  24. sky/jobs/state.py +10 -11
  25. sky/jobs/utils.py +11 -3
  26. sky/provision/kubernetes/utils.py +132 -0
  27. sky/skypilot_config.py +4 -1
  28. sky/templates/kubernetes-ray.yml.j2 +298 -10
  29. sky/users/permission.py +15 -1
  30. sky/users/token_service.py +25 -3
  31. sky/utils/schemas.py +3 -0
  32. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
  33. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +41 -41
  34. sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
  35. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
  36. /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
  37. /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
  38. /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
  39. /sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
  40. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -293,13 +293,82 @@ available_node_types:
    kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
    kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
  {% endif %}
- {% if k8s_kueue_local_queue_name %}
+ {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
  annotations:
-   kueue.x-k8s.io/retriable-in-group: "false"
-   kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
-   {% if k8s_max_run_duration_seconds %}
-   provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
-   {% endif %}
+   {% if k8s_kueue_local_queue_name %}
+   kueue.x-k8s.io/retriable-in-group: "false"
+   kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
+   {% if k8s_max_run_duration_seconds %}
+   provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
+   {% endif %}
+   {% endif %}
+   # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
+   # Values from google cloud guide
+   {% if k8s_enable_gpudirect_tcpx %}
+   devices.gke.io/container.tcpx-daemon: |+
+     - path: /dev/nvidia0
+     - path: /dev/nvidia1
+     - path: /dev/nvidia2
+     - path: /dev/nvidia3
+     - path: /dev/nvidia4
+     - path: /dev/nvidia5
+     - path: /dev/nvidia6
+     - path: /dev/nvidia7
+     - path: /dev/nvidiactl
+     - path: /dev/nvidia-uvm
+   networking.gke.io/default-interface: 'eth0'
+   networking.gke.io/interfaces: |
+     [
+       {"interfaceName":"eth0","network":"default"},
+       {"interfaceName":"eth1","network":"vpc1"},
+       {"interfaceName":"eth2","network":"vpc2"},
+       {"interfaceName":"eth3","network":"vpc3"},
+       {"interfaceName":"eth4","network":"vpc4"}
+     ]
+   {% endif %}
+   {% if k8s_enable_gpudirect_tcpxo %}
+   devices.gke.io/container.tcpxo-daemon: |+
+     - path: /dev/nvidia0
+     - path: /dev/nvidia1
+     - path: /dev/nvidia2
+     - path: /dev/nvidia3
+     - path: /dev/nvidia4
+     - path: /dev/nvidia5
+     - path: /dev/nvidia6
+     - path: /dev/nvidia7
+     - path: /dev/nvidiactl
+     - path: /dev/nvidia-uvm
+     - path: /dev/dmabuf_import_helper
+   networking.gke.io/default-interface: 'eth0'
+   networking.gke.io/interfaces: |
+     [
+       {"interfaceName":"eth0","network":"default"},
+       {"interfaceName":"eth1","network":"vpc1"},
+       {"interfaceName":"eth2","network":"vpc2"},
+       {"interfaceName":"eth3","network":"vpc3"},
+       {"interfaceName":"eth4","network":"vpc4"},
+       {"interfaceName":"eth5","network":"vpc5"},
+       {"interfaceName":"eth6","network":"vpc6"},
+       {"interfaceName":"eth7","network":"vpc7"},
+       {"interfaceName":"eth8","network":"vpc8"}
+     ]
+   {% endif %}
+   {% if k8s_enable_gpudirect_rdma %}
+   networking.gke.io/default-interface: 'eth0'
+   networking.gke.io/interfaces: |
+     [
+       {"interfaceName":"eth0","network":"default"},
+       {"interfaceName":"eth1","network":"gvnic-1"},
+       {"interfaceName":"eth2","network":"rdma-0"},
+       {"interfaceName":"eth3","network":"rdma-1"},
+       {"interfaceName":"eth4","network":"rdma-2"},
+       {"interfaceName":"eth5","network":"rdma-3"},
+       {"interfaceName":"eth6","network":"rdma-4"},
+       {"interfaceName":"eth7","network":"rdma-5"},
+       {"interfaceName":"eth8","network":"rdma-6"},
+       {"interfaceName":"eth9","network":"rdma-7"}
+     ]
+   {% endif %}
  {% endif %}
  spec:
    # serviceAccountName: skypilot-service-account
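The widened guard above means the pod `annotations:` block is now emitted whenever any of the four flags is set, while the Kueue entries stay gated on the queue name alone. A minimal sketch of that behavior using jinja2 directly; the flag names come from the diff, but the template body here is stripped down for illustration and is not the real file:

    # Minimal sketch (not SkyPilot code): the annotations block renders when
    # any flag is truthy; each sub-block stays behind its own flag.
    import jinja2

    TEMPLATE = """\
    {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx %}
    annotations:
    {% if k8s_kueue_local_queue_name %}
      kueue.x-k8s.io/retriable-in-group: "false"
    {% endif %}
    {% if k8s_enable_gpudirect_tcpx %}
      networking.gke.io/default-interface: 'eth0'
    {% endif %}
    {% endif %}
    """

    # Queue name unset, TCPX enabled: the annotations block still appears,
    # carrying only the GPUDirect entry.
    print(jinja2.Template(TEMPLATE).render(k8s_kueue_local_queue_name=None,
                                           k8s_enable_gpudirect_tcpx=True))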
@@ -396,6 +465,41 @@ available_node_types:
      persistentVolumeClaim:
        claimName: {{volume_mount.volume_name_on_cloud}}
    {% endfor %}
+   {% if k8s_enable_gpudirect_tcpx %}
+   - name: libraries
+     hostPath:
+       path: /home/kubernetes/bin/nvidia/lib64
+   - name: tcpx-socket
+     emptyDir: {}
+   - name: sys
+     hostPath:
+       path: /sys
+   - name: proc-sys
+     hostPath:
+       path: /proc/sys
+   {% endif %}
+   {% if k8s_enable_gpudirect_tcpxo %}
+   - name: libraries
+     hostPath:
+       path: /home/kubernetes/bin/nvidia
+   - name: sys
+     hostPath:
+       path: /sys
+   - name: proc-sys
+     hostPath:
+       path: /proc/sys
+   - name: aperture-devices
+     hostPath:
+       path: /dev/aperture_devices
+   {% endif %}
+   {% if k8s_enable_gpudirect_rdma %}
+   - name: library-dir-host
+     hostPath:
+       path: /home/kubernetes/bin/nvidia
+   - name: gib
+     hostPath:
+       path: /home/kubernetes/bin/gib
+   {% endif %}
  containers:
    - name: ray-node
      imagePullPolicy: Always
@@ -409,6 +513,113 @@ available_node_types:
    - name: {{ key }}
      value: {{ value }}
    {% endfor %}
+   # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
+   # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
+   {% if k8s_enable_gpudirect_tcpx %}
+   - name: LD_LIBRARY_PATH
+     value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
+   - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
+     value: eth1,eth2,eth3,eth4
+   - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
+     value: eth0
+   - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
+     value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+   - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
+     value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+   - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
+     value: "500000"
+   - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
+     value: "/tmp"
+   - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
+     value: "0"
+   - name: NCCL_SOCKET_IFNAME
+     value: eth0
+   - name: NCCL_CROSS_NIC
+     value: "0"
+   - name: NCCL_ALGO
+     value: Ring
+   - name: NCCL_PROTO
+     value: Simple
+   - name: NCCL_NSOCKS_PERTHREAD
+     value: "4"
+   - name: NCCL_SOCKET_NTHREADS
+     value: "1"
+   - name: NCCL_NET_GDR_LEVEL
+     value: PIX
+   - name: NCCL_DYNAMIC_CHUNK_SIZE
+     value: "524288"
+   - name: NCCL_P2P_PXN_LEVEL
+     value: "0"
+   - name: NCCL_P2P_NET_CHUNKSIZE
+     value: "524288"
+   - name: NCCL_P2P_PCI_CHUNKSIZE
+     value: "524288"
+   - name: NCCL_P2P_NVL_CHUNKSIZE
+     value: "1048576"
+   - name: NCCL_BUFFSIZE
+     value: "4194304"
+   - name: NCCL_MAX_NCHANNELS
+     value: "8"
+   - name: NCCL_MIN_NCHANNELS
+     value: "8"
+   - name: CUDA_VISIBLE_DEVICES
+     value: "0,1,2,3,4,5,6,7"
+   {% endif %}
+   {% if k8s_enable_gpudirect_tcpxo %}
+   - name: LD_LIBRARY_PATH
+     value: /usr/local/nvidia/lib64
+   - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+     value: /dev/aperture_devices
+   - name: NCCL_FASTRAK_CTRL_DEV
+     value: eth0
+   - name: NCCL_FASTRAK_IFNAME
+     value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
+   - name: NCCL_SOCKET_IFNAME
+     value: eth0
+   - name: NCCL_CROSS_NIC
+     value: "0"
+   - name: NCCL_ALGO
+     value: Ring,Tree
+   - name: NCCL_PROTO
+     value: Simple,LL128
+   - name: NCCL_MIN_NCHANNELS
+     value: "4"
+   - name: NCCL_TUNER_PLUGIN
+     value: libnccl-tuner.so
+   - name: NCCL_TUNER_CONFIG_PATH
+     value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
+   - name: CUDA_VISIBLE_DEVICES
+     value: "0,1,2,3,4,5,6,7"
+   {% endif %}
+   {% if k8s_enable_gpudirect_rdma %}
+   - name: LD_LIBRARY_PATH
+     value: /usr/local/nvidia/lib64
+   - name: NCCL_NET
+     value: gIB
+   - name: NCCL_CROSS_NIC
+     value: "0"
+   - name: NCCL_NET_GDR_LEVEL
+     value: PIX
+   - name: NCCL_P2P_NET_CHUNKSIZE
+     value: "131072"
+   - name: NCCL_NVLS_CHUNKSIZE
+     value: "524288"
+   - name: NCCL_IB_ADAPTIVE_ROUTING
+     value: "1"
+   - name: NCCL_IB_QPS_PER_CONNECTION
+     value: "4"
+   - name: NCCL_IB_TC
+     value: "52"
+   - name: NCCL_IB_FIFO_TC
+     value: "84"
+   {% if k8s_enable_gpudirect_rdma_a4 %}
+   - name: NCCL_TUNER_CONFIG_PATH
+     value: /usr/local/gib/configs/tuner_config_a4.txtpb
+   {% else %}
+   - name: NCCL_TUNER_CONFIG_PATH
+     value: /usr/local/gib/configs/tuner_config_a3u.txtpb
+   {% endif %}
+   {% endif %}
    {% if k8s_fuse_device_required %}
    - name: FUSERMOUNT_SHARED_DIR
      value: {{k8s_fusermount_shared_dir}}
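These NCCL settings reach the container as ordinary environment variables, so a quick way to confirm that a given GPUDirect mode took effect is to inspect the environment from inside the pod. An illustrative check (not part of this diff):

    # Illustrative sanity check, run inside the ray-node container: print a
    # few of the NCCL variables the template injects on the TCPX path.
    import os

    for var in ('NCCL_ALGO', 'NCCL_PROTO', 'NCCL_SOCKET_IFNAME',
                'NCCL_GPUDIRECTTCPX_CTRL_DEV', 'NCCL_MAX_NCHANNELS'):
        print(f'{var}={os.environ.get(var, "<unset>")}')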
@@ -752,11 +963,27 @@ available_node_types:
    - name: secret-volume
      readOnly: true
      mountPath: "/etc/secret-volume"
-   # This volume allocates shared memory for Ray to use for its plasma
-   # object store. If you do not provide this, Ray will fall back to
-   # /tmp which cause slowdowns if is not a shared memory volume.
    - mountPath: /dev/shm
      name: dshm
+   {% if k8s_enable_gpudirect_tcpx %}
+   - name: tcpx-socket
+     mountPath: /tmp
+   - name: libraries
+     mountPath: /usr/local/nvidia/lib64
+     readOnly: true
+   {% endif %}
+   {% if k8s_enable_gpudirect_tcpxo %}
+   - name: libraries
+     mountPath: /usr/local/nvidia
+   - name: aperture-devices
+     mountPath: /dev/aperture_devices
+   {% endif %}
+   {% if k8s_enable_gpudirect_rdma %}
+   - name: library-dir-host
+     mountPath: /usr/local/nvidia
+   - name: gib
+     mountPath: /usr/local/gib
+   {% endif %}
    {% if high_availability %}
    - name: {{k8s_high_availability_deployment_volume_mount_name}}
      mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -794,7 +1021,68 @@ available_node_types:
        add:
        - IPC_LOCK
    {% endif %}
-
+   {% if k8s_enable_gpudirect_tcpx %}
+   # GPUDirect TCPX daemon sidecar container
+   - name: tcpx-daemon
+     image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
+     imagePullPolicy: Always
+     command:
+     - /tcpgpudmarxd/build/app/tcpgpudmarxd
+     - --gpu_nic_preset
+     - a3vm
+     - --gpu_shmem_type
+     - fd
+     - --uds_path
+     - /run/tcpx
+     - --setup_param
+     - --verbose
+     - "128"
+     - "2"
+     - "0"
+     securityContext:
+       capabilities:
+         add:
+         - NET_ADMIN
+     volumeMounts:
+     - name: libraries
+       mountPath: /usr/local/nvidia/lib64
+       readOnly: true
+     - name: tcpx-socket
+       mountPath: /run/tcpx
+     - name: sys
+       mountPath: /hostsysfs
+     - name: proc-sys
+       mountPath: /hostprocsysfs
+     env:
+     - name: LD_LIBRARY_PATH
+       value: /usr/local/nvidia/lib64
+   {% endif %}
+   {% if k8s_enable_gpudirect_tcpxo %}
+   - name: tcpxo-daemon
+     image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
+     imagePullPolicy: Always
+     command: ["/bin/sh", "-c"]
+     args:
+     - |
+       set -ex
+       chmod 755 /fts/entrypoint_rxdm_container.sh
+       /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+     securityContext:
+       capabilities:
+         add:
+         - NET_ADMIN
+         - NET_BIND_SERVICE
+     volumeMounts:
+     - name: libraries
+       mountPath: /usr/local/nvidia
+     - name: sys
+       mountPath: /hostsysfs
+     - name: proc-sys
+       mountPath: /hostprocsysfs
+     env:
+     - name: LD_LIBRARY_PATH
+       value: /usr/local/nvidia/lib64
+   {% endif %}

    {% if high_availability %}
    pvc_spec:
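Taken together, the template now threads three mutually exclusive GPUDirect modes (TCPX, TCPXO, RDMA) through pod annotations, volumes, NCCL environment variables, and sidecar containers. The logic that decides which flag to set appears to live in sky/clouds/kubernetes.py and sky/provision/kubernetes/utils.py, which also changed in this release but whose diffs are not shown here. The helper below is only a sketch of the shape of that decision; the function name and its argument are invented for illustration:

    # Hypothetical helper (names invented, not SkyPilot's code path): map one
    # GPUDirect mode to the mutually exclusive flags that
    # kubernetes-ray.yml.j2 consumes above.
    def gpudirect_template_vars(mode: str) -> dict:
        modes = ('tcpx', 'tcpxo', 'rdma')
        if mode not in modes:
            raise ValueError(f'mode must be one of {modes}, got {mode!r}')
        return {f'k8s_enable_gpudirect_{m}': m == mode for m in modes}

    print(gpudirect_template_vars('tcpxo'))
    # {'k8s_enable_gpudirect_tcpx': False,
    #  'k8s_enable_gpudirect_tcpxo': True,
    #  'k8s_enable_gpudirect_rdma': False}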
sky/users/permission.py CHANGED
@@ -34,6 +34,11 @@ class PermissionService:
      """Permission service for SkyPilot API Server."""

      def __init__(self):
+         self.enforcer = None
+
+     def _lazy_initialize(self):
+         if self.enforcer is not None:
+             return
          with _policy_lock():
              global _enforcer_instance
              if _enforcer_instance is None:
@@ -73,7 +78,6 @@

      def _maybe_initialize_policies(self) -> None:
          """Initialize policies if they don't already exist."""
-         # TODO(zhwu): we should avoid running this on client side.
          logger.debug(f'Initializing policies in process: {os.getpid()}')
          self._load_policy_no_lock()

@@ -152,6 +156,7 @@

      def add_user_if_not_exists(self, user_id: str) -> None:
          """Add user role relationship."""
+         self._lazy_initialize()
          with _policy_lock():
              self._add_user_if_not_exists_no_lock(user_id)

@@ -171,6 +176,7 @@

      def delete_user(self, user_id: str) -> None:
          """Delete user role relationship."""
+         self._lazy_initialize()
          with _policy_lock():
              # Get current roles
              self._load_policy_no_lock()
@@ -184,6 +190,7 @@

      def update_role(self, user_id: str, new_role: str) -> None:
          """Update user role relationship."""
+         self._lazy_initialize()
          with _policy_lock():
              # Get current roles
              self._load_policy_no_lock()
@@ -216,6 +223,7 @@
          Returns:
              A list of role names that the user has.
          """
+         self._lazy_initialize()
          self._load_policy_no_lock()
          return self.enforcer.get_roles_for_user(user_id)

@@ -228,6 +236,7 @@
          # it is a hot path in every request. It is ok to have a stale policy,
          # as long as it is eventually consistent.
          # self._load_policy_no_lock()
+         self._lazy_initialize()
          return self.enforcer.enforce(user_id, path, method)

      def _load_policy_no_lock(self):
@@ -236,6 +245,7 @@

      def load_policy(self):
          """Load policy from storage with lock."""
+         self._lazy_initialize()
          with _policy_lock():
              self._load_policy_no_lock()

@@ -251,6 +261,7 @@
          For public workspaces, the permission is granted via a wildcard policy
          ('*').
          """
+         self._lazy_initialize()
          if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
              # When it is not on API server, we allow all users to access all
              # workspaces, as the workspace check has been done on API server.
@@ -307,6 +318,7 @@
          For public workspaces, this should be ['*'].
          For private workspaces, this should be specific user IDs.
          """
+         self._lazy_initialize()
          with _policy_lock():
              for user in users:
                  logger.debug(f'Adding workspace policy: user={user}, '
@@ -324,6 +336,7 @@
          For public workspaces, this should be ['*'].
          For private workspaces, this should be specific user IDs.
          """
+         self._lazy_initialize()
          with _policy_lock():
              self._load_policy_no_lock()
              # Remove all existing policies for this workspace
@@ -337,6 +350,7 @@

      def remove_workspace_policy(self, workspace_name: str) -> None:
          """Remove workspace policy."""
+         self._lazy_initialize()
          with _policy_lock():
              self.enforcer.remove_filtered_policy(1, workspace_name)
              self.enforcer.save_policy()
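The pattern in this file is consistent: `__init__` no longer builds the casbin enforcer, and every public entry point calls `_lazy_initialize()` first, so the costly setup runs on first use rather than at construction time. A stripped-down sketch of the pattern; the enforcer stand-in below is illustrative, not the real casbin setup:

    # Sketch of the lazy-initialization pattern applied above. The real class
    # builds a casbin enforcer under _policy_lock(); object() stands in here.
    class LazyPermissionService:
        def __init__(self):
            self.enforcer = None  # was built eagerly before this change

        def _lazy_initialize(self):
            if self.enforcer is not None:
                return
            self.enforcer = object()  # placeholder for the expensive setup

        def get_user_roles(self, user_id: str) -> list:
            self._lazy_initialize()  # every public method guards itself
            return []  # the real method queries the enforcer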
sky/users/token_service.py CHANGED
@@ -5,6 +5,7 @@ import datetime
  import hashlib
  import os
  import secrets
+ import threading
  from typing import Any, Dict, Generator, Optional

  import filelock
@@ -44,12 +45,21 @@ class TokenService:
      """Service for managing JWT-based service account tokens."""

      def __init__(self):
-         self.secret_key = self._get_or_generate_secret()
+         self.secret_key = None
+         self.init_lock = threading.Lock()
+
+     def _lazy_initialize(self):
+         if self.secret_key is not None:
+             return
+         with self.init_lock:
+             if self.secret_key is not None:
+                 return
+             self.secret_key = self._get_or_generate_secret()

      def _get_or_generate_secret(self) -> str:
          """Get JWT secret from database or generate a new one."""
-         with _jwt_secret_lock():
-             # Try to get from database (persistent across deployments)
+
+         def _get_secret_from_db():
              try:
                  db_secret = global_user_state.get_system_config(
                      JWT_SECRET_DB_KEY)
@@ -58,7 +68,17 @@
                      return db_secret
              except Exception as e:  # pylint: disable=broad-except
                  logger.debug(f'Failed to get JWT secret from database: {e}')
+             return None
+
+         # Try to get from database (persistent across deployments)
+         token_from_db = _get_secret_from_db()
+         if token_from_db:
+             return token_from_db

+         with _jwt_secret_lock():
+             token_from_db = _get_secret_from_db()
+             if token_from_db:
+                 return token_from_db
              # Generate a new secret and store in database
              new_secret = secrets.token_urlsafe(64)
              try:
@@ -91,6 +111,7 @@
          Returns:
              Dict containing token info including the JWT token
          """
+         self._lazy_initialize()
          now = datetime.datetime.now(datetime.timezone.utc)
          token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT

@@ -144,6 +165,7 @@
          Returns:
              Decoded token payload or None if invalid
          """
+         self._lazy_initialize()
          if not token.startswith('sky_'):
              return None

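Unlike PermissionService, TokenService guards its lazy path with a lock and checks the secret twice, once before taking the lock and once after, so concurrent first callers do not each generate a JWT secret. The same double-checked pattern in isolation, with illustrative names:

    # Double-checked locking as used above: the unlocked check is the fast
    # path; the re-check under the lock ensures only one thread generates
    # the secret.
    import secrets
    import threading

    class SecretHolder:
        def __init__(self):
            self.secret_key = None
            self.init_lock = threading.Lock()

        def _lazy_initialize(self):
            if self.secret_key is not None:  # fast path, no lock taken
                return
            with self.init_lock:
                if self.secret_key is not None:  # lost the race: already set
                    return
                self.secret_key = secrets.token_urlsafe(64)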
sky/utils/schemas.py CHANGED
@@ -1106,6 +1106,9 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
          },
      },
  },
+ 'remote_identity': {
+     'type': 'string',
+ }
  }


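The new entry allows a string-valued `remote_identity` inside a per-context Kubernetes config. A minimal illustration of what the added property accepts, with the surrounding schema abbreviated to just the new field and the account name invented for the example:

    # Abbreviated sketch: only the property added by this diff is kept.
    import jsonschema

    context_schema = {
        'type': 'object',
        'properties': {
            'remote_identity': {
                'type': 'string',
            },
        },
    }

    jsonschema.validate({'remote_identity': 'my-service-account'},
                        context_schema)  # a string passes
    try:
        jsonschema.validate({'remote_identity': 123}, context_schema)
    except jsonschema.ValidationError as err:
        print('rejected:', err.message)  # non-strings are rejected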
{skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: skypilot-nightly
- Version: 1.0.0.dev20250710
+ Version: 1.0.0.dev20250711
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
  Author: SkyPilot Team
  License: Apache 2.0