skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +6 -4
  3. sky/clouds/kubernetes.py +137 -23
  4. sky/core.py +3 -1
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
  7. sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-60070a62f55486a6.js} +1 -1
  9. sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
  10. sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/global_user_state.py +10 -11
  26. sky/jobs/constants.py +1 -1
  27. sky/jobs/controller.py +7 -0
  28. sky/jobs/server/core.py +2 -1
  29. sky/jobs/server/utils.py +81 -0
  30. sky/jobs/state.py +58 -40
  31. sky/jobs/utils.py +45 -6
  32. sky/provision/kubernetes/instance.py +17 -0
  33. sky/provision/kubernetes/utils.py +134 -0
  34. sky/provision/provisioner.py +20 -0
  35. sky/skylet/constants.py +1 -6
  36. sky/skylet/job_lib.py +30 -8
  37. sky/skypilot_config.py +8 -3
  38. sky/task.py +17 -0
  39. sky/templates/kubernetes-ray.yml.j2 +298 -10
  40. sky/users/permission.py +18 -1
  41. sky/users/token_service.py +25 -3
  42. sky/utils/common_utils.py +13 -0
  43. sky/utils/db_utils.py +16 -0
  44. sky/utils/schemas.py +6 -0
  45. sky/utils/ux_utils.py +2 -4
  46. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
  47. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +55 -54
  48. sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
  49. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
  50. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
  51. /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
  52. /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
  53. /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
  54. /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -293,13 +293,82 @@ available_node_types:
       kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
       kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
     {% endif %}
-    {% if k8s_kueue_local_queue_name %}
+    {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
     annotations:
-      kueue.x-k8s.io/retriable-in-group: "false"
-      kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
-      {% if k8s_max_run_duration_seconds %}
-      provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
-      {% endif %}
+      {% if k8s_kueue_local_queue_name %}
+      kueue.x-k8s.io/retriable-in-group: "false"
+      kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
+      {% if k8s_max_run_duration_seconds %}
+      provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
+      {% endif %}
+      {% endif %}
+      # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
+      # Values from google cloud guide
+      {% if k8s_enable_gpudirect_tcpx %}
+      devices.gke.io/container.tcpx-daemon: |+
+        - path: /dev/nvidia0
+        - path: /dev/nvidia1
+        - path: /dev/nvidia2
+        - path: /dev/nvidia3
+        - path: /dev/nvidia4
+        - path: /dev/nvidia5
+        - path: /dev/nvidia6
+        - path: /dev/nvidia7
+        - path: /dev/nvidiactl
+        - path: /dev/nvidia-uvm
+      networking.gke.io/default-interface: 'eth0'
+      networking.gke.io/interfaces: |
+        [
+          {"interfaceName":"eth0","network":"default"},
+          {"interfaceName":"eth1","network":"vpc1"},
+          {"interfaceName":"eth2","network":"vpc2"},
+          {"interfaceName":"eth3","network":"vpc3"},
+          {"interfaceName":"eth4","network":"vpc4"}
+        ]
+      {% endif %}
+      {% if k8s_enable_gpudirect_tcpxo %}
+      devices.gke.io/container.tcpxo-daemon: |+
+        - path: /dev/nvidia0
+        - path: /dev/nvidia1
+        - path: /dev/nvidia2
+        - path: /dev/nvidia3
+        - path: /dev/nvidia4
+        - path: /dev/nvidia5
+        - path: /dev/nvidia6
+        - path: /dev/nvidia7
+        - path: /dev/nvidiactl
+        - path: /dev/nvidia-uvm
+        - path: /dev/dmabuf_import_helper
+      networking.gke.io/default-interface: 'eth0'
+      networking.gke.io/interfaces: |
+        [
+          {"interfaceName":"eth0","network":"default"},
+          {"interfaceName":"eth1","network":"vpc1"},
+          {"interfaceName":"eth2","network":"vpc2"},
+          {"interfaceName":"eth3","network":"vpc3"},
+          {"interfaceName":"eth4","network":"vpc4"},
+          {"interfaceName":"eth5","network":"vpc5"},
+          {"interfaceName":"eth6","network":"vpc6"},
+          {"interfaceName":"eth7","network":"vpc7"},
+          {"interfaceName":"eth8","network":"vpc8"}
+        ]
+      {% endif %}
+      {% if k8s_enable_gpudirect_rdma %}
+      networking.gke.io/default-interface: 'eth0'
+      networking.gke.io/interfaces: |
+        [
+          {"interfaceName":"eth0","network":"default"},
+          {"interfaceName":"eth1","network":"gvnic-1"},
+          {"interfaceName":"eth2","network":"rdma-0"},
+          {"interfaceName":"eth3","network":"rdma-1"},
+          {"interfaceName":"eth4","network":"rdma-2"},
+          {"interfaceName":"eth5","network":"rdma-3"},
+          {"interfaceName":"eth6","network":"rdma-4"},
+          {"interfaceName":"eth7","network":"rdma-5"},
+          {"interfaceName":"eth8","network":"rdma-6"},
+          {"interfaceName":"eth9","network":"rdma-7"}
+        ]
+      {% endif %}
     {% endif %}
   spec:
     # serviceAccountName: skypilot-service-account
@@ -396,6 +465,41 @@ available_node_types:
         persistentVolumeClaim:
           claimName: {{volume_mount.volume_name_on_cloud}}
       {% endfor %}
+      {% if k8s_enable_gpudirect_tcpx %}
+      - name: libraries
+        hostPath:
+          path: /home/kubernetes/bin/nvidia/lib64
+      - name: tcpx-socket
+        emptyDir: {}
+      - name: sys
+        hostPath:
+          path: /sys
+      - name: proc-sys
+        hostPath:
+          path: /proc/sys
+      {% endif %}
+      {% if k8s_enable_gpudirect_tcpxo %}
+      - name: libraries
+        hostPath:
+          path: /home/kubernetes/bin/nvidia
+      - name: sys
+        hostPath:
+          path: /sys
+      - name: proc-sys
+        hostPath:
+          path: /proc/sys
+      - name: aperture-devices
+        hostPath:
+          path: /dev/aperture_devices
+      {% endif %}
+      {% if k8s_enable_gpudirect_rdma %}
+      - name: library-dir-host
+        hostPath:
+          path: /home/kubernetes/bin/nvidia
+      - name: gib
+        hostPath:
+          path: /home/kubernetes/bin/gib
+      {% endif %}
       containers:
       - name: ray-node
         imagePullPolicy: Always
@@ -409,6 +513,113 @@ available_node_types:
         - name: {{ key }}
           value: {{ value }}
         {% endfor %}
+        # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
+        # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
+        {% if k8s_enable_gpudirect_tcpx %}
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
+        - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
+          value: eth1,eth2,eth3,eth4
+        - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
+          value: eth0
+        - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
+          value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+        - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
+          value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+        - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
+          value: "500000"
+        - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
+          value: "/tmp"
+        - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
+          value: "0"
+        - name: NCCL_SOCKET_IFNAME
+          value: eth0
+        - name: NCCL_CROSS_NIC
+          value: "0"
+        - name: NCCL_ALGO
+          value: Ring
+        - name: NCCL_PROTO
+          value: Simple
+        - name: NCCL_NSOCKS_PERTHREAD
+          value: "4"
+        - name: NCCL_SOCKET_NTHREADS
+          value: "1"
+        - name: NCCL_NET_GDR_LEVEL
+          value: PIX
+        - name: NCCL_DYNAMIC_CHUNK_SIZE
+          value: "524288"
+        - name: NCCL_P2P_PXN_LEVEL
+          value: "0"
+        - name: NCCL_P2P_NET_CHUNKSIZE
+          value: "524288"
+        - name: NCCL_P2P_PCI_CHUNKSIZE
+          value: "524288"
+        - name: NCCL_P2P_NVL_CHUNKSIZE
+          value: "1048576"
+        - name: NCCL_BUFFSIZE
+          value: "4194304"
+        - name: NCCL_MAX_NCHANNELS
+          value: "8"
+        - name: NCCL_MIN_NCHANNELS
+          value: "8"
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0,1,2,3,4,5,6,7"
+        {% endif %}
+        {% if k8s_enable_gpudirect_tcpxo %}
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib64
+        - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+          value: /dev/aperture_devices
+        - name: NCCL_FASTRAK_CTRL_DEV
+          value: eth0
+        - name: NCCL_FASTRAK_IFNAME
+          value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
+        - name: NCCL_SOCKET_IFNAME
+          value: eth0
+        - name: NCCL_CROSS_NIC
+          value: "0"
+        - name: NCCL_ALGO
+          value: Ring,Tree
+        - name: NCCL_PROTO
+          value: Simple,LL128
+        - name: NCCL_MIN_NCHANNELS
+          value: "4"
+        - name: NCCL_TUNER_PLUGIN
+          value: libnccl-tuner.so
+        - name: NCCL_TUNER_CONFIG_PATH
+          value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0,1,2,3,4,5,6,7"
+        {% endif %}
+        {% if k8s_enable_gpudirect_rdma %}
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib64
+        - name: NCCL_NET
+          value: gIB
+        - name: NCCL_CROSS_NIC
+          value: "0"
+        - name: NCCL_NET_GDR_LEVEL
+          value: PIX
+        - name: NCCL_P2P_NET_CHUNKSIZE
+          value: "131072"
+        - name: NCCL_NVLS_CHUNKSIZE
+          value: "524288"
+        - name: NCCL_IB_ADAPTIVE_ROUTING
+          value: "1"
+        - name: NCCL_IB_QPS_PER_CONNECTION
+          value: "4"
+        - name: NCCL_IB_TC
+          value: "52"
+        - name: NCCL_IB_FIFO_TC
+          value: "84"
+        {% if k8s_enable_gpudirect_rdma_a4 %}
+        - name: NCCL_TUNER_CONFIG_PATH
+          value: /usr/local/gib/configs/tuner_config_a4.txtpb
+        {% else %}
+        - name: NCCL_TUNER_CONFIG_PATH
+          value: /usr/local/gib/configs/tuner_config_a3u.txtpb
+        {% endif %}
+        {% endif %}
        {% if k8s_fuse_device_required %}
        - name: FUSERMOUNT_SHARED_DIR
          value: {{k8s_fusermount_shared_dir}}
@@ -752,11 +963,27 @@ available_node_types:
         - name: secret-volume
           readOnly: true
           mountPath: "/etc/secret-volume"
-        # This volume allocates shared memory for Ray to use for its plasma
-        # object store. If you do not provide this, Ray will fall back to
-        # /tmp which cause slowdowns if is not a shared memory volume.
         - mountPath: /dev/shm
           name: dshm
+        {% if k8s_enable_gpudirect_tcpx %}
+        - name: tcpx-socket
+          mountPath: /tmp
+        - name: libraries
+          mountPath: /usr/local/nvidia/lib64
+          readOnly: true
+        {% endif %}
+        {% if k8s_enable_gpudirect_tcpxo %}
+        - name: libraries
+          mountPath: /usr/local/nvidia
+        - name: aperture-devices
+          mountPath: /dev/aperture_devices
+        {% endif %}
+        {% if k8s_enable_gpudirect_rdma %}
+        - name: library-dir-host
+          mountPath: /usr/local/nvidia
+        - name: gib
+          mountPath: /usr/local/gib
+        {% endif %}
         {% if high_availability %}
         - name: {{k8s_high_availability_deployment_volume_mount_name}}
           mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -794,7 +1021,68 @@ available_node_types:
             add:
             - IPC_LOCK
         {% endif %}
-
+      {% if k8s_enable_gpudirect_tcpx %}
+      # GPUDirect TCPX daemon sidecar container
+      - name: tcpx-daemon
+        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
+        imagePullPolicy: Always
+        command:
+        - /tcpgpudmarxd/build/app/tcpgpudmarxd
+        - --gpu_nic_preset
+        - a3vm
+        - --gpu_shmem_type
+        - fd
+        - --uds_path
+        - /run/tcpx
+        - --setup_param
+        - --verbose
+        - "128"
+        - "2"
+        - "0"
+        securityContext:
+          capabilities:
+            add:
+            - NET_ADMIN
+        volumeMounts:
+        - name: libraries
+          mountPath: /usr/local/nvidia/lib64
+          readOnly: true
+        - name: tcpx-socket
+          mountPath: /run/tcpx
+        - name: sys
+          mountPath: /hostsysfs
+        - name: proc-sys
+          mountPath: /hostprocsysfs
+        env:
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib64
+      {% endif %}
+      {% if k8s_enable_gpudirect_tcpxo %}
+      - name: tcpxo-daemon
+        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
+        imagePullPolicy: Always
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          set -ex
+          chmod 755 /fts/entrypoint_rxdm_container.sh
+          /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+        securityContext:
+          capabilities:
+            add:
+            - NET_ADMIN
+            - NET_BIND_SERVICE
+        volumeMounts:
+        - name: libraries
+          mountPath: /usr/local/nvidia
+        - name: sys
+          mountPath: /hostsysfs
+        - name: proc-sys
+          mountPath: /hostprocsysfs
+        env:
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib64
+      {% endif %}
 
   {% if high_availability %}
   pvc_spec:
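
Note: every GPUDirect block above is gated on new template flags (k8s_enable_gpudirect_tcpx, k8s_enable_gpudirect_tcpxo, k8s_enable_gpudirect_rdma, plus k8s_enable_gpudirect_rdma_a4 for the tuner config). As a rough sketch of how an unset flag drops a whole block during rendering, using jinja2 directly on a toy inline template rather than SkyPilot's actual rendering path:

    # Sketch only: a toy stand-in for sky/templates/kubernetes-ray.yml.j2.
    import jinja2

    toy_template = jinja2.Template(
        "metadata:\n"
        "{% if k8s_enable_gpudirect_tcpx %}"
        "  annotations:\n"
        "    devices.gke.io/container.tcpx-daemon: ...\n"
        "{% endif %}"
        "spec: {}\n")

    print(toy_template.render(k8s_enable_gpudirect_tcpx=True))  # block present
    print(toy_template.render())  # flag undefined -> block omitted
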
sky/users/permission.py CHANGED
@@ -15,6 +15,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
 from sky.utils import common_utils
+from sky.utils import db_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
@@ -33,11 +34,18 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
+        self.enforcer = None
+
+    def _lazy_initialize(self):
+        if self.enforcer is not None:
+            return
         with _policy_lock():
             global _enforcer_instance
             if _enforcer_instance is None:
                 _enforcer_instance = self
                 engine = global_user_state.initialize_and_get_db()
+                db_utils.add_tables_to_db_sqlalchemy(
+                    sqlalchemy_adapter.Base.metadata, engine)
                 adapter = sqlalchemy_adapter.Adapter(engine)
                 model_path = os.path.join(os.path.dirname(__file__),
                                           'model.conf')
@@ -70,7 +78,6 @@
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
-        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
@@ -149,6 +156,7 @@
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -168,6 +176,7 @@
 
     def delete_user(self, user_id: str) -> None:
         """Delete user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -181,6 +190,7 @@
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -213,6 +223,7 @@
         Returns:
             A list of role names that the user has.
         """
+        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -225,6 +236,7 @@
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
+        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -233,6 +245,7 @@
 
     def load_policy(self):
         """Load policy from storage with lock."""
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -248,6 +261,7 @@
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
+        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -304,6 +318,7 @@
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -321,6 +336,7 @@
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -334,6 +350,7 @@
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
+        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
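
Note: the net effect of this diff is that the Casbin enforcer is no longer built in __init__; every public entry point first calls _lazy_initialize(), so the database and enforcer are only set up on first use. A minimal sketch of the pattern, with a hypothetical Resource standing in for the enforcer (the real code additionally serializes setup with _policy_lock()):

    # Sketch of the lazy-initialization pattern used by PermissionService.
    class Resource:
        def __init__(self):
            print('expensive setup runs on first use, not at construction')

    class Service:
        def __init__(self):
            self.resource = None  # cheap: defer all real work

        def _lazy_initialize(self):
            if self.resource is not None:
                return
            self.resource = Resource()

        def do_work(self):
            self._lazy_initialize()  # called at every public entry point
            return self.resource

    svc = Service()  # no setup yet
    svc.do_work()    # prints once: setup happens here
    svc.do_work()    # reuses the initialized resource
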
sky/users/token_service.py CHANGED
@@ -5,6 +5,7 @@ import datetime
 import hashlib
 import os
 import secrets
+import threading
 from typing import Any, Dict, Generator, Optional
 
 import filelock
@@ -44,12 +45,21 @@
     """Service for managing JWT-based service account tokens."""
 
     def __init__(self):
-        self.secret_key = self._get_or_generate_secret()
+        self.secret_key = None
+        self.init_lock = threading.Lock()
+
+    def _lazy_initialize(self):
+        if self.secret_key is not None:
+            return
+        with self.init_lock:
+            if self.secret_key is not None:
+                return
+            self.secret_key = self._get_or_generate_secret()
 
     def _get_or_generate_secret(self) -> str:
         """Get JWT secret from database or generate a new one."""
-        with _jwt_secret_lock():
-            # Try to get from database (persistent across deployments)
+
+        def _get_secret_from_db():
             try:
                 db_secret = global_user_state.get_system_config(
                     JWT_SECRET_DB_KEY)
@@ -58,7 +68,17 @@
                     return db_secret
             except Exception as e:  # pylint: disable=broad-except
                 logger.debug(f'Failed to get JWT secret from database: {e}')
+            return None
+
+        # Try to get from database (persistent across deployments)
+        token_from_db = _get_secret_from_db()
+        if token_from_db:
+            return token_from_db
 
+        with _jwt_secret_lock():
+            token_from_db = _get_secret_from_db()
+            if token_from_db:
+                return token_from_db
             # Generate a new secret and store in database
             new_secret = secrets.token_urlsafe(64)
             try:
@@ -91,6 +111,7 @@
         Returns:
             Dict containing token info including the JWT token
         """
+        self._lazy_initialize()
         now = datetime.datetime.now(datetime.timezone.utc)
         token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT
 
@@ -144,6 +165,7 @@
         Returns:
             Decoded token payload or None if invalid
         """
+        self._lazy_initialize()
         if not token.startswith('sky_'):
             return None
 
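Note: token_service.py applies the same deferral, but guards it with double-checked locking: an unlocked fast path, then a re-check under init_lock so concurrent first callers generate the JWT secret only once. A self-contained sketch of the idiom, where secrets.token_urlsafe stands in for the real DB-backed lookup in _get_or_generate_secret:

    # Double-checked locking, as in TokenService._lazy_initialize().
    import secrets
    import threading

    class TokenServiceSketch:
        def __init__(self):
            self.secret_key = None
            self.init_lock = threading.Lock()

        def _lazy_initialize(self):
            if self.secret_key is not None:  # fast path, no lock taken
                return
            with self.init_lock:
                if self.secret_key is not None:  # another thread won the race
                    return
                self.secret_key = secrets.token_urlsafe(64)
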
sky/utils/common_utils.py CHANGED
@@ -11,6 +11,7 @@ import platform
 import random
 import re
 import socket
+import subprocess
 import sys
 import time
 import typing
@@ -87,6 +88,18 @@ def generate_user_hash() -> str:
     return user_hash
 
 
+def get_git_commit(path: Optional[str] = None) -> Optional[str]:
+    try:
+        result = subprocess.run(['git', 'rev-parse', 'HEAD'],
+                                capture_output=True,
+                                text=True,
+                                cwd=path,
+                                check=True)
+        return result.stdout.strip()
+    except subprocess.CalledProcessError:
+        return None
+
+
 def get_user_hash() -> str:
     """Returns a unique user-machine specific hash as a user id.
 
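Note: a quick usage sketch for the new helper (the path below is hypothetical). get_git_commit() returns the HEAD commit of a checkout, or None when `git rev-parse` exits nonzero; only CalledProcessError is caught, so a missing git binary would still raise FileNotFoundError.

    # Illustrative only; the checkout path is made up.
    from sky.utils import common_utils

    commit = common_utils.get_git_commit('/path/to/some/checkout')
    print(commit[:12] if commit else 'not a git checkout')
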
sky/utils/db_utils.py CHANGED
@@ -84,6 +84,22 @@ def add_column_to_table(
         conn.commit()
 
 
+def add_tables_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+):
+    """Add tables to the database."""
+    for table in metadata.tables.values():
+        try:
+            table.create(bind=engine, checkfirst=True)
+        except (sqlalchemy_exc.OperationalError,
+                sqlalchemy_exc.ProgrammingError) as e:
+            if 'already exists' in str(e):
+                pass
+            else:
+                raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
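
Note: this helper is what permission.py above now calls with casbin's sqlalchemy_adapter.Base.metadata so the policy tables exist before the adapter touches them. A toy usage sketch with a throwaway in-memory SQLite engine (the example table is made up):

    # Usage sketch for sky.utils.db_utils.add_tables_to_db_sqlalchemy().
    import sqlalchemy

    from sky.utils import db_utils

    metadata = sqlalchemy.MetaData()
    sqlalchemy.Table(
        'example', metadata,
        sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('name', sqlalchemy.String(64)),
    )

    engine = sqlalchemy.create_engine('sqlite://')  # in-memory database
    db_utils.add_tables_to_db_sqlalchemy(metadata, engine)
    # Idempotent: checkfirst=True plus the 'already exists' guard make a
    # second call a no-op.
    db_utils.add_tables_to_db_sqlalchemy(metadata, engine)
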
sky/utils/schemas.py CHANGED
@@ -870,6 +870,9 @@ def get_task_schema():
             'type': 'array',
             'items': get_volume_mount_schema(),
         },
+        '_metadata': {
+            'type': 'object',
+        },
         **_experimental_task_schema(),
     }
 }
@@ -1103,6 +1106,9 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
             },
         },
     },
+    'remote_identity': {
+        'type': 'string',
+    }
 }
 
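Note: the two schema additions let a task carry an arbitrary _metadata object and allow a per-context remote_identity string in Kubernetes config. A hedged sketch of what the first change permits, validated with the jsonschema package; the trimmed schema is a stand-in for get_task_schema() and the payload is illustrative:

    # Trimmed stand-in for sky.utils.schemas.get_task_schema(); only the
    # new '_metadata' property is shown.
    import jsonschema

    task_schema = {
        'type': 'object',
        'properties': {
            'name': {'type': 'string'},
            '_metadata': {'type': 'object'},
        },
    }

    jsonschema.validate({'name': 'train', '_metadata': {'note': 'anything'}},
                        task_schema)  # passes: '_metadata' is a valid key
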
sky/utils/ux_utils.py CHANGED
@@ -253,9 +253,7 @@ def command_hint_messages(hint_type: CommandHintType,
                f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
                f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
                f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
-               f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
-               f'{BOLD}sky jobs queue{RESET_BOLD}'
-               f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
-               f'{BOLD}sky jobs dashboard{RESET_BOLD}')
+               f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
+               f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')
{skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250709
+Version: 1.0.0.dev20250711
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0