skypilot-nightly 1.0.0.dev20250710__py3-none-any.whl → 1.0.0.dev20250712__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/clouds/kubernetes.py +137 -23
- sky/core.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1043-1b39779691bb4030.js → 1043-5e5ef6198735ff7e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1871-cf1a47986d716dd2.js +6 -0
- sky/dashboard/out/_next/static/chunks/6601-d38d10f957dff832.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6ff4e45dfb49d11d.js → 6989-eab0e9c16b64fd9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/938-8e25c8ea0baa271a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4608dc89f95eba89.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-980d6f6b64ca7833.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-fd62f17bd9ce1fcc.js → webpack-4d50ce5087a63a95.js} +1 -1
- sky/dashboard/out/_next/static/css/a713705ccc8fe059.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +10 -11
- sky/jobs/state.py +10 -11
- sky/jobs/utils.py +11 -3
- sky/optimizer.py +22 -14
- sky/provision/kubernetes/utils.py +132 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +4 -1
- sky/templates/kubernetes-ray.yml.j2 +298 -10
- sky/users/permission.py +15 -1
- sky/users/token_service.py +25 -3
- sky/utils/schemas.py +3 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/RECORD +49 -49
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
- sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +0 -16
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
- /sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-49ff6c04332cc621.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -293,13 +293,82 @@ available_node_types:
         kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
         kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
         {% endif %}
-        {% if k8s_kueue_local_queue_name %}
+        {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
         annotations:
-          kueue.x-k8s.io/retriable-in-group: "false"
-          kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
-          {% if k8s_max_run_duration_seconds %}
-          provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
-          {% endif %}
+          {% if k8s_kueue_local_queue_name %}
+          kueue.x-k8s.io/retriable-in-group: "false"
+          kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
+          {% if k8s_max_run_duration_seconds %}
+          provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
+          {% endif %}
+          {% endif %}
+          # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
+          # Values from google cloud guide
+          {% if k8s_enable_gpudirect_tcpx %}
+          devices.gke.io/container.tcpx-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          devices.gke.io/container.tcpxo-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+            - path: /dev/dmabuf_import_helper
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"},
+              {"interfaceName":"eth5","network":"vpc5"},
+              {"interfaceName":"eth6","network":"vpc6"},
+              {"interfaceName":"eth7","network":"vpc7"},
+              {"interfaceName":"eth8","network":"vpc8"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"gvnic-1"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"},
+              {"interfaceName":"eth6","network":"rdma-4"},
+              {"interfaceName":"eth7","network":"rdma-5"},
+              {"interfaceName":"eth8","network":"rdma-6"},
+              {"interfaceName":"eth9","network":"rdma-7"}
+            ]
+          {% endif %}
         {% endif %}
       spec:
         # serviceAccountName: skypilot-service-account
@@ -396,6 +465,41 @@ available_node_types:
           persistentVolumeClaim:
             claimName: {{volume_mount.volume_name_on_cloud}}
        {% endfor %}
+        {% if k8s_enable_gpudirect_tcpx %}
+        - name: libraries
+          hostPath:
+            path: /home/kubernetes/bin/nvidia/lib64
+        - name: tcpx-socket
+          emptyDir: {}
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: proc-sys
+          hostPath:
+            path: /proc/sys
+        {% endif %}
+        {% if k8s_enable_gpudirect_tcpxo %}
+        - name: libraries
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: proc-sys
+          hostPath:
+            path: /proc/sys
+        - name: aperture-devices
+          hostPath:
+            path: /dev/aperture_devices
+        {% endif %}
+        {% if k8s_enable_gpudirect_rdma %}
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        {% endif %}
       containers:
         - name: ray-node
           imagePullPolicy: Always
@@ -409,6 +513,113 @@ available_node_types:
           - name: {{ key }}
             value: {{ value }}
           {% endfor %}
+          # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
+          # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
+          {% if k8s_enable_gpudirect_tcpx %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
+          - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
+            value: eth1,eth2,eth3,eth4
+          - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
+            value: eth0
+          - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
+            value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+          - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
+            value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+          - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
+            value: "500000"
+          - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
+            value: "/tmp"
+          - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
+            value: "0"
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_ALGO
+            value: Ring
+          - name: NCCL_PROTO
+            value: Simple
+          - name: NCCL_NSOCKS_PERTHREAD
+            value: "4"
+          - name: NCCL_SOCKET_NTHREADS
+            value: "1"
+          - name: NCCL_NET_GDR_LEVEL
+            value: PIX
+          - name: NCCL_DYNAMIC_CHUNK_SIZE
+            value: "524288"
+          - name: NCCL_P2P_PXN_LEVEL
+            value: "0"
+          - name: NCCL_P2P_NET_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_P2P_PCI_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_P2P_NVL_CHUNKSIZE
+            value: "1048576"
+          - name: NCCL_BUFFSIZE
+            value: "4194304"
+          - name: NCCL_MAX_NCHANNELS
+            value: "8"
+          - name: NCCL_MIN_NCHANNELS
+            value: "8"
+          - name: CUDA_VISIBLE_DEVICES
+            value: "0,1,2,3,4,5,6,7"
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64
+          - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+            value: /dev/aperture_devices
+          - name: NCCL_FASTRAK_CTRL_DEV
+            value: eth0
+          - name: NCCL_FASTRAK_IFNAME
+            value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_ALGO
+            value: Ring,Tree
+          - name: NCCL_PROTO
+            value: Simple,LL128
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: NCCL_TUNER_PLUGIN
+            value: libnccl-tuner.so
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
+          - name: CUDA_VISIBLE_DEVICES
+            value: "0,1,2,3,4,5,6,7"
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64
+          - name: NCCL_NET
+            value: gIB
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_NET_GDR_LEVEL
+            value: PIX
+          - name: NCCL_P2P_NET_CHUNKSIZE
+            value: "131072"
+          - name: NCCL_NVLS_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_IB_ADAPTIVE_ROUTING
+            value: "1"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "4"
+          - name: NCCL_IB_TC
+            value: "52"
+          - name: NCCL_IB_FIFO_TC
+            value: "84"
+          {% if k8s_enable_gpudirect_rdma_a4 %}
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/gib/configs/tuner_config_a4.txtpb
+          {% else %}
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/gib/configs/tuner_config_a3u.txtpb
+          {% endif %}
+          {% endif %}
          {% if k8s_fuse_device_required %}
          - name: FUSERMOUNT_SHARED_DIR
            value: {{k8s_fusermount_shared_dir}}
@@ -752,11 +963,27 @@ available_node_types:
            - name: secret-volume
              readOnly: true
              mountPath: "/etc/secret-volume"
-            # This volume allocates shared memory for Ray to use for its plasma
-            # object store. If you do not provide this, Ray will fall back to
-            # /tmp which cause slowdowns if is not a shared memory volume.
            - mountPath: /dev/shm
              name: dshm
+            {% if k8s_enable_gpudirect_tcpx %}
+            - name: tcpx-socket
+              mountPath: /tmp
+            - name: libraries
+              mountPath: /usr/local/nvidia/lib64
+              readOnly: true
+            {% endif %}
+            {% if k8s_enable_gpudirect_tcpxo %}
+            - name: libraries
+              mountPath: /usr/local/nvidia
+            - name: aperture-devices
+              mountPath: /dev/aperture_devices
+            {% endif %}
+            {% if k8s_enable_gpudirect_rdma %}
+            - name: library-dir-host
+              mountPath: /usr/local/nvidia
+            - name: gib
+              mountPath: /usr/local/gib
+            {% endif %}
            {% if high_availability %}
            - name: {{k8s_high_availability_deployment_volume_mount_name}}
              mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -794,7 +1021,68 @@ available_node_types:
                add:
                - IPC_LOCK
            {% endif %}
-
+        {% if k8s_enable_gpudirect_tcpx %}
+        # GPUDirect TCPX daemon sidecar container
+        - name: tcpx-daemon
+          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
+          imagePullPolicy: Always
+          command:
+            - /tcpgpudmarxd/build/app/tcpgpudmarxd
+            - --gpu_nic_preset
+            - a3vm
+            - --gpu_shmem_type
+            - fd
+            - --uds_path
+            - /run/tcpx
+            - --setup_param
+            - --verbose
+            - "128"
+            - "2"
+            - "0"
+          securityContext:
+            capabilities:
+              add:
+                - NET_ADMIN
+          volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia/lib64
+              readOnly: true
+            - name: tcpx-socket
+              mountPath: /run/tcpx
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+          env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+        {% endif %}
+        {% if k8s_enable_gpudirect_tcpxo %}
+        - name: tcpxo-daemon
+          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
+          imagePullPolicy: Always
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -ex
+              chmod 755 /fts/entrypoint_rxdm_container.sh
+              /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+          securityContext:
+            capabilities:
+              add:
+                - NET_ADMIN
+                - NET_BIND_SERVICE
+          volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+          env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+        {% endif %}
 
        {% if high_availability %}
          pvc_spec:
sky/users/permission.py CHANGED
@@ -34,6 +34,11 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
+        self.enforcer = None
+
+    def _lazy_initialize(self):
+        if self.enforcer is not None:
+            return
         with _policy_lock():
             global _enforcer_instance
             if _enforcer_instance is None:
@@ -73,7 +78,6 @@ class PermissionService:
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
-        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
@@ -152,6 +156,7 @@ class PermissionService:
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -171,6 +176,7 @@ class PermissionService:
 
     def delete_user(self, user_id: str) -> None:
         """Delete user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -184,6 +190,7 @@ class PermissionService:
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -216,6 +223,7 @@ class PermissionService:
         Returns:
             A list of role names that the user has.
         """
+        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -228,6 +236,7 @@ class PermissionService:
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
+        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -236,6 +245,7 @@ class PermissionService:
 
     def load_policy(self):
         """Load policy from storage with lock."""
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -251,6 +261,7 @@ class PermissionService:
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
+        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -307,6 +318,7 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -324,6 +336,7 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -337,6 +350,7 @@ class PermissionService:
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
+        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
sky/users/token_service.py CHANGED
@@ -5,6 +5,7 @@ import datetime
 import hashlib
 import os
 import secrets
+import threading
 from typing import Any, Dict, Generator, Optional
 
 import filelock
@@ -44,12 +45,21 @@
     """Service for managing JWT-based service account tokens."""
 
     def __init__(self):
-        self.secret_key =
+        self.secret_key = None
+        self.init_lock = threading.Lock()
+
+    def _lazy_initialize(self):
+        if self.secret_key is not None:
+            return
+        with self.init_lock:
+            if self.secret_key is not None:
+                return
+            self.secret_key = self._get_or_generate_secret()
 
     def _get_or_generate_secret(self) -> str:
         """Get JWT secret from database or generate a new one."""
-
-
+
+        def _get_secret_from_db():
             try:
                 db_secret = global_user_state.get_system_config(
                     JWT_SECRET_DB_KEY)
@@ -58,7 +68,17 @@
                 return db_secret
             except Exception as e:  # pylint: disable=broad-except
                 logger.debug(f'Failed to get JWT secret from database: {e}')
+                return None
+
+        # Try to get from database (persistent across deployments)
+        token_from_db = _get_secret_from_db()
+        if token_from_db:
+            return token_from_db
 
+        with _jwt_secret_lock():
+            token_from_db = _get_secret_from_db()
+            if token_from_db:
+                return token_from_db
            # Generate a new secret and store in database
            new_secret = secrets.token_urlsafe(64)
            try:
@@ -91,6 +111,7 @@
         Returns:
             Dict containing token info including the JWT token
         """
+        self._lazy_initialize()
         now = datetime.datetime.now(datetime.timezone.utc)
         token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT
 
@@ -144,6 +165,7 @@
         Returns:
             Decoded token payload or None if invalid
         """
+        self._lazy_initialize()
         if not token.startswith('sky_'):
             return None
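
token_service.py applies the same lazy-initialization idea but guards it with a lock: the unlocked check is the fast path, and the check is repeated under init_lock so the secret is created exactly once even if several threads hit the first request together (double-checked locking). A self-contained sketch:

# Self-contained sketch of the double-checked locking used above;
# the secrets call stands in for _get_or_generate_secret().
import secrets
import threading


class SecretHolder:

    def __init__(self):
        self.secret_key = None
        self.init_lock = threading.Lock()

    def _lazy_initialize(self):
        if self.secret_key is not None:
            return  # Fast path: no lock once initialized.
        with self.init_lock:
            if self.secret_key is not None:
                return  # Another thread initialized while we waited.
            self.secret_key = secrets.token_urlsafe(64)  # Runs exactly once.

    def get_secret(self):
        self._lazy_initialize()
        return self.secret_key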
{skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250710
+Version: 1.0.0.dev20250712
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -67,6 +67,7 @@ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
 Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
 Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
 Requires-Dist: msgraph-sdk; extra == "azure"
+Requires-Dist: msrestazure; extra == "azure"
 Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
 Provides-Extra: gcp
 Requires-Dist: google-api-python-client>=2.69.0; extra == "gcp"
@@ -140,6 +141,7 @@ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
 Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
 Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
 Requires-Dist: msgraph-sdk; extra == "all"
+Requires-Dist: msrestazure; extra == "all"
 Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
 Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
 Requires-Dist: google-cloud-storage; extra == "all"