skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -4
- sky/clouds/kubernetes.py +137 -23
- sky/core.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
- sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-60070a62f55486a6.js} +1 -1
- sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
- sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +10 -11
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +7 -0
- sky/jobs/server/core.py +2 -1
- sky/jobs/server/utils.py +81 -0
- sky/jobs/state.py +58 -40
- sky/jobs/utils.py +45 -6
- sky/provision/kubernetes/instance.py +17 -0
- sky/provision/kubernetes/utils.py +134 -0
- sky/provision/provisioner.py +20 -0
- sky/skylet/constants.py +1 -6
- sky/skylet/job_lib.py +30 -8
- sky/skypilot_config.py +8 -3
- sky/task.py +17 -0
- sky/templates/kubernetes-ray.yml.j2 +298 -10
- sky/users/permission.py +18 -1
- sky/users/token_service.py +25 -3
- sky/utils/common_utils.py +13 -0
- sky/utils/db_utils.py +16 -0
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +2 -4
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +55 -54
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
- /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -293,13 +293,82 @@ available_node_types:
           kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
           kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
         {% endif %}
-        {% if k8s_kueue_local_queue_name %}
+        {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
         annotations:
-          kueue.x-k8s.io/retriable-in-group: "false"
-          kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
-          {% if k8s_max_run_duration_seconds %}
-          provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
-          {% endif %}
+          {% if k8s_kueue_local_queue_name %}
+          kueue.x-k8s.io/retriable-in-group: "false"
+          kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
+          {% if k8s_max_run_duration_seconds %}
+          provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
+          {% endif %}
+          {% endif %}
+          # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
+          # Values from google cloud guide
+          {% if k8s_enable_gpudirect_tcpx %}
+          devices.gke.io/container.tcpx-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          devices.gke.io/container.tcpxo-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+            - path: /dev/dmabuf_import_helper
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"},
+              {"interfaceName":"eth5","network":"vpc5"},
+              {"interfaceName":"eth6","network":"vpc6"},
+              {"interfaceName":"eth7","network":"vpc7"},
+              {"interfaceName":"eth8","network":"vpc8"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"gvnic-1"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"},
+              {"interfaceName":"eth6","network":"rdma-4"},
+              {"interfaceName":"eth7","network":"rdma-5"},
+              {"interfaceName":"eth8","network":"rdma-6"},
+              {"interfaceName":"eth9","network":"rdma-7"}
+            ]
+          {% endif %}
         {% endif %}
       spec:
         # serviceAccountName: skypilot-service-account
@@ -396,6 +465,41 @@ available_node_types:
           persistentVolumeClaim:
             claimName: {{volume_mount.volume_name_on_cloud}}
         {% endfor %}
+        {% if k8s_enable_gpudirect_tcpx %}
+        - name: libraries
+          hostPath:
+            path: /home/kubernetes/bin/nvidia/lib64
+        - name: tcpx-socket
+          emptyDir: {}
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: proc-sys
+          hostPath:
+            path: /proc/sys
+        {% endif %}
+        {% if k8s_enable_gpudirect_tcpxo %}
+        - name: libraries
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: proc-sys
+          hostPath:
+            path: /proc/sys
+        - name: aperture-devices
+          hostPath:
+            path: /dev/aperture_devices
+        {% endif %}
+        {% if k8s_enable_gpudirect_rdma %}
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        {% endif %}
         containers:
         - name: ray-node
           imagePullPolicy: Always
@@ -409,6 +513,113 @@ available_node_types:
           - name: {{ key }}
             value: {{ value }}
           {% endfor %}
+          # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
+          # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
+          {% if k8s_enable_gpudirect_tcpx %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
+          - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
+            value: eth1,eth2,eth3,eth4
+          - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
+            value: eth0
+          - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
+            value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+          - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
+            value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+          - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
+            value: "500000"
+          - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
+            value: "/tmp"
+          - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
+            value: "0"
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_ALGO
+            value: Ring
+          - name: NCCL_PROTO
+            value: Simple
+          - name: NCCL_NSOCKS_PERTHREAD
+            value: "4"
+          - name: NCCL_SOCKET_NTHREADS
+            value: "1"
+          - name: NCCL_NET_GDR_LEVEL
+            value: PIX
+          - name: NCCL_DYNAMIC_CHUNK_SIZE
+            value: "524288"
+          - name: NCCL_P2P_PXN_LEVEL
+            value: "0"
+          - name: NCCL_P2P_NET_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_P2P_PCI_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_P2P_NVL_CHUNKSIZE
+            value: "1048576"
+          - name: NCCL_BUFFSIZE
+            value: "4194304"
+          - name: NCCL_MAX_NCHANNELS
+            value: "8"
+          - name: NCCL_MIN_NCHANNELS
+            value: "8"
+          - name: CUDA_VISIBLE_DEVICES
+            value: "0,1,2,3,4,5,6,7"
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64
+          - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+            value: /dev/aperture_devices
+          - name: NCCL_FASTRAK_CTRL_DEV
+            value: eth0
+          - name: NCCL_FASTRAK_IFNAME
+            value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_ALGO
+            value: Ring,Tree
+          - name: NCCL_PROTO
+            value: Simple,LL128
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: NCCL_TUNER_PLUGIN
+            value: libnccl-tuner.so
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
+          - name: CUDA_VISIBLE_DEVICES
+            value: "0,1,2,3,4,5,6,7"
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          - name: LD_LIBRARY_PATH
+            value: /usr/local/nvidia/lib64
+          - name: NCCL_NET
+            value: gIB
+          - name: NCCL_CROSS_NIC
+            value: "0"
+          - name: NCCL_NET_GDR_LEVEL
+            value: PIX
+          - name: NCCL_P2P_NET_CHUNKSIZE
+            value: "131072"
+          - name: NCCL_NVLS_CHUNKSIZE
+            value: "524288"
+          - name: NCCL_IB_ADAPTIVE_ROUTING
+            value: "1"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "4"
+          - name: NCCL_IB_TC
+            value: "52"
+          - name: NCCL_IB_FIFO_TC
+            value: "84"
+          {% if k8s_enable_gpudirect_rdma_a4 %}
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/gib/configs/tuner_config_a4.txtpb
+          {% else %}
+          - name: NCCL_TUNER_CONFIG_PATH
+            value: /usr/local/gib/configs/tuner_config_a3u.txtpb
+          {% endif %}
+          {% endif %}
          {% if k8s_fuse_device_required %}
          - name: FUSERMOUNT_SHARED_DIR
            value: {{k8s_fusermount_shared_dir}}
@@ -752,11 +963,27 @@ available_node_types:
           - name: secret-volume
             readOnly: true
             mountPath: "/etc/secret-volume"
-          # This volume allocates shared memory for Ray to use for its plasma
-          # object store. If you do not provide this, Ray will fall back to
-          # /tmp which cause slowdowns if is not a shared memory volume.
           - mountPath: /dev/shm
             name: dshm
+          {% if k8s_enable_gpudirect_tcpx %}
+          - name: tcpx-socket
+            mountPath: /tmp
+          - name: libraries
+            mountPath: /usr/local/nvidia/lib64
+            readOnly: true
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: libraries
+            mountPath: /usr/local/nvidia
+          - name: aperture-devices
+            mountPath: /dev/aperture_devices
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          - name: library-dir-host
+            mountPath: /usr/local/nvidia
+          - name: gib
+            mountPath: /usr/local/gib
+          {% endif %}
           {% if high_availability %}
           - name: {{k8s_high_availability_deployment_volume_mount_name}}
             mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -794,7 +1021,68 @@ available_node_types:
               add:
               - IPC_LOCK
             {% endif %}
-
+          {% if k8s_enable_gpudirect_tcpx %}
+          # GPUDirect TCPX daemon sidecar container
+          - name: tcpx-daemon
+            image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
+            imagePullPolicy: Always
+            command:
+            - /tcpgpudmarxd/build/app/tcpgpudmarxd
+            - --gpu_nic_preset
+            - a3vm
+            - --gpu_shmem_type
+            - fd
+            - --uds_path
+            - /run/tcpx
+            - --setup_param
+            - --verbose
+            - "128"
+            - "2"
+            - "0"
+            securityContext:
+              capabilities:
+                add:
+                - NET_ADMIN
+            volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia/lib64
+              readOnly: true
+            - name: tcpx-socket
+              mountPath: /run/tcpx
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+            env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: tcpxo-daemon
+            image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
+            imagePullPolicy: Always
+            command: ["/bin/sh", "-c"]
+            args:
+            - |
+              set -ex
+              chmod 755 /fts/entrypoint_rxdm_container.sh
+              /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+            securityContext:
+              capabilities:
+                add:
+                - NET_ADMIN
+                - NET_BIND_SERVICE
+            volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+            env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+          {% endif %}
 
         {% if high_availability %}
         pvc_spec:
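
The template changes above gate every GPUDirect block behind Jinja2 flags (k8s_enable_gpudirect_tcpx, k8s_enable_gpudirect_tcpxo, k8s_enable_gpudirect_rdma) that SkyPilot fills in at render time; the annotation and NCCL values themselves come from the Google Cloud GPUDirect guides linked in the template. A minimal sketch of that rendering mechanism, using the real flag name from the diff but an otherwise made-up harness and trimmed template fragment:

    import jinja2

    # Illustrative fragment in the style of kubernetes-ray.yml.j2; only the
    # flag name is taken from the diff.
    TEMPLATE = """\
    metadata:
      {% if k8s_enable_gpudirect_tcpx %}
      annotations:
        networking.gke.io/default-interface: 'eth0'
      {% endif %}
    """

    def render(enable_tcpx: bool) -> str:
        # StrictUndefined makes a typo in a flag name fail loudly instead of
        # silently rendering the block away.
        env = jinja2.Environment(undefined=jinja2.StrictUndefined)
        return env.from_string(TEMPLATE).render(
            k8s_enable_gpudirect_tcpx=enable_tcpx)

    print(render(True))   # annotations block present
    print(render(False))  # annotations block omitted

sky/users/permission.py
CHANGED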
@@ -15,6 +15,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
 from sky.utils import common_utils
+from sky.utils import db_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
@@ -33,11 +34,18 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
+        self.enforcer = None
+
+    def _lazy_initialize(self):
+        if self.enforcer is not None:
+            return
         with _policy_lock():
             global _enforcer_instance
             if _enforcer_instance is None:
                 _enforcer_instance = self
                 engine = global_user_state.initialize_and_get_db()
+                db_utils.add_tables_to_db_sqlalchemy(
+                    sqlalchemy_adapter.Base.metadata, engine)
                 adapter = sqlalchemy_adapter.Adapter(engine)
                 model_path = os.path.join(os.path.dirname(__file__),
                                           'model.conf')
@@ -70,7 +78,6 @@ class PermissionService:
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
-        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
@@ -149,6 +156,7 @@ class PermissionService:
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -168,6 +176,7 @@ class PermissionService:
 
     def delete_user(self, user_id: str) -> None:
         """Delete user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -181,6 +190,7 @@ class PermissionService:
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
+        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -213,6 +223,7 @@ class PermissionService:
         Returns:
             A list of role names that the user has.
         """
+        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -225,6 +236,7 @@ class PermissionService:
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
+        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -233,6 +245,7 @@ class PermissionService:
 
     def load_policy(self):
         """Load policy from storage with lock."""
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -248,6 +261,7 @@ class PermissionService:
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
+        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -304,6 +318,7 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -321,6 +336,7 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
+        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -334,6 +350,7 @@ class PermissionService:
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
+        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
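
The recurring self._lazy_initialize() calls above implement deferred construction: PermissionService.__init__ becomes trivial, and the casbin enforcer (plus its tables, via the new db_utils.add_tables_to_db_sqlalchemy call) is built on first use. A minimal sketch of the pattern with illustrative names; the real code guards the build with its file-based _policy_lock() rather than a threading lock:

    import threading
    from typing import Optional

    class LazyService:
        """Build an expensive resource on first use, not at import time."""

        def __init__(self) -> None:
            self._resource: Optional[object] = None
            self._lock = threading.Lock()

        def _lazy_initialize(self) -> None:
            if self._resource is not None:  # fast path: already built
                return
            with self._lock:
                if self._resource is None:  # re-check after winning the lock
                    self._resource = self._build()

        def _build(self) -> object:
            # Stand-in for constructing the enforcer and creating DB tables.
            return object()

        def query(self) -> object:
            self._lazy_initialize()  # every public entry point starts here
            return self._resource

The cost of the pattern is that every public method must remember to call the initializer first, which is exactly the mechanical change the diff makes.

sky/users/token_service.py
CHANGED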
@@ -5,6 +5,7 @@ import datetime
 import hashlib
 import os
 import secrets
+import threading
 from typing import Any, Dict, Generator, Optional
 
 import filelock
@@ -44,12 +45,21 @@ class TokenService:
     """Service for managing JWT-based service account tokens."""
 
     def __init__(self):
-        self.secret_key = self._get_or_generate_secret()
+        self.secret_key = None
+        self.init_lock = threading.Lock()
+
+    def _lazy_initialize(self):
+        if self.secret_key is not None:
+            return
+        with self.init_lock:
+            if self.secret_key is not None:
+                return
+            self.secret_key = self._get_or_generate_secret()
 
     def _get_or_generate_secret(self) -> str:
         """Get JWT secret from database or generate a new one."""
-        # Try to get from database (persistent across deployments)
-
+
+        def _get_secret_from_db():
             try:
                 db_secret = global_user_state.get_system_config(
                     JWT_SECRET_DB_KEY)
@@ -58,7 +68,17 @@ class TokenService:
                 return db_secret
             except Exception as e:  # pylint: disable=broad-except
                 logger.debug(f'Failed to get JWT secret from database: {e}')
+            return None
+
+        # Try to get from database (persistent across deployments)
+        token_from_db = _get_secret_from_db()
+        if token_from_db:
+            return token_from_db
 
+        with _jwt_secret_lock():
+            token_from_db = _get_secret_from_db()
+            if token_from_db:
+                return token_from_db
             # Generate a new secret and store in database
             new_secret = secrets.token_urlsafe(64)
             try:
@@ -91,6 +111,7 @@ class TokenService:
         Returns:
             Dict containing token info including the JWT token
         """
+        self._lazy_initialize()
         now = datetime.datetime.now(datetime.timezone.utc)
         token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT
 
@@ -144,6 +165,7 @@ class TokenService:
         Returns:
             Decoded token payload or None if invalid
         """
+        self._lazy_initialize()
         if not token.startswith('sky_'):
             return None
 
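
token_service.py applies the same lazy-initialization idea to the JWT secret and adds a get-or-create dance: read the secret without a lock, and only if it is absent take the lock, re-read, and generate. A condensed sketch with an in-memory stand-in for the system config table (the real code reads through global_user_state and uses a filelock-based _jwt_secret_lock()):

    import secrets
    import threading

    _lock = threading.Lock()  # stand-in for _jwt_secret_lock()
    _store = {}               # stand-in for the system config table

    def _get_secret_from_db():
        return _store.get('jwt_secret')

    def get_or_generate_secret() -> str:
        secret = _get_secret_from_db()
        if secret:                 # common case: no lock taken at all
            return secret
        with _lock:
            secret = _get_secret_from_db()
            if secret:             # another worker generated it first
                return secret
            new_secret = secrets.token_urlsafe(64)
            _store['jwt_secret'] = new_secret
            return new_secret

The re-read under the lock is what keeps multiple API server workers from each minting their own secret and invalidating one another's tokens.

sky/utils/common_utils.py
CHANGED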
@@ -11,6 +11,7 @@ import platform
 import random
 import re
 import socket
+import subprocess
 import sys
 import time
 import typing
@@ -87,6 +88,18 @@ def generate_user_hash() -> str:
     return user_hash
 
 
+def get_git_commit(path: Optional[str] = None) -> Optional[str]:
+    try:
+        result = subprocess.run(['git', 'rev-parse', 'HEAD'],
+                                capture_output=True,
+                                text=True,
+                                cwd=path,
+                                check=True)
+        return result.stdout.strip()
+    except subprocess.CalledProcessError:
+        return None
+
+
 def get_user_hash() -> str:
     """Returns a unique user-machine specific hash as a user id.
 
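
The new get_git_commit helper shells out to git rev-parse HEAD and swallows a non-zero exit. Illustrative usage; note that only CalledProcessError is caught, so a missing git binary would still surface as FileNotFoundError:

    from sky.utils import common_utils

    # Returns the commit hash when run inside a git checkout, None otherwise.
    commit = common_utils.get_git_commit()
    print(commit or 'not inside a git repository')

sky/utils/db_utils.py
CHANGED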
@@ -84,6 +84,22 @@ def add_column_to_table(
     conn.commit()
 
 
+def add_tables_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+):
+    """Add tables to the database."""
+    for table in metadata.tables.values():
+        try:
+            table.create(bind=engine, checkfirst=True)
+        except (sqlalchemy_exc.OperationalError,
+                sqlalchemy_exc.ProgrammingError) as e:
+            if 'already exists' in str(e):
+                pass
+            else:
+                raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
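
add_tables_to_db_sqlalchemy iterates a SQLAlchemy MetaData and creates each table with checkfirst=True, tolerating "already exists" races. A small self-contained sketch of how it is meant to be called; permission.py passes the casbin adapter's metadata, while here a throwaway table and an in-memory SQLite engine stand in:

    import sqlalchemy

    from sky.utils import db_utils

    metadata = sqlalchemy.MetaData()
    sqlalchemy.Table(
        'example', metadata,
        sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True))

    engine = sqlalchemy.create_engine('sqlite://')

    db_utils.add_tables_to_db_sqlalchemy(metadata, engine)
    db_utils.add_tables_to_db_sqlalchemy(metadata, engine)  # idempotent no-op

checkfirst=True already skips existing tables within one process; the extra "already exists" guard covers the window where two processes race past the existence check.

sky/utils/schemas.py
CHANGED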
@@ -870,6 +870,9 @@ def get_task_schema():
             'type': 'array',
             'items': get_volume_mount_schema(),
         },
+        '_metadata': {
+            'type': 'object',
+        },
         **_experimental_task_schema(),
     }
 }
@@ -1103,6 +1106,9 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
             },
         },
     },
+    'remote_identity': {
+        'type': 'string',
+    }
 }
 
 
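
The schema additions are small: tasks may now carry an opaque _metadata object, and Kubernetes context configs accept a remote_identity string. A hypothetical check with the jsonschema package (which SkyPilot uses for validation) against a trimmed-down copy of the new property:

    import jsonschema

    schema = {
        'type': 'object',
        'properties': {
            '_metadata': {'type': 'object'},
        },
    }

    # An object-shaped _metadata passes; anything else is rejected.
    jsonschema.validate({'_metadata': {'note': 'example'}}, schema)
    try:
        jsonschema.validate({'_metadata': 'not-an-object'}, schema)
    except jsonschema.exceptions.ValidationError as e:
        print(f'rejected: {e.message}')

sky/utils/ux_utils.py
CHANGED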
@@ -253,9 +253,7 @@ def command_hint_messages(hint_type: CommandHintType,
             f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
             f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
             f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
-            f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
-            f'{BOLD}sky jobs queue{RESET_BOLD}'
-            f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
-            f'{BOLD}sky jobs dashboard{RESET_BOLD}')
+            f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
+            f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')