xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
integration/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
2
|
+
integration/docker_manager_test.py,sha256=J2xijy6crRtrwQXrEvtOEY7mo1kEJYhcIYMZ7w0OGa4,2514
|
|
3
|
+
integration/gcluster_a3mega_test.py,sha256=KRxgAsTbo6QBVvhhaEHSis_EvTPi2B-ZIlhEoGClcFs,6046
|
|
4
|
+
integration/gcluster_a3ultra_test.py,sha256=8wEtlQN1_uIBUsidvH_l7Ab-ikDpnABrlu9k1TIBz4Q,5846
|
|
5
|
+
integration/gcluster_a4_test.py,sha256=GCe6BujHCvM62kIGOd-9Wvz-IrR0BY5d83bGD1cmsQ0,5754
|
|
6
|
+
integration/gcluster_test.py,sha256=3GSOMszzNW6Yr4T4PFIpmszonwDAAGpSdKutUA77O-g,3304
|
|
7
|
+
xpk/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
|
|
8
|
+
xpk/main.py,sha256=7YBpzpHxV61c2Js0-uUHUXxTW7cWhLLdMc263rYRW3E,2416
|
|
9
|
+
xpk/api/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
10
|
+
xpk/api/storage_crd.yaml,sha256=r4WFXnSJJ25EUF-t4Ljfbl-cJoSaiFiZkP8451eTub4,1260
|
|
11
|
+
xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
12
|
+
xpk/commands/batch.py,sha256=Cj1bDpzPMoPdhaKKrOJJLJ3JzRvJrCMn8huQoHHIZJI,4192
|
|
13
|
+
xpk/commands/cluster.py,sha256=CG4Ieg3QADNUAwyPzZh0P3mop4Pv1DILrNMJVLvgbi8,41682
|
|
14
|
+
xpk/commands/cluster_gcluster.py,sha256=MOxQfQ19sxaDtDBfIzUdrxw4FpboDIiGFKEtFLwfEgQ,13080
|
|
15
|
+
xpk/commands/cluster_gcluster_test.py,sha256=MoMlHbHnCI4ZrnMai1Zw71IP1ORxVVh23cMD5qgXOtQ,6136
|
|
16
|
+
xpk/commands/cluster_test.py,sha256=tT1oEUXqNkzElgHZ-UXJGkJEGyakuaPb1cLHjV-eeM0,2800
|
|
17
|
+
xpk/commands/common.py,sha256=nxLKPhXuAMW7wq-5xL2YeOgDTMUDcLGMujatOUHJ3s4,2504
|
|
18
|
+
xpk/commands/config.py,sha256=gFNkf3ibsvZmcPpkpKXe-KJmHO5IKucNwLCXNgKvaDc,836
|
|
19
|
+
xpk/commands/info.py,sha256=uhv5mPfgg9N-5JhQw4dT2jujL9ZC5kzGA18h9NFfm5A,7429
|
|
20
|
+
xpk/commands/inspector.py,sha256=JruseZl9ZIlR9-Lv_pn8YHfLdyiMgHHSf7xPGBAtXTM,12616
|
|
21
|
+
xpk/commands/job.py,sha256=rPIfWvgm5mLz7K7YDLK721ZcUcg5OEmYVAPAtRtB5Ag,6718
|
|
22
|
+
xpk/commands/kind.py,sha256=NM2CoKJXrYtY9DHZ2yM097WGcI1kno_G2b23tlLVIEI,7658
|
|
23
|
+
xpk/commands/kjob_common.py,sha256=bRaORiGVjPAdN0T3aRmbcQgXYe-EtjoVKePdWzQ5xU4,1928
|
|
24
|
+
xpk/commands/run.py,sha256=D0zgmnGeBLATphYhzQj29EScxrMmAKqPRhP6nfWuYcY,4085
|
|
25
|
+
xpk/commands/shell.py,sha256=mRHMwm3Izzsue4bocekm82Rg_cPUaGMClSlvNzNXQ-o,4467
|
|
26
|
+
xpk/commands/storage.py,sha256=90nz84_ut-uwdIVrxnLYq0K4uYpZWncKM-ZZ_2bRzcI,11505
|
|
27
|
+
xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
|
|
28
|
+
xpk/commands/workload.py,sha256=S6g2B8bzhM_XNGY4NJUYVODcIuvutKQmdl0VV7jwGzw,29239
|
|
29
|
+
xpk/commands/workload_test.py,sha256=e6D1j2Akewq50_76oD84gSCdcW44PEOsIxLPjaL11s8,2344
|
|
30
|
+
xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
31
|
+
xpk/core/capacity.py,sha256=ohrVvguNwSv7HufdhAhp9dEtf49QT9eXPaPuI9BuAck,7401
|
|
32
|
+
xpk/core/cluster.py,sha256=H5-x0xYijQbgINbAIXovu6JoR-9iKnuVQrMVy1xrEP0,26359
|
|
33
|
+
xpk/core/cluster_private.py,sha256=RLi0C7bV0NEUXl6QKQzvUT0weN9EdqPvjuuOQsNO0DY,6868
|
|
34
|
+
xpk/core/commands.py,sha256=mtkT_ZsuHj5HTvFAj7mMd8kMXsWoWSA4zqHjlRiFL4o,10369
|
|
35
|
+
xpk/core/config.py,sha256=G7MKXARftcuoKSV90F40HXh7kYj1ePde8O8dCp95QoI,3407
|
|
36
|
+
xpk/core/config_test.py,sha256=v1qfyFRzLkYSQ7Wn4nx1N0dBSOFXidLWDfhkeHDZOVM,1847
|
|
37
|
+
xpk/core/docker_container.py,sha256=GvkCJ2S5UKn8uh3pZhRd3X7iS0-PsQpRO8l7QhywVGc,7604
|
|
38
|
+
xpk/core/docker_image.py,sha256=MIU397IGIPwkTZFK-ZGEWuc3RmUIF3sQQZUiUj2gLqA,6775
|
|
39
|
+
xpk/core/docker_manager.py,sha256=JBFgyD6O7LKwEHJC7YuSoCDZqrFRtb-LjgWNqkfAbR0,10566
|
|
40
|
+
xpk/core/docker_resources.py,sha256=_aKgpUjyJB2krQ1PkHrotB7K4kByLmPLbuvl_UVvuX8,12843
|
|
41
|
+
xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
|
|
42
|
+
xpk/core/gcloud_context.py,sha256=xZdVoRNLlE-kwXY5djoyQ0I0-KEh5nAohrVql7Jl42k,6649
|
|
43
|
+
xpk/core/gcloud_context_test.py,sha256=YY0R6j-m62coVK2MAjWXDIdxdP6J5yn6R1RiTDkuExQ,2719
|
|
44
|
+
xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
|
|
45
|
+
xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
|
|
46
|
+
xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
|
|
47
|
+
xpk/core/kjob.py,sha256=symYO3DQfSNP6MAJE54QZuCPDF2kseaxZ-_mmsoBQjo,14478
|
|
48
|
+
xpk/core/kueue_manager.py,sha256=TFdl33N-b-eSrOrsnJueP1H4qYWpkX_lFDhLSVvfeGM,13292
|
|
49
|
+
xpk/core/kueue_manager_test.py,sha256=PtsSSfwtdk-2wK_dr_OQb8oYWVce5nmDN6iowND_3b4,20062
|
|
50
|
+
xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
|
|
51
|
+
xpk/core/mtc.py,sha256=pO7p3l-EzLFdTE8MdwWV8i0Zu-7epGql_kPoksVofIU,6259
|
|
52
|
+
xpk/core/nap.py,sha256=uA33XccGjEF5RZRO5IpUMMzCf-u6D73cwwPEj4q1qvc,12820
|
|
53
|
+
xpk/core/network.py,sha256=Oulb7U69lWkpOKxOC1C7ekJDpC51TLwd7XdZA3NQ7E0,10505
|
|
54
|
+
xpk/core/nodepool.py,sha256=LfS_RvKHmS7f97hc_UidUC13FFEyGvfageSlt8d-5hw,23227
|
|
55
|
+
xpk/core/nodepool_test.py,sha256=YNvp8WXznAI8DscrN9-BSnqII2AtRju2guxrobvCH8A,8805
|
|
56
|
+
xpk/core/pathways.py,sha256=s-h_ofMrbFn3J6NFmT5OMe_HiUQIkI90ty7xbS05iA8,10710
|
|
57
|
+
xpk/core/ray.py,sha256=JWhc_ToRHpF4_URGnuE_47FMgamaRsA4KVUMpqThWzw,6145
|
|
58
|
+
xpk/core/resources.py,sha256=HlYNPQlaJa5y-pb70aVJzdiiOLDnoACF-wJXePM4ejs,8077
|
|
59
|
+
xpk/core/scheduling.py,sha256=_o8QkQxVM-8z5K5ATslM_qQ87f5dzYE0ZeBcwQ6Oqic,9702
|
|
60
|
+
xpk/core/scheduling_test.py,sha256=m9KcglAbg0qly095PmrUOZxJYUE2UeQmkBNIWn5nFyk,979
|
|
61
|
+
xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
|
|
62
|
+
xpk/core/system_characteristics.py,sha256=qHgqyQF6RVjU059YU0eB66_ia74tV-ogDyZbqCTUfw0,22589
|
|
63
|
+
xpk/core/system_characteristics_test.py,sha256=MWvbaEQRoepYNZKOQcs73ppcTKW8plYQ9FQBotfA27I,2250
|
|
64
|
+
xpk/core/vertex.py,sha256=orIZAVwZruRJQ6-vgc1wShuTsiipdH-zHQ9O4ie_HSA,3638
|
|
65
|
+
xpk/core/workload.py,sha256=6TVZM15n8W7046VgmmH9Jv54MrhExtLQH3GaiwlV8Xs,8959
|
|
66
|
+
xpk/core/workload_test.py,sha256=tVTvrwDRXD3O1GCoftgEBWilCYTN74ayP1KRP0vptx0,857
|
|
67
|
+
xpk/core/blueprint/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
68
|
+
xpk/core/blueprint/blueprint_definitions.py,sha256=5i331XA-2yP_ALyB6XU5tP2Tf9iHcIX5g0TilxQi8zE,1800
|
|
69
|
+
xpk/core/blueprint/blueprint_generator.py,sha256=VWhp89upY7vqVFUqcpWR6zwTBoFUSahX2sQxXEdUpSk,36393
|
|
70
|
+
xpk/core/blueprint/blueprint_test.py,sha256=T058Dq-x4wQQqjs33BWhjHdT4qJLDwsIcc7vareHh_c,7204
|
|
71
|
+
xpk/core/blueprint/testing/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
72
|
+
xpk/core/remote_state/__init__.py,sha256=PkV8D9WOtlJHH5AIxsQaKeIBcmupT_Ol_bwJgN6G2I8,561
|
|
73
|
+
xpk/core/remote_state/fuse_remote_state.py,sha256=3Dx4ZZd0NFF5-MlqGWHzz8H4bjYiPOWdF_YSEnKUPQ8,3246
|
|
74
|
+
xpk/core/remote_state/remote_state_client.py,sha256=6PcR92Xy_RMjlF4AscanQ1jXNHnewLWGNC2v53jbzD4,1077
|
|
75
|
+
xpk/core/workload_decorators/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
76
|
+
xpk/core/workload_decorators/rdma_decorator.py,sha256=isbgPnjdu2AT_Da1nVUIRoGE_qZ7jMDOKCgZOLq5r2A,4006
|
|
77
|
+
xpk/core/workload_decorators/storage_decorator.py,sha256=DDYQVO1OKTLhveDOA4V6b2RWr4n0fbwHdnoFFmW7iaQ,2000
|
|
78
|
+
xpk/core/workload_decorators/tcpx_decorator.py,sha256=m5EgzEHjbcOD13ygY91mQdhwQt4Gr5PyalVkKcHyeV8,5975
|
|
79
|
+
xpk/core/workload_decorators/tcpx_decorator_test.py,sha256=iTBS3X_-VwA2oveNDjscduLtll0VOJyFRCp4xmsjg7w,8515
|
|
80
|
+
xpk/core/workload_decorators/tcpxo_decorator.py,sha256=_nLX7tbnxhnS-xv4Jijd1JOP76V4LpNCfW3Np404Cqw,6537
|
|
81
|
+
xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
82
|
+
xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
|
|
83
|
+
xpk/parser/cluster.py,sha256=1cc2pCMbceF6Lx6MXINlgEfiIRUYMITha3eCE6R8SrI,30708
|
|
84
|
+
xpk/parser/cluster_test.py,sha256=h7lILF2V37WzQSZjGD2jCqI1iWJNY7BLODZf1uhK9QA,1806
|
|
85
|
+
xpk/parser/common.py,sha256=1wihGYBe01ci4quZnkeR60-u_lapSOG4WhRFrE3yFR8,8108
|
|
86
|
+
xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
|
|
87
|
+
xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
|
|
88
|
+
xpk/parser/info.py,sha256=UJohxVVWdt9IgUXoPsrVae2DN1BjAVGWrSN2ajrB8RQ,1860
|
|
89
|
+
xpk/parser/inspector.py,sha256=hAPAZ2k9iSJgC1mjnz3rMleInsAQ8PmkyyUKFyBmsgY,1997
|
|
90
|
+
xpk/parser/job.py,sha256=5RdE70rucGfrsn65l7Ho6RmO06mag1S0AO-3saVuXyw,4328
|
|
91
|
+
xpk/parser/kind.py,sha256=sgPCqNVrgmFLcOBEbhlaphwVXxMh_opP9ntCq4KPePE,2682
|
|
92
|
+
xpk/parser/run.py,sha256=oi_ksSyJ8Ooffe2EgoV_ecpmXEmNGVotjpIQH-HjufE,1481
|
|
93
|
+
xpk/parser/shell.py,sha256=VC8p-kz9XjJZW9DXZ-rnv41XnRDRpQRFywHpB5j7tfc,1970
|
|
94
|
+
xpk/parser/storage.py,sha256=XNynqulEzTmT8_G6wkeBwfXX0XQ1lsd6BFcx0H6rGfU,9971
|
|
95
|
+
xpk/parser/validators.py,sha256=-NBZelvfwZRzjz-YUCreD8EzMLHll8PZM-d-MVm2PG4,1192
|
|
96
|
+
xpk/parser/version.py,sha256=eJo4PAbbmRQZulgKBs_ytbVgV9zAaaXeNzMMxmgFMVY,769
|
|
97
|
+
xpk/parser/workload.py,sha256=oI6y66Old_z6PxFPTZ3LvektUsOZ7U8f6g6cscuzh9g,27208
|
|
98
|
+
xpk/parser/workload_test.py,sha256=t0aAqEsAMz1U3xTMrBcm773d9kwYOcvLtQk5n4jqWPw,2075
|
|
99
|
+
xpk/templates/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
|
|
100
|
+
xpk/templates/cluster_preheat.yaml.j2,sha256=1e8jYagQE6O7BjAfuwmEqGG1b8AOsLRlQm4V68ZnGNs,721
|
|
101
|
+
xpk/templates/filestore-pv.yaml,sha256=FxKZkAXa2czIYblq77iewQjCjOjs-FptuF3YLOByfLo,316
|
|
102
|
+
xpk/templates/filestore-pvc.yaml,sha256=Rf80UNYs3XTUdOJuWCeFq80TKXq5FhfafRE89hq7y9o,161
|
|
103
|
+
xpk/templates/filestore-sc.yaml,sha256=vHzcU7jk0B5z7EgTfQmMM1m2TIzR70ny6MvthyYdqhE,213
|
|
104
|
+
xpk/templates/fuse-pv.yaml,sha256=-CM6AYAy4HKOErd9ogiM-6vWIuWS5yXoXyBAp2EoZsM,321
|
|
105
|
+
xpk/templates/fuse-pvc.yaml,sha256=heGWvRIetukAI9pH9auXxnU2H-G_8pL_wRIzoWDLVH8,218
|
|
106
|
+
xpk/templates/kueue_config.yaml.j2,sha256=ZAZwzZ28piwlXi9Offo2CvrQ3K9gv1r9BJTPavnUEdY,2055
|
|
107
|
+
xpk/templates/kueue_gke_default_topology.yaml.j2,sha256=wW3qt6p3VDPgFVX7Ozw4-O4QgQ3mhH8U3osKnmuOFaE,299
|
|
108
|
+
xpk/templates/kueue_sub_slicing_topology.yaml.j2,sha256=Lv54uS3fC7sJNJdk4DFtnOiLSgq4wq38n_BQCx4Tz6Q,542
|
|
109
|
+
xpk/templates/mtc-cpc.yaml,sha256=MPx75tog09kjRAvHoNOPCEobigQ17d7pYCUnZCevSDQ,340
|
|
110
|
+
xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
|
|
111
|
+
xpk/templates/volume_bundle.yaml,sha256=sqeag7GPWqGNQ5doZtO9IVAX_vKYRO73-aBE7waEtSY,129
|
|
112
|
+
xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
113
|
+
xpk/utils/console.py,sha256=hRbvtog_VAzuxt5GfwK5GZdd5SWaa7kvWG8zo_qFRQc,1519
|
|
114
|
+
xpk/utils/execution_context.py,sha256=WYxm6NExBIP6iLAWaL5aV858riGJbAHn0Zs6fmKlmzE,784
|
|
115
|
+
xpk/utils/feature_flags.py,sha256=b8zB_zEiKcoOnyGHc7vuQN0ruRTdY8ixqUjWT7Ilp-M,824
|
|
116
|
+
xpk/utils/file.py,sha256=hi9v4gfwiB3JHi3tnelPbm_dlTUt47U0wvvWKQqMjiQ,2500
|
|
117
|
+
xpk/utils/gcs_utils.py,sha256=zg-XSTv4G4TFjeT2bNBm2WLdDXPrOZi0rNv_JdppNg4,4113
|
|
118
|
+
xpk/utils/kubectl.py,sha256=WKB9UhpouPN9G4n2ejRi_PgsYLI0R01gzkS1WGU6mJA,1828
|
|
119
|
+
xpk/utils/kueue.py,sha256=P1Pu_crGuOgYxjl8CczTgtQoum0w1sbSLGPOaEZ5180,713
|
|
120
|
+
xpk/utils/network.py,sha256=dGS5rxIm_zaayDElHNlzalaf09M99by5ckL_lGDl_yQ,4293
|
|
121
|
+
xpk/utils/objects.py,sha256=OwMNxB4TGX21qnJPdZo2YBMPMbQPqOtHMh19QhoRNRY,2498
|
|
122
|
+
xpk/utils/templates.py,sha256=5VAUtv-F6ICL5mxZ3Xtzdh8FEc0-86jFbqhgwW-QtcM,1277
|
|
123
|
+
xpk/utils/topology.py,sha256=WcAG3kzA8krlbyiZpTRDI2TteNozn3r1GH914Lykcp8,1374
|
|
124
|
+
xpk/utils/topology_test.py,sha256=jDXCPgBPfByqjhi0W9A5c8uOHOYjRav53nNlg71ipjk,1943
|
|
125
|
+
xpk/utils/validation.py,sha256=-Qd5jqkVzQHkJvmQnGjHjtAcfvz064Vbo6_sl4EnYKw,3497
|
|
126
|
+
xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
|
|
127
|
+
xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
|
|
128
|
+
xpk-0.14.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
129
|
+
xpk-0.14.1.dist-info/METADATA,sha256=41WVkaR-Bgj64aC9vi26hgI9g6CpnDtEF1G3yXhbpzI,71934
|
|
130
|
+
xpk-0.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
131
|
+
xpk-0.14.1.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
|
|
132
|
+
xpk-0.14.1.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
|
|
133
|
+
xpk-0.14.1.dist-info/RECORD,,
|
xpk/core/kueue.py
DELETED
|
@@ -1,561 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2024 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from argparse import Namespace
|
|
18
|
-
|
|
19
|
-
import math
|
|
20
|
-
import packaging
|
|
21
|
-
from packaging.version import Version
|
|
22
|
-
|
|
23
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
-
from ..utils.file import write_tmp_file
|
|
25
|
-
from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
26
|
-
from .commands import (
|
|
27
|
-
run_command_for_value,
|
|
28
|
-
run_command_with_updates,
|
|
29
|
-
run_command_with_updates_retry,
|
|
30
|
-
)
|
|
31
|
-
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
|
|
32
|
-
from .resources import AutoprovisioningConfig
|
|
33
|
-
from .scheduling import (
|
|
34
|
-
create_accelerator_label,
|
|
35
|
-
create_machine_label,
|
|
36
|
-
get_total_chips_requested_from_args,
|
|
37
|
-
)
|
|
38
|
-
from .system_characteristics import (
|
|
39
|
-
AcceleratorTypeToAcceleratorCharacteristics,
|
|
40
|
-
SystemCharacteristics,
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
KUEUE_VERSION = 'v0.12.2'
|
|
44
|
-
CLUSTER_QUEUE_NAME = 'cluster-queue'
|
|
45
|
-
LOCAL_QUEUE_NAME = 'multislice-queue'
|
|
46
|
-
WAIT_FOR_KUEUE_TIMEOUT = '10m'
|
|
47
|
-
MEMORY_SIZE_PER_VM = 1.2
|
|
48
|
-
MIN_MEMORY_LIMIT_SIZE = 4096
|
|
49
|
-
|
|
50
|
-
packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
|
|
51
|
-
|
|
52
|
-
topology_yaml = """apiVersion: kueue.x-k8s.io/v1alpha1
|
|
53
|
-
kind: Topology
|
|
54
|
-
metadata:
|
|
55
|
-
name: "gke-default"
|
|
56
|
-
spec:
|
|
57
|
-
levels:
|
|
58
|
-
- nodeLabel: "cloud.google.com/gce-topology-block"
|
|
59
|
-
- nodeLabel: "cloud.google.com/gce-topology-subblock"
|
|
60
|
-
- nodeLabel: "cloud.google.com/gce-topology-host"
|
|
61
|
-
- nodeLabel: "kubernetes.io/hostname"
|
|
62
|
-
---
|
|
63
|
-
"""
|
|
64
|
-
|
|
65
|
-
cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
|
|
66
|
-
kind: ResourceFlavor
|
|
67
|
-
metadata:
|
|
68
|
-
name: {cluster_hardware_name}
|
|
69
|
-
spec:
|
|
70
|
-
nodeLabels:
|
|
71
|
-
{accelerator_label}
|
|
72
|
-
{machine_label}
|
|
73
|
-
{topology_label}
|
|
74
|
-
---
|
|
75
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
76
|
-
kind: AdmissionCheck
|
|
77
|
-
metadata:
|
|
78
|
-
name: dws-prov
|
|
79
|
-
spec:
|
|
80
|
-
controllerName: kueue.x-k8s.io/provisioning-request
|
|
81
|
-
parameters:
|
|
82
|
-
apiGroup: kueue.x-k8s.io
|
|
83
|
-
kind: ProvisioningRequestConfig
|
|
84
|
-
name: dws-config
|
|
85
|
-
---
|
|
86
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
87
|
-
kind: ProvisioningRequestConfig
|
|
88
|
-
metadata:
|
|
89
|
-
name: dws-config
|
|
90
|
-
spec:
|
|
91
|
-
provisioningClassName: queued-provisioning.gke.io
|
|
92
|
-
podSetUpdates:
|
|
93
|
-
nodeSelector:
|
|
94
|
-
- key: autoscaling.gke.io/provisioning-request
|
|
95
|
-
valueFromProvisioningClassDetail: ResizeRequestName
|
|
96
|
-
managedResources:
|
|
97
|
-
- {managed_resource}
|
|
98
|
-
---
|
|
99
|
-
{pw_resource_flavors}
|
|
100
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
101
|
-
kind: ClusterQueue
|
|
102
|
-
metadata:
|
|
103
|
-
name: {cluster_queue_name}
|
|
104
|
-
spec:
|
|
105
|
-
preemption:
|
|
106
|
-
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
|
|
107
|
-
withinClusterQueue: LowerPriority
|
|
108
|
-
namespaceSelector: {{}} # match all.
|
|
109
|
-
resourceGroups:
|
|
110
|
-
{covered_resources_config}
|
|
111
|
-
{pw_resources_kueue}
|
|
112
|
-
{admission_checks}
|
|
113
|
-
---
|
|
114
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
115
|
-
kind: LocalQueue
|
|
116
|
-
metadata:
|
|
117
|
-
namespace: default
|
|
118
|
-
name: {local_queue_name}
|
|
119
|
-
spec:
|
|
120
|
-
clusterQueue: {cluster_queue_name}
|
|
121
|
-
---
|
|
122
|
-
apiVersion: scheduling.k8s.io/v1
|
|
123
|
-
kind: PriorityClass
|
|
124
|
-
metadata:
|
|
125
|
-
name: very-low
|
|
126
|
-
value: 100
|
|
127
|
-
globalDefault: false
|
|
128
|
-
description: "Very Low"
|
|
129
|
-
---
|
|
130
|
-
apiVersion: scheduling.k8s.io/v1
|
|
131
|
-
kind: PriorityClass
|
|
132
|
-
metadata:
|
|
133
|
-
name: low
|
|
134
|
-
value: 250
|
|
135
|
-
globalDefault: false
|
|
136
|
-
description: "Low"
|
|
137
|
-
---
|
|
138
|
-
apiVersion: scheduling.k8s.io/v1
|
|
139
|
-
kind: PriorityClass
|
|
140
|
-
metadata:
|
|
141
|
-
name: medium
|
|
142
|
-
value: 500
|
|
143
|
-
globalDefault: false
|
|
144
|
-
description: "Medium"
|
|
145
|
-
---
|
|
146
|
-
apiVersion: scheduling.k8s.io/v1
|
|
147
|
-
kind: PriorityClass
|
|
148
|
-
metadata:
|
|
149
|
-
name: high
|
|
150
|
-
value: 750
|
|
151
|
-
globalDefault: false
|
|
152
|
-
description: "High"
|
|
153
|
-
---
|
|
154
|
-
apiVersion: scheduling.k8s.io/v1
|
|
155
|
-
kind: PriorityClass
|
|
156
|
-
metadata:
|
|
157
|
-
name: very-high
|
|
158
|
-
value: 1000
|
|
159
|
-
globalDefault: false
|
|
160
|
-
description: "Very High"
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
cluster_preheat_yml = """
|
|
164
|
-
apiVersion: apps/v1
|
|
165
|
-
kind: DaemonSet
|
|
166
|
-
metadata:
|
|
167
|
-
name: {cachekey}
|
|
168
|
-
labels:
|
|
169
|
-
k8s-app: {cachekey}
|
|
170
|
-
spec:
|
|
171
|
-
selector:
|
|
172
|
-
matchLabels:
|
|
173
|
-
k8s-app: {cachekey}
|
|
174
|
-
updateStrategy:
|
|
175
|
-
type: RollingUpdate
|
|
176
|
-
template:
|
|
177
|
-
metadata:
|
|
178
|
-
labels:
|
|
179
|
-
name: {cachekey}
|
|
180
|
-
k8s-app: {cachekey}
|
|
181
|
-
spec:
|
|
182
|
-
affinity:
|
|
183
|
-
nodeAffinity:
|
|
184
|
-
requiredDuringSchedulingIgnoredDuringExecution:
|
|
185
|
-
nodeSelectorTerms:
|
|
186
|
-
- matchExpressions:
|
|
187
|
-
- key: {nodeSelectorKey}
|
|
188
|
-
operator: Exists
|
|
189
|
-
tolerations:
|
|
190
|
-
- operator: "Exists"
|
|
191
|
-
containers:
|
|
192
|
-
- image: {image_name}
|
|
193
|
-
name: {cachekey}
|
|
194
|
-
command: [ "sleep", "inf" ]
|
|
195
|
-
"""
|
|
196
|
-
|
|
197
|
-
kueue_controller_manager_yml = """
|
|
198
|
-
apiVersion: apps/v1
|
|
199
|
-
kind: Deployment
|
|
200
|
-
metadata:
|
|
201
|
-
labels:
|
|
202
|
-
app.kubernetes.io/component: controller
|
|
203
|
-
app.kubernetes.io/name: kueue
|
|
204
|
-
control-plane: controller-manager
|
|
205
|
-
name: kueue-controller-manager
|
|
206
|
-
namespace: kueue-system
|
|
207
|
-
spec:
|
|
208
|
-
replicas: 1
|
|
209
|
-
selector:
|
|
210
|
-
matchLabels:
|
|
211
|
-
control-plane: controller-manager
|
|
212
|
-
template:
|
|
213
|
-
metadata:
|
|
214
|
-
annotations:
|
|
215
|
-
kubectl.kubernetes.io/default-container: manager
|
|
216
|
-
labels:
|
|
217
|
-
app.kubernetes.io/component: controller
|
|
218
|
-
app.kubernetes.io/name: kueue
|
|
219
|
-
control-plane: controller-manager
|
|
220
|
-
spec:
|
|
221
|
-
containers:
|
|
222
|
-
- args:
|
|
223
|
-
- --config=/controller_manager_config.yaml
|
|
224
|
-
- --zap-log-level=2
|
|
225
|
-
command:
|
|
226
|
-
- /manager
|
|
227
|
-
image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION}
|
|
228
|
-
imagePullPolicy: Always
|
|
229
|
-
livenessProbe:
|
|
230
|
-
httpGet:
|
|
231
|
-
path: /healthz
|
|
232
|
-
port: 8081
|
|
233
|
-
initialDelaySeconds: 15
|
|
234
|
-
periodSeconds: 20
|
|
235
|
-
name: manager
|
|
236
|
-
ports:
|
|
237
|
-
- containerPort: 8082
|
|
238
|
-
name: visibility
|
|
239
|
-
protocol: TCP
|
|
240
|
-
- containerPort: 9443
|
|
241
|
-
name: webhook-server
|
|
242
|
-
protocol: TCP
|
|
243
|
-
readinessProbe:
|
|
244
|
-
httpGet:
|
|
245
|
-
path: /readyz
|
|
246
|
-
port: 8081
|
|
247
|
-
initialDelaySeconds: 5
|
|
248
|
-
periodSeconds: 10
|
|
249
|
-
resources:
|
|
250
|
-
limits:
|
|
251
|
-
cpu: 1000m
|
|
252
|
-
memory: {memory_limit_size}
|
|
253
|
-
requests:
|
|
254
|
-
cpu: 1000m
|
|
255
|
-
memory: 512Mi
|
|
256
|
-
securityContext:
|
|
257
|
-
allowPrivilegeEscalation: false
|
|
258
|
-
volumeMounts:
|
|
259
|
-
- mountPath: /visibility
|
|
260
|
-
name: visibility
|
|
261
|
-
- mountPath: /tmp/k8s-webhook-server/serving-certs
|
|
262
|
-
name: cert
|
|
263
|
-
readOnly: true
|
|
264
|
-
- mountPath: /controller_manager_config.yaml
|
|
265
|
-
name: manager-config
|
|
266
|
-
subPath: controller_manager_config.yaml
|
|
267
|
-
securityContext:
|
|
268
|
-
runAsNonRoot: true
|
|
269
|
-
serviceAccountName: kueue-controller-manager
|
|
270
|
-
terminationGracePeriodSeconds: 10
|
|
271
|
-
volumes:
|
|
272
|
-
- name: visibility
|
|
273
|
-
emptyDir: {{}}
|
|
274
|
-
- name: cert
|
|
275
|
-
secret:
|
|
276
|
-
defaultMode: 420
|
|
277
|
-
secretName: kueue-webhook-server-cert
|
|
278
|
-
- configMap:
|
|
279
|
-
name: kueue-manager-config
|
|
280
|
-
name: manager-config
|
|
281
|
-
"""
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
def verify_kueuectl(args: Namespace) -> None:
|
|
285
|
-
"""Verify if kueuectl is installed.
|
|
286
|
-
Args:
|
|
287
|
-
args: user provided arguments.
|
|
288
|
-
Returns:
|
|
289
|
-
None
|
|
290
|
-
"""
|
|
291
|
-
xpk_print('Veryfing kueuectl installation')
|
|
292
|
-
|
|
293
|
-
command = 'kubectl kueue version'
|
|
294
|
-
task = 'Verify kueuectl installation on cluster'
|
|
295
|
-
verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)
|
|
296
|
-
|
|
297
|
-
if verify_kueuectl_installed_code == 0:
|
|
298
|
-
xpk_print('kueuectl found')
|
|
299
|
-
|
|
300
|
-
if verify_kueuectl_installed_code != 0:
|
|
301
|
-
xpk_print(
|
|
302
|
-
'kueuectl not found. Please follow'
|
|
303
|
-
' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
|
|
304
|
-
' to install kueuectl.'
|
|
305
|
-
)
|
|
306
|
-
xpk_exit(verify_kueuectl_installed_code)
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
def delete_multikueueconfigs_definitions(args) -> int:
|
|
310
|
-
command = 'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
|
|
311
|
-
task = 'Delete multikueueconfigs crds'
|
|
312
|
-
return_code = run_command_with_updates_retry(command, task, args)
|
|
313
|
-
if return_code != 0:
|
|
314
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
315
|
-
return return_code
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
def delete_multikueueclusters_definitions(args) -> int:
|
|
319
|
-
command = 'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
|
|
320
|
-
task = 'Delete multikueueclusters crds'
|
|
321
|
-
return_code = run_command_with_updates_retry(command, task, args)
|
|
322
|
-
if return_code != 0:
|
|
323
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
324
|
-
return return_code
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
def get_kueue_version(args) -> tuple[int, str]:
|
|
328
|
-
command = 'kubectl kueue version'
|
|
329
|
-
task = 'Get kueue version on server'
|
|
330
|
-
return_code, val = run_command_for_value(command, task, args)
|
|
331
|
-
if return_code != 0:
|
|
332
|
-
return return_code, ''
|
|
333
|
-
lines = val.splitlines()
|
|
334
|
-
if len(lines) == 1:
|
|
335
|
-
return 1, ''
|
|
336
|
-
server_version_line = lines[1]
|
|
337
|
-
manager_image_version = server_version_line.split(':')[-1]
|
|
338
|
-
return return_code, manager_image_version
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
def install_kueue_on_cluster(args) -> int:
|
|
342
|
-
"""Install Kueue on the cluster.
|
|
343
|
-
|
|
344
|
-
Args:
|
|
345
|
-
args: user provided arguments for running the command.
|
|
346
|
-
|
|
347
|
-
Returns:
|
|
348
|
-
0 if successful and 1 otherwise.
|
|
349
|
-
"""
|
|
350
|
-
|
|
351
|
-
err_code, kueue_version_installed = get_kueue_version(args)
|
|
352
|
-
if err_code == 0:
|
|
353
|
-
if Version(kueue_version_installed) < Version('v0.9.0') and Version(
|
|
354
|
-
KUEUE_VERSION
|
|
355
|
-
) >= Version('v0.9.0'):
|
|
356
|
-
xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
|
|
357
|
-
upgrade_code = delete_multikueueclusters_definitions(args)
|
|
358
|
-
if upgrade_code != 0:
|
|
359
|
-
return upgrade_code
|
|
360
|
-
upgrade_code = delete_multikueueconfigs_definitions(args)
|
|
361
|
-
if upgrade_code != 0:
|
|
362
|
-
return upgrade_code
|
|
363
|
-
|
|
364
|
-
command = (
|
|
365
|
-
'kubectl apply --server-side --force-conflicts -f'
|
|
366
|
-
f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
|
|
367
|
-
)
|
|
368
|
-
task = 'Set Kueue On Cluster'
|
|
369
|
-
return_code = run_command_with_updates_retry(command, task, args)
|
|
370
|
-
if return_code != 0:
|
|
371
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
372
|
-
return return_code
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
def wait_for_kueue_available(args: Namespace) -> int:
|
|
376
|
-
"""Wait for Kueue to be fully available.
|
|
377
|
-
|
|
378
|
-
Args:
|
|
379
|
-
args: user provided arguments for running the command.
|
|
380
|
-
|
|
381
|
-
Returns:
|
|
382
|
-
0 if successful and 1 otherwise.
|
|
383
|
-
"""
|
|
384
|
-
command = (
|
|
385
|
-
'kubectl wait deploy/kueue-controller-manager -nkueue-system'
|
|
386
|
-
f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}'
|
|
387
|
-
)
|
|
388
|
-
task = 'Wait for Kueue to be available'
|
|
389
|
-
return_code = run_command_with_updates(command, task, args)
|
|
390
|
-
if return_code != 0:
|
|
391
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
392
|
-
return return_code
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
def install_kueue_crs(
|
|
396
|
-
args,
|
|
397
|
-
system: SystemCharacteristics,
|
|
398
|
-
autoprovisioning_config: AutoprovisioningConfig | None,
|
|
399
|
-
flex_with_tpu=False,
|
|
400
|
-
) -> int:
|
|
401
|
-
"""Install Kueue Custom Resources.
|
|
402
|
-
|
|
403
|
-
Args:
|
|
404
|
-
args: user provided arguments for running the command.
|
|
405
|
-
system: system level arguments.
|
|
406
|
-
autoprovisioning_config: Autoprovisioning config to configure kueue with if
|
|
407
|
-
autoprovisioning is enabled.
|
|
408
|
-
|
|
409
|
-
Returns:
|
|
410
|
-
0 if successful and 1 otherwise.
|
|
411
|
-
"""
|
|
412
|
-
device_type = system.device_type
|
|
413
|
-
cluster_hardware_name = f'{args.num_slices}x{device_type}'
|
|
414
|
-
resource_type = AcceleratorTypeToAcceleratorCharacteristics[
|
|
415
|
-
system.accelerator_type
|
|
416
|
-
].resource_type
|
|
417
|
-
|
|
418
|
-
autoprovisioning_enabled = False
|
|
419
|
-
if autoprovisioning_config:
|
|
420
|
-
# Determine total resources available based on autoprovisioning max chips.
|
|
421
|
-
autoprovisioning_enabled = True
|
|
422
|
-
total_chips = autoprovisioning_config.maximum_chips
|
|
423
|
-
cluster_hardware_name = f'{system.gke_accelerator}'
|
|
424
|
-
else:
|
|
425
|
-
# Determine total chips based on user specified topology.
|
|
426
|
-
total_chips = get_total_chips_requested_from_args(args, system)
|
|
427
|
-
if args.flex and flex_with_tpu is False:
|
|
428
|
-
admission_checks = """
|
|
429
|
-
admissionChecks:
|
|
430
|
-
- dws-prov
|
|
431
|
-
"""
|
|
432
|
-
else:
|
|
433
|
-
admission_checks = ''
|
|
434
|
-
|
|
435
|
-
covered_resources_config = get_kueue_covered_resources_config(
|
|
436
|
-
cluster_hardware_name=cluster_hardware_name,
|
|
437
|
-
resource_type=resource_type,
|
|
438
|
-
total_chips=total_chips,
|
|
439
|
-
cpu_limit=args.cpu_limit,
|
|
440
|
-
memory_limit=args.memory_limit,
|
|
441
|
-
)
|
|
442
|
-
topology_label = ''
|
|
443
|
-
if system.device_type in [
|
|
444
|
-
H100_MEGA_DEVICE_TYPE,
|
|
445
|
-
H200_DEVICE_TYPE,
|
|
446
|
-
B200_DEVICE_TYPE,
|
|
447
|
-
]:
|
|
448
|
-
topology_label = 'topologyName: "gke-default"'
|
|
449
|
-
res_type = AcceleratorTypeToAcceleratorCharacteristics[
|
|
450
|
-
system.accelerator_type
|
|
451
|
-
].resource_type
|
|
452
|
-
yml_string = cluster_set_crd_yaml.format(
|
|
453
|
-
system=system,
|
|
454
|
-
cluster_hardware_name=cluster_hardware_name,
|
|
455
|
-
accelerator_label=create_accelerator_label(
|
|
456
|
-
system.accelerator_type, system
|
|
457
|
-
),
|
|
458
|
-
machine_label=create_machine_label(
|
|
459
|
-
system.accelerator_type, system, autoprovisioning_enabled
|
|
460
|
-
),
|
|
461
|
-
topology_label=topology_label,
|
|
462
|
-
covered_resources_config=covered_resources_config,
|
|
463
|
-
resource_type=res_type,
|
|
464
|
-
pw_resource_flavors=add_pw_resource_flavors(args),
|
|
465
|
-
pw_resources_kueue=add_pw_resources_to_kueue(args),
|
|
466
|
-
admission_checks=admission_checks,
|
|
467
|
-
managed_resource=res_type,
|
|
468
|
-
cluster_queue_name=CLUSTER_QUEUE_NAME,
|
|
469
|
-
local_queue_name=LOCAL_QUEUE_NAME,
|
|
470
|
-
)
|
|
471
|
-
if system.device_type in [
|
|
472
|
-
H100_MEGA_DEVICE_TYPE,
|
|
473
|
-
H200_DEVICE_TYPE,
|
|
474
|
-
B200_DEVICE_TYPE,
|
|
475
|
-
]:
|
|
476
|
-
yml_string = topology_yaml + yml_string
|
|
477
|
-
|
|
478
|
-
tmp = write_tmp_file(yml_string)
|
|
479
|
-
command = f'kubectl apply -f {str(tmp)}'
|
|
480
|
-
|
|
481
|
-
task = 'Applying Kueue Custom Resources'
|
|
482
|
-
return_code = run_command_with_updates_retry(command, task, args)
|
|
483
|
-
if return_code != 0:
|
|
484
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
485
|
-
return return_code
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
def get_kueue_covered_resources_config(
    cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
) -> str:
  """Builds the covered-resources YAML fragment for Kueue.

  The accelerator resource is always listed. A "cpu" and/or "memory" quota
  entry is appended only when the corresponding limit is truthy, and the same
  names are added to the `coveredResources` list.

  Args:
    cluster_hardware_name: cluster hardware name, used as the flavor name.
    resource_type: resource type of tpu or gpu.
    total_chips: total number of chips for the specific resource type.
    cpu_limit: nominal quota for "cpu"; the entry is skipped when falsy.
    memory_limit: nominal quota for "memory"; the entry is skipped when falsy.

  Returns:
    A string of Kueue covered resources configuration (starts with a newline).
  """
  # NOTE(review): leading whitespace inside the YAML templates below was
  # reconstructed from a whitespace-stripped view of this file -- confirm it
  # against the template this fragment is substituted into.
  covered = [resource_type]
  template_parts = ["""
  - coveredResources: {resource_types}
    flavors:
    - name: {cluster_hardware_name}
      resources:
      - name: "{resource_type}"
        nominalQuota: {total_chips}"""]

  if cpu_limit:
    covered.append('cpu')
    template_parts.append("""
      - name: "cpu"
        nominalQuota: {cpu_limit}""")

  if memory_limit:
    covered.append('memory')
    template_parts.append("""
      - name: "memory"
        nominalQuota: {memory_limit}""")

  # Substitute once over the assembled template; the list of covered
  # resources is rendered with its Python list formatting.
  return ''.join(template_parts).format(
      resource_types=covered,
      cluster_hardware_name=cluster_hardware_name,
      resource_type=resource_type,
      total_chips=total_chips,
      cpu_limit=cpu_limit,
      memory_limit=memory_limit,
  )
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
def update_kueue_resources_if_necessary(args):
  """Re-applies the Kueue controller manager manifest with a scaled memory limit.

  The node count is read with kubectl, converted into a memory limit, and
  substituted into the controller manager manifest, which is then applied.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code of the `kubectl apply` command; 0 means success. If the
    node count cannot be obtained, xpk_exit(1) is called instead.
  """
  node_count_code, node_count_out = run_command_for_value(
      'kubectl get node --no-headers | wc -l', 'Count total nodes', args
  )
  if node_count_code != 0:
    xpk_exit(1)

  # 1.2MiB per VM or 4GiB (whichever is greater).
  scaled_limit = math.ceil(int(node_count_out) * MEMORY_SIZE_PER_VM)
  memory_limit_size = f'{max(scaled_limit, MIN_MEMORY_LIMIT_SIZE)}Mi'

  manifest = kueue_controller_manager_yml.format(
      memory_limit_size=memory_limit_size, KUEUE_VERSION=KUEUE_VERSION
  )
  manifest_file = write_tmp_file(manifest)

  task = 'Updating Kueue Controller Manager resources'
  return_code = run_command_with_updates_retry(
      f'kubectl apply -f {str(manifest_file)}', task, args
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
|