xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
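The Kueue setup that previously lived as inline YAML strings in xpk/core/kueue.py now ships as Jinja2 templates (kueue_config.yaml.j2, kueue_gke_default_topology.yaml.j2, kueue_sub_slicing_topology.yaml.j2) alongside the new xpk/core/kueue_manager.py and xpk/utils/templates.py. As a rough illustration of that template-rendering pattern (not xpk's actual helper; the context keys below are hypothetical), such a template could be rendered with the jinja2 package like this:

```python
# Illustrative sketch only: xpk's real loader lives in xpk/utils/templates.py
# and xpk/core/kueue_manager.py; the template context keys are hypothetical.
from jinja2 import Environment, PackageLoader

env = Environment(loader=PackageLoader("xpk", "templates"))
template = env.get_template("kueue_config.yaml.j2")

# Queue names taken from the constants in the removed xpk/core/kueue.py.
rendered = template.render(
    cluster_queue_name="cluster-queue",
    local_queue_name="multislice-queue",
)
print(rendered)
```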
xpk-0.14.1.dist-info/RECORD ADDED
@@ -0,0 +1,133 @@
1
+ integration/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
2
+ integration/docker_manager_test.py,sha256=J2xijy6crRtrwQXrEvtOEY7mo1kEJYhcIYMZ7w0OGa4,2514
3
+ integration/gcluster_a3mega_test.py,sha256=KRxgAsTbo6QBVvhhaEHSis_EvTPi2B-ZIlhEoGClcFs,6046
4
+ integration/gcluster_a3ultra_test.py,sha256=8wEtlQN1_uIBUsidvH_l7Ab-ikDpnABrlu9k1TIBz4Q,5846
5
+ integration/gcluster_a4_test.py,sha256=GCe6BujHCvM62kIGOd-9Wvz-IrR0BY5d83bGD1cmsQ0,5754
6
+ integration/gcluster_test.py,sha256=3GSOMszzNW6Yr4T4PFIpmszonwDAAGpSdKutUA77O-g,3304
7
+ xpk/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
8
+ xpk/main.py,sha256=7YBpzpHxV61c2Js0-uUHUXxTW7cWhLLdMc263rYRW3E,2416
9
+ xpk/api/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
10
+ xpk/api/storage_crd.yaml,sha256=r4WFXnSJJ25EUF-t4Ljfbl-cJoSaiFiZkP8451eTub4,1260
11
+ xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
12
+ xpk/commands/batch.py,sha256=Cj1bDpzPMoPdhaKKrOJJLJ3JzRvJrCMn8huQoHHIZJI,4192
13
+ xpk/commands/cluster.py,sha256=CG4Ieg3QADNUAwyPzZh0P3mop4Pv1DILrNMJVLvgbi8,41682
14
+ xpk/commands/cluster_gcluster.py,sha256=MOxQfQ19sxaDtDBfIzUdrxw4FpboDIiGFKEtFLwfEgQ,13080
15
+ xpk/commands/cluster_gcluster_test.py,sha256=MoMlHbHnCI4ZrnMai1Zw71IP1ORxVVh23cMD5qgXOtQ,6136
16
+ xpk/commands/cluster_test.py,sha256=tT1oEUXqNkzElgHZ-UXJGkJEGyakuaPb1cLHjV-eeM0,2800
17
+ xpk/commands/common.py,sha256=nxLKPhXuAMW7wq-5xL2YeOgDTMUDcLGMujatOUHJ3s4,2504
18
+ xpk/commands/config.py,sha256=gFNkf3ibsvZmcPpkpKXe-KJmHO5IKucNwLCXNgKvaDc,836
19
+ xpk/commands/info.py,sha256=uhv5mPfgg9N-5JhQw4dT2jujL9ZC5kzGA18h9NFfm5A,7429
20
+ xpk/commands/inspector.py,sha256=JruseZl9ZIlR9-Lv_pn8YHfLdyiMgHHSf7xPGBAtXTM,12616
21
+ xpk/commands/job.py,sha256=rPIfWvgm5mLz7K7YDLK721ZcUcg5OEmYVAPAtRtB5Ag,6718
22
+ xpk/commands/kind.py,sha256=NM2CoKJXrYtY9DHZ2yM097WGcI1kno_G2b23tlLVIEI,7658
23
+ xpk/commands/kjob_common.py,sha256=bRaORiGVjPAdN0T3aRmbcQgXYe-EtjoVKePdWzQ5xU4,1928
24
+ xpk/commands/run.py,sha256=D0zgmnGeBLATphYhzQj29EScxrMmAKqPRhP6nfWuYcY,4085
25
+ xpk/commands/shell.py,sha256=mRHMwm3Izzsue4bocekm82Rg_cPUaGMClSlvNzNXQ-o,4467
26
+ xpk/commands/storage.py,sha256=90nz84_ut-uwdIVrxnLYq0K4uYpZWncKM-ZZ_2bRzcI,11505
27
+ xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
28
+ xpk/commands/workload.py,sha256=S6g2B8bzhM_XNGY4NJUYVODcIuvutKQmdl0VV7jwGzw,29239
29
+ xpk/commands/workload_test.py,sha256=e6D1j2Akewq50_76oD84gSCdcW44PEOsIxLPjaL11s8,2344
30
+ xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
31
+ xpk/core/capacity.py,sha256=ohrVvguNwSv7HufdhAhp9dEtf49QT9eXPaPuI9BuAck,7401
32
+ xpk/core/cluster.py,sha256=H5-x0xYijQbgINbAIXovu6JoR-9iKnuVQrMVy1xrEP0,26359
33
+ xpk/core/cluster_private.py,sha256=RLi0C7bV0NEUXl6QKQzvUT0weN9EdqPvjuuOQsNO0DY,6868
34
+ xpk/core/commands.py,sha256=mtkT_ZsuHj5HTvFAj7mMd8kMXsWoWSA4zqHjlRiFL4o,10369
35
+ xpk/core/config.py,sha256=G7MKXARftcuoKSV90F40HXh7kYj1ePde8O8dCp95QoI,3407
36
+ xpk/core/config_test.py,sha256=v1qfyFRzLkYSQ7Wn4nx1N0dBSOFXidLWDfhkeHDZOVM,1847
37
+ xpk/core/docker_container.py,sha256=GvkCJ2S5UKn8uh3pZhRd3X7iS0-PsQpRO8l7QhywVGc,7604
38
+ xpk/core/docker_image.py,sha256=MIU397IGIPwkTZFK-ZGEWuc3RmUIF3sQQZUiUj2gLqA,6775
39
+ xpk/core/docker_manager.py,sha256=JBFgyD6O7LKwEHJC7YuSoCDZqrFRtb-LjgWNqkfAbR0,10566
40
+ xpk/core/docker_resources.py,sha256=_aKgpUjyJB2krQ1PkHrotB7K4kByLmPLbuvl_UVvuX8,12843
41
+ xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
42
+ xpk/core/gcloud_context.py,sha256=xZdVoRNLlE-kwXY5djoyQ0I0-KEh5nAohrVql7Jl42k,6649
43
+ xpk/core/gcloud_context_test.py,sha256=YY0R6j-m62coVK2MAjWXDIdxdP6J5yn6R1RiTDkuExQ,2719
44
+ xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
45
+ xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
46
+ xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
47
+ xpk/core/kjob.py,sha256=symYO3DQfSNP6MAJE54QZuCPDF2kseaxZ-_mmsoBQjo,14478
48
+ xpk/core/kueue_manager.py,sha256=TFdl33N-b-eSrOrsnJueP1H4qYWpkX_lFDhLSVvfeGM,13292
49
+ xpk/core/kueue_manager_test.py,sha256=PtsSSfwtdk-2wK_dr_OQb8oYWVce5nmDN6iowND_3b4,20062
50
+ xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
51
+ xpk/core/mtc.py,sha256=pO7p3l-EzLFdTE8MdwWV8i0Zu-7epGql_kPoksVofIU,6259
52
+ xpk/core/nap.py,sha256=uA33XccGjEF5RZRO5IpUMMzCf-u6D73cwwPEj4q1qvc,12820
53
+ xpk/core/network.py,sha256=Oulb7U69lWkpOKxOC1C7ekJDpC51TLwd7XdZA3NQ7E0,10505
54
+ xpk/core/nodepool.py,sha256=LfS_RvKHmS7f97hc_UidUC13FFEyGvfageSlt8d-5hw,23227
55
+ xpk/core/nodepool_test.py,sha256=YNvp8WXznAI8DscrN9-BSnqII2AtRju2guxrobvCH8A,8805
56
+ xpk/core/pathways.py,sha256=s-h_ofMrbFn3J6NFmT5OMe_HiUQIkI90ty7xbS05iA8,10710
57
+ xpk/core/ray.py,sha256=JWhc_ToRHpF4_URGnuE_47FMgamaRsA4KVUMpqThWzw,6145
58
+ xpk/core/resources.py,sha256=HlYNPQlaJa5y-pb70aVJzdiiOLDnoACF-wJXePM4ejs,8077
59
+ xpk/core/scheduling.py,sha256=_o8QkQxVM-8z5K5ATslM_qQ87f5dzYE0ZeBcwQ6Oqic,9702
60
+ xpk/core/scheduling_test.py,sha256=m9KcglAbg0qly095PmrUOZxJYUE2UeQmkBNIWn5nFyk,979
61
+ xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
62
+ xpk/core/system_characteristics.py,sha256=qHgqyQF6RVjU059YU0eB66_ia74tV-ogDyZbqCTUfw0,22589
63
+ xpk/core/system_characteristics_test.py,sha256=MWvbaEQRoepYNZKOQcs73ppcTKW8plYQ9FQBotfA27I,2250
64
+ xpk/core/vertex.py,sha256=orIZAVwZruRJQ6-vgc1wShuTsiipdH-zHQ9O4ie_HSA,3638
65
+ xpk/core/workload.py,sha256=6TVZM15n8W7046VgmmH9Jv54MrhExtLQH3GaiwlV8Xs,8959
66
+ xpk/core/workload_test.py,sha256=tVTvrwDRXD3O1GCoftgEBWilCYTN74ayP1KRP0vptx0,857
67
+ xpk/core/blueprint/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
68
+ xpk/core/blueprint/blueprint_definitions.py,sha256=5i331XA-2yP_ALyB6XU5tP2Tf9iHcIX5g0TilxQi8zE,1800
69
+ xpk/core/blueprint/blueprint_generator.py,sha256=VWhp89upY7vqVFUqcpWR6zwTBoFUSahX2sQxXEdUpSk,36393
70
+ xpk/core/blueprint/blueprint_test.py,sha256=T058Dq-x4wQQqjs33BWhjHdT4qJLDwsIcc7vareHh_c,7204
71
+ xpk/core/blueprint/testing/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
72
+ xpk/core/remote_state/__init__.py,sha256=PkV8D9WOtlJHH5AIxsQaKeIBcmupT_Ol_bwJgN6G2I8,561
73
+ xpk/core/remote_state/fuse_remote_state.py,sha256=3Dx4ZZd0NFF5-MlqGWHzz8H4bjYiPOWdF_YSEnKUPQ8,3246
74
+ xpk/core/remote_state/remote_state_client.py,sha256=6PcR92Xy_RMjlF4AscanQ1jXNHnewLWGNC2v53jbzD4,1077
75
+ xpk/core/workload_decorators/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
76
+ xpk/core/workload_decorators/rdma_decorator.py,sha256=isbgPnjdu2AT_Da1nVUIRoGE_qZ7jMDOKCgZOLq5r2A,4006
77
+ xpk/core/workload_decorators/storage_decorator.py,sha256=DDYQVO1OKTLhveDOA4V6b2RWr4n0fbwHdnoFFmW7iaQ,2000
78
+ xpk/core/workload_decorators/tcpx_decorator.py,sha256=m5EgzEHjbcOD13ygY91mQdhwQt4Gr5PyalVkKcHyeV8,5975
79
+ xpk/core/workload_decorators/tcpx_decorator_test.py,sha256=iTBS3X_-VwA2oveNDjscduLtll0VOJyFRCp4xmsjg7w,8515
80
+ xpk/core/workload_decorators/tcpxo_decorator.py,sha256=_nLX7tbnxhnS-xv4Jijd1JOP76V4LpNCfW3Np404Cqw,6537
81
+ xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
82
+ xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
83
+ xpk/parser/cluster.py,sha256=1cc2pCMbceF6Lx6MXINlgEfiIRUYMITha3eCE6R8SrI,30708
84
+ xpk/parser/cluster_test.py,sha256=h7lILF2V37WzQSZjGD2jCqI1iWJNY7BLODZf1uhK9QA,1806
85
+ xpk/parser/common.py,sha256=1wihGYBe01ci4quZnkeR60-u_lapSOG4WhRFrE3yFR8,8108
86
+ xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
87
+ xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
88
+ xpk/parser/info.py,sha256=UJohxVVWdt9IgUXoPsrVae2DN1BjAVGWrSN2ajrB8RQ,1860
89
+ xpk/parser/inspector.py,sha256=hAPAZ2k9iSJgC1mjnz3rMleInsAQ8PmkyyUKFyBmsgY,1997
90
+ xpk/parser/job.py,sha256=5RdE70rucGfrsn65l7Ho6RmO06mag1S0AO-3saVuXyw,4328
91
+ xpk/parser/kind.py,sha256=sgPCqNVrgmFLcOBEbhlaphwVXxMh_opP9ntCq4KPePE,2682
92
+ xpk/parser/run.py,sha256=oi_ksSyJ8Ooffe2EgoV_ecpmXEmNGVotjpIQH-HjufE,1481
93
+ xpk/parser/shell.py,sha256=VC8p-kz9XjJZW9DXZ-rnv41XnRDRpQRFywHpB5j7tfc,1970
94
+ xpk/parser/storage.py,sha256=XNynqulEzTmT8_G6wkeBwfXX0XQ1lsd6BFcx0H6rGfU,9971
95
+ xpk/parser/validators.py,sha256=-NBZelvfwZRzjz-YUCreD8EzMLHll8PZM-d-MVm2PG4,1192
96
+ xpk/parser/version.py,sha256=eJo4PAbbmRQZulgKBs_ytbVgV9zAaaXeNzMMxmgFMVY,769
97
+ xpk/parser/workload.py,sha256=oI6y66Old_z6PxFPTZ3LvektUsOZ7U8f6g6cscuzh9g,27208
98
+ xpk/parser/workload_test.py,sha256=t0aAqEsAMz1U3xTMrBcm773d9kwYOcvLtQk5n4jqWPw,2075
99
+ xpk/templates/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
100
+ xpk/templates/cluster_preheat.yaml.j2,sha256=1e8jYagQE6O7BjAfuwmEqGG1b8AOsLRlQm4V68ZnGNs,721
101
+ xpk/templates/filestore-pv.yaml,sha256=FxKZkAXa2czIYblq77iewQjCjOjs-FptuF3YLOByfLo,316
102
+ xpk/templates/filestore-pvc.yaml,sha256=Rf80UNYs3XTUdOJuWCeFq80TKXq5FhfafRE89hq7y9o,161
103
+ xpk/templates/filestore-sc.yaml,sha256=vHzcU7jk0B5z7EgTfQmMM1m2TIzR70ny6MvthyYdqhE,213
104
+ xpk/templates/fuse-pv.yaml,sha256=-CM6AYAy4HKOErd9ogiM-6vWIuWS5yXoXyBAp2EoZsM,321
105
+ xpk/templates/fuse-pvc.yaml,sha256=heGWvRIetukAI9pH9auXxnU2H-G_8pL_wRIzoWDLVH8,218
106
+ xpk/templates/kueue_config.yaml.j2,sha256=ZAZwzZ28piwlXi9Offo2CvrQ3K9gv1r9BJTPavnUEdY,2055
107
+ xpk/templates/kueue_gke_default_topology.yaml.j2,sha256=wW3qt6p3VDPgFVX7Ozw4-O4QgQ3mhH8U3osKnmuOFaE,299
108
+ xpk/templates/kueue_sub_slicing_topology.yaml.j2,sha256=Lv54uS3fC7sJNJdk4DFtnOiLSgq4wq38n_BQCx4Tz6Q,542
109
+ xpk/templates/mtc-cpc.yaml,sha256=MPx75tog09kjRAvHoNOPCEobigQ17d7pYCUnZCevSDQ,340
110
+ xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
111
+ xpk/templates/volume_bundle.yaml,sha256=sqeag7GPWqGNQ5doZtO9IVAX_vKYRO73-aBE7waEtSY,129
112
+ xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
113
+ xpk/utils/console.py,sha256=hRbvtog_VAzuxt5GfwK5GZdd5SWaa7kvWG8zo_qFRQc,1519
114
+ xpk/utils/execution_context.py,sha256=WYxm6NExBIP6iLAWaL5aV858riGJbAHn0Zs6fmKlmzE,784
115
+ xpk/utils/feature_flags.py,sha256=b8zB_zEiKcoOnyGHc7vuQN0ruRTdY8ixqUjWT7Ilp-M,824
116
+ xpk/utils/file.py,sha256=hi9v4gfwiB3JHi3tnelPbm_dlTUt47U0wvvWKQqMjiQ,2500
117
+ xpk/utils/gcs_utils.py,sha256=zg-XSTv4G4TFjeT2bNBm2WLdDXPrOZi0rNv_JdppNg4,4113
118
+ xpk/utils/kubectl.py,sha256=WKB9UhpouPN9G4n2ejRi_PgsYLI0R01gzkS1WGU6mJA,1828
119
+ xpk/utils/kueue.py,sha256=P1Pu_crGuOgYxjl8CczTgtQoum0w1sbSLGPOaEZ5180,713
120
+ xpk/utils/network.py,sha256=dGS5rxIm_zaayDElHNlzalaf09M99by5ckL_lGDl_yQ,4293
121
+ xpk/utils/objects.py,sha256=OwMNxB4TGX21qnJPdZo2YBMPMbQPqOtHMh19QhoRNRY,2498
122
+ xpk/utils/templates.py,sha256=5VAUtv-F6ICL5mxZ3Xtzdh8FEc0-86jFbqhgwW-QtcM,1277
123
+ xpk/utils/topology.py,sha256=WcAG3kzA8krlbyiZpTRDI2TteNozn3r1GH914Lykcp8,1374
124
+ xpk/utils/topology_test.py,sha256=jDXCPgBPfByqjhi0W9A5c8uOHOYjRav53nNlg71ipjk,1943
125
+ xpk/utils/validation.py,sha256=-Qd5jqkVzQHkJvmQnGjHjtAcfvz064Vbo6_sl4EnYKw,3497
126
+ xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
127
+ xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
128
+ xpk-0.14.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
129
+ xpk-0.14.1.dist-info/METADATA,sha256=41WVkaR-Bgj64aC9vi26hgI9g6CpnDtEF1G3yXhbpzI,71934
130
+ xpk-0.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
131
+ xpk-0.14.1.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
132
+ xpk-0.14.1.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
133
+ xpk-0.14.1.dist-info/RECORD,,
xpk-0.14.1.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
1
+ integration
2
+ xpk
xpk/core/kueue.py DELETED
@@ -1,561 +0,0 @@
1
- """
2
- Copyright 2024 Google LLC
3
-
4
- Licensed under the Apache License, Version 2.0 (the "License");
5
- you may not use this file except in compliance with the License.
6
- You may obtain a copy of the License at
7
-
8
- https://www.apache.org/licenses/LICENSE-2.0
9
-
10
- Unless required by applicable law or agreed to in writing, software
11
- distributed under the License is distributed on an "AS IS" BASIS,
12
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- See the License for the specific language governing permissions and
14
- limitations under the License.
15
- """
16
-
17
- from argparse import Namespace
18
-
19
- import math
20
- import packaging
21
- from packaging.version import Version
22
-
23
- from ..utils.console import xpk_exit, xpk_print
24
- from ..utils.file import write_tmp_file
25
- from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
26
- from .commands import (
27
- run_command_for_value,
28
- run_command_with_updates,
29
- run_command_with_updates_retry,
30
- )
31
- from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
32
- from .resources import AutoprovisioningConfig
33
- from .scheduling import (
34
- create_accelerator_label,
35
- create_machine_label,
36
- get_total_chips_requested_from_args,
37
- )
38
- from .system_characteristics import (
39
- AcceleratorTypeToAcceleratorCharacteristics,
40
- SystemCharacteristics,
41
- )
42
-
43
- KUEUE_VERSION = 'v0.12.2'
44
- CLUSTER_QUEUE_NAME = 'cluster-queue'
45
- LOCAL_QUEUE_NAME = 'multislice-queue'
46
- WAIT_FOR_KUEUE_TIMEOUT = '10m'
47
- MEMORY_SIZE_PER_VM = 1.2
48
- MIN_MEMORY_LIMIT_SIZE = 4096
49
-
50
- packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
51
-
52
- topology_yaml = """apiVersion: kueue.x-k8s.io/v1alpha1
53
- kind: Topology
54
- metadata:
55
- name: "gke-default"
56
- spec:
57
- levels:
58
- - nodeLabel: "cloud.google.com/gce-topology-block"
59
- - nodeLabel: "cloud.google.com/gce-topology-subblock"
60
- - nodeLabel: "cloud.google.com/gce-topology-host"
61
- - nodeLabel: "kubernetes.io/hostname"
62
- ---
63
- """
64
-
65
- cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
66
- kind: ResourceFlavor
67
- metadata:
68
- name: {cluster_hardware_name}
69
- spec:
70
- nodeLabels:
71
- {accelerator_label}
72
- {machine_label}
73
- {topology_label}
74
- ---
75
- apiVersion: kueue.x-k8s.io/v1beta1
76
- kind: AdmissionCheck
77
- metadata:
78
- name: dws-prov
79
- spec:
80
- controllerName: kueue.x-k8s.io/provisioning-request
81
- parameters:
82
- apiGroup: kueue.x-k8s.io
83
- kind: ProvisioningRequestConfig
84
- name: dws-config
85
- ---
86
- apiVersion: kueue.x-k8s.io/v1beta1
87
- kind: ProvisioningRequestConfig
88
- metadata:
89
- name: dws-config
90
- spec:
91
- provisioningClassName: queued-provisioning.gke.io
92
- podSetUpdates:
93
- nodeSelector:
94
- - key: autoscaling.gke.io/provisioning-request
95
- valueFromProvisioningClassDetail: ResizeRequestName
96
- managedResources:
97
- - {managed_resource}
98
- ---
99
- {pw_resource_flavors}
100
- apiVersion: kueue.x-k8s.io/v1beta1
101
- kind: ClusterQueue
102
- metadata:
103
- name: {cluster_queue_name}
104
- spec:
105
- preemption:
106
- reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
107
- withinClusterQueue: LowerPriority
108
- namespaceSelector: {{}} # match all.
109
- resourceGroups:
110
- {covered_resources_config}
111
- {pw_resources_kueue}
112
- {admission_checks}
113
- ---
114
- apiVersion: kueue.x-k8s.io/v1beta1
115
- kind: LocalQueue
116
- metadata:
117
- namespace: default
118
- name: {local_queue_name}
119
- spec:
120
- clusterQueue: {cluster_queue_name}
121
- ---
122
- apiVersion: scheduling.k8s.io/v1
123
- kind: PriorityClass
124
- metadata:
125
- name: very-low
126
- value: 100
127
- globalDefault: false
128
- description: "Very Low"
129
- ---
130
- apiVersion: scheduling.k8s.io/v1
131
- kind: PriorityClass
132
- metadata:
133
- name: low
134
- value: 250
135
- globalDefault: false
136
- description: "Low"
137
- ---
138
- apiVersion: scheduling.k8s.io/v1
139
- kind: PriorityClass
140
- metadata:
141
- name: medium
142
- value: 500
143
- globalDefault: false
144
- description: "Medium"
145
- ---
146
- apiVersion: scheduling.k8s.io/v1
147
- kind: PriorityClass
148
- metadata:
149
- name: high
150
- value: 750
151
- globalDefault: false
152
- description: "High"
153
- ---
154
- apiVersion: scheduling.k8s.io/v1
155
- kind: PriorityClass
156
- metadata:
157
- name: very-high
158
- value: 1000
159
- globalDefault: false
160
- description: "Very High"
161
- """
162
-
163
- cluster_preheat_yml = """
164
- apiVersion: apps/v1
165
- kind: DaemonSet
166
- metadata:
167
- name: {cachekey}
168
- labels:
169
- k8s-app: {cachekey}
170
- spec:
171
- selector:
172
- matchLabels:
173
- k8s-app: {cachekey}
174
- updateStrategy:
175
- type: RollingUpdate
176
- template:
177
- metadata:
178
- labels:
179
- name: {cachekey}
180
- k8s-app: {cachekey}
181
- spec:
182
- affinity:
183
- nodeAffinity:
184
- requiredDuringSchedulingIgnoredDuringExecution:
185
- nodeSelectorTerms:
186
- - matchExpressions:
187
- - key: {nodeSelectorKey}
188
- operator: Exists
189
- tolerations:
190
- - operator: "Exists"
191
- containers:
192
- - image: {image_name}
193
- name: {cachekey}
194
- command: [ "sleep", "inf" ]
195
- """
196
-
197
- kueue_controller_manager_yml = """
198
- apiVersion: apps/v1
199
- kind: Deployment
200
- metadata:
201
- labels:
202
- app.kubernetes.io/component: controller
203
- app.kubernetes.io/name: kueue
204
- control-plane: controller-manager
205
- name: kueue-controller-manager
206
- namespace: kueue-system
207
- spec:
208
- replicas: 1
209
- selector:
210
- matchLabels:
211
- control-plane: controller-manager
212
- template:
213
- metadata:
214
- annotations:
215
- kubectl.kubernetes.io/default-container: manager
216
- labels:
217
- app.kubernetes.io/component: controller
218
- app.kubernetes.io/name: kueue
219
- control-plane: controller-manager
220
- spec:
221
- containers:
222
- - args:
223
- - --config=/controller_manager_config.yaml
224
- - --zap-log-level=2
225
- command:
226
- - /manager
227
- image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION}
228
- imagePullPolicy: Always
229
- livenessProbe:
230
- httpGet:
231
- path: /healthz
232
- port: 8081
233
- initialDelaySeconds: 15
234
- periodSeconds: 20
235
- name: manager
236
- ports:
237
- - containerPort: 8082
238
- name: visibility
239
- protocol: TCP
240
- - containerPort: 9443
241
- name: webhook-server
242
- protocol: TCP
243
- readinessProbe:
244
- httpGet:
245
- path: /readyz
246
- port: 8081
247
- initialDelaySeconds: 5
248
- periodSeconds: 10
249
- resources:
250
- limits:
251
- cpu: 1000m
252
- memory: {memory_limit_size}
253
- requests:
254
- cpu: 1000m
255
- memory: 512Mi
256
- securityContext:
257
- allowPrivilegeEscalation: false
258
- volumeMounts:
259
- - mountPath: /visibility
260
- name: visibility
261
- - mountPath: /tmp/k8s-webhook-server/serving-certs
262
- name: cert
263
- readOnly: true
264
- - mountPath: /controller_manager_config.yaml
265
- name: manager-config
266
- subPath: controller_manager_config.yaml
267
- securityContext:
268
- runAsNonRoot: true
269
- serviceAccountName: kueue-controller-manager
270
- terminationGracePeriodSeconds: 10
271
- volumes:
272
- - name: visibility
273
- emptyDir: {{}}
274
- - name: cert
275
- secret:
276
- defaultMode: 420
277
- secretName: kueue-webhook-server-cert
278
- - configMap:
279
- name: kueue-manager-config
280
- name: manager-config
281
- """
282
-
283
-
284
- def verify_kueuectl(args: Namespace) -> None:
285
- """Verify if kueuectl is installed.
286
- Args:
287
- args: user provided arguments.
288
- Returns:
289
- None
290
- """
291
- xpk_print('Verifying kueuectl installation')
292
-
293
- command = 'kubectl kueue version'
294
- task = 'Verify kueuectl installation on cluster'
295
- verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)
296
-
297
- if verify_kueuectl_installed_code == 0:
298
- xpk_print('kueuectl found')
299
-
300
- if verify_kueuectl_installed_code != 0:
301
- xpk_print(
302
- 'kueuectl not found. Please follow'
303
- ' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
304
- ' to install kueuectl.'
305
- )
306
- xpk_exit(verify_kueuectl_installed_code)
307
-
308
-
309
- def delete_multikueueconfigs_definitions(args) -> int:
310
- command = 'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
311
- task = 'Delete multikueueconfigs crds'
312
- return_code = run_command_with_updates_retry(command, task, args)
313
- if return_code != 0:
314
- xpk_print(f'{task} returned ERROR {return_code}')
315
- return return_code
316
-
317
-
318
- def delete_multikueueclusters_definitions(args) -> int:
319
- command = 'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
320
- task = 'Delete multikueueclusters crds'
321
- return_code = run_command_with_updates_retry(command, task, args)
322
- if return_code != 0:
323
- xpk_print(f'{task} returned ERROR {return_code}')
324
- return return_code
325
-
326
-
327
- def get_kueue_version(args) -> tuple[int, str]:
328
- command = 'kubectl kueue version'
329
- task = 'Get kueue version on server'
330
- return_code, val = run_command_for_value(command, task, args)
331
- if return_code != 0:
332
- return return_code, ''
333
- lines = val.splitlines()
334
- if len(lines) == 1:
335
- return 1, ''
336
- server_version_line = lines[1]
337
- manager_image_version = server_version_line.split(':')[-1]
338
- return return_code, manager_image_version
339
-
340
-
341
- def install_kueue_on_cluster(args) -> int:
342
- """Install Kueue on the cluster.
343
-
344
- Args:
345
- args: user provided arguments for running the command.
346
-
347
- Returns:
348
- 0 if successful and 1 otherwise.
349
- """
350
-
351
- err_code, kueue_version_installed = get_kueue_version(args)
352
- if err_code == 0:
353
- if Version(kueue_version_installed) < Version('v0.9.0') and Version(
354
- KUEUE_VERSION
355
- ) >= Version('v0.9.0'):
356
- xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
357
- upgrade_code = delete_multikueueclusters_definitions(args)
358
- if upgrade_code != 0:
359
- return upgrade_code
360
- upgrade_code = delete_multikueueconfigs_definitions(args)
361
- if upgrade_code != 0:
362
- return upgrade_code
363
-
364
- command = (
365
- 'kubectl apply --server-side --force-conflicts -f'
366
- f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
367
- )
368
- task = 'Set Kueue On Cluster'
369
- return_code = run_command_with_updates_retry(command, task, args)
370
- if return_code != 0:
371
- xpk_print(f'{task} returned ERROR {return_code}')
372
- return return_code
373
-
374
-
375
- def wait_for_kueue_available(args: Namespace) -> int:
376
- """Wait for Kueue to be fully available.
377
-
378
- Args:
379
- args: user provided arguments for running the command.
380
-
381
- Returns:
382
- 0 if successful and 1 otherwise.
383
- """
384
- command = (
385
- 'kubectl wait deploy/kueue-controller-manager -nkueue-system'
386
- f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}'
387
- )
388
- task = 'Wait for Kueue to be available'
389
- return_code = run_command_with_updates(command, task, args)
390
- if return_code != 0:
391
- xpk_print(f'{task} returned ERROR {return_code}')
392
- return return_code
393
-
394
-
395
- def install_kueue_crs(
396
- args,
397
- system: SystemCharacteristics,
398
- autoprovisioning_config: AutoprovisioningConfig | None,
399
- flex_with_tpu=False,
400
- ) -> int:
401
- """Install Kueue Custom Resources.
402
-
403
- Args:
404
- args: user provided arguments for running the command.
405
- system: system level arguments.
406
- autoprovisioning_config: Autoprovisioning config to configure kueue with if
407
- autoprovisioning is enabled.
408
-
409
- Returns:
410
- 0 if successful and 1 otherwise.
411
- """
412
- device_type = system.device_type
413
- cluster_hardware_name = f'{args.num_slices}x{device_type}'
414
- resource_type = AcceleratorTypeToAcceleratorCharacteristics[
415
- system.accelerator_type
416
- ].resource_type
417
-
418
- autoprovisioning_enabled = False
419
- if autoprovisioning_config:
420
- # Determine total resources available based on autoprovisioning max chips.
421
- autoprovisioning_enabled = True
422
- total_chips = autoprovisioning_config.maximum_chips
423
- cluster_hardware_name = f'{system.gke_accelerator}'
424
- else:
425
- # Determine total chips based on user specified topology.
426
- total_chips = get_total_chips_requested_from_args(args, system)
427
- if args.flex and flex_with_tpu is False:
428
- admission_checks = """
429
- admissionChecks:
430
- - dws-prov
431
- """
432
- else:
433
- admission_checks = ''
434
-
435
- covered_resources_config = get_kueue_covered_resources_config(
436
- cluster_hardware_name=cluster_hardware_name,
437
- resource_type=resource_type,
438
- total_chips=total_chips,
439
- cpu_limit=args.cpu_limit,
440
- memory_limit=args.memory_limit,
441
- )
442
- topology_label = ''
443
- if system.device_type in [
444
- H100_MEGA_DEVICE_TYPE,
445
- H200_DEVICE_TYPE,
446
- B200_DEVICE_TYPE,
447
- ]:
448
- topology_label = 'topologyName: "gke-default"'
449
- res_type = AcceleratorTypeToAcceleratorCharacteristics[
450
- system.accelerator_type
451
- ].resource_type
452
- yml_string = cluster_set_crd_yaml.format(
453
- system=system,
454
- cluster_hardware_name=cluster_hardware_name,
455
- accelerator_label=create_accelerator_label(
456
- system.accelerator_type, system
457
- ),
458
- machine_label=create_machine_label(
459
- system.accelerator_type, system, autoprovisioning_enabled
460
- ),
461
- topology_label=topology_label,
462
- covered_resources_config=covered_resources_config,
463
- resource_type=res_type,
464
- pw_resource_flavors=add_pw_resource_flavors(args),
465
- pw_resources_kueue=add_pw_resources_to_kueue(args),
466
- admission_checks=admission_checks,
467
- managed_resource=res_type,
468
- cluster_queue_name=CLUSTER_QUEUE_NAME,
469
- local_queue_name=LOCAL_QUEUE_NAME,
470
- )
471
- if system.device_type in [
472
- H100_MEGA_DEVICE_TYPE,
473
- H200_DEVICE_TYPE,
474
- B200_DEVICE_TYPE,
475
- ]:
476
- yml_string = topology_yaml + yml_string
477
-
478
- tmp = write_tmp_file(yml_string)
479
- command = f'kubectl apply -f {str(tmp)}'
480
-
481
- task = 'Applying Kueue Custom Resources'
482
- return_code = run_command_with_updates_retry(command, task, args)
483
- if return_code != 0:
484
- xpk_print(f'{task} returned ERROR {return_code}')
485
- return return_code
486
-
487
-
488
- def get_kueue_covered_resources_config(
489
- cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
490
- ) -> str:
491
- """Gets Kueue covered resources configuration.
492
-
493
- Args:
494
- cluster_hardware_name: cluster hardware name.
495
- resource_type: resource type of tpu or gpu.
496
- total_chips: total number of chips for the specific resource type.
497
-
498
- Returns:
499
- A string of Kueue covered resources configuration.
500
- """
501
- config_format = """
502
- - coveredResources: {resource_types}
503
- flavors:
504
- - name: {cluster_hardware_name}
505
- resources:
506
- - name: "{resource_type}"
507
- nominalQuota: {total_chips}"""
508
- resource_types = [resource_type]
509
- if cpu_limit:
510
- config_format = config_format + """
511
- - name: "cpu"
512
- nominalQuota: {cpu_limit}"""
513
- resource_types.append('cpu')
514
- if memory_limit:
515
- config_format = config_format + """
516
- - name: "memory"
517
- nominalQuota: {memory_limit}"""
518
- resource_types.append('memory')
519
-
520
- config_string = config_format.format(
521
- cluster_hardware_name=cluster_hardware_name,
522
- resource_types=resource_types,
523
- resource_type=resource_type,
524
- total_chips=total_chips,
525
- cpu_limit=cpu_limit,
526
- memory_limit=memory_limit,
527
- )
528
- return config_string
529
-
530
-
531
- def update_kueue_resources_if_necessary(args):
532
- """Update the kueue manifest to increase the resources for the kueue controller manager.
533
-
534
- Args:
535
- args: user provided arguments for running the command.
536
-
537
- Returns:
538
- 0 if successful and 1 otherwise.
539
- """
540
- # Get total number of nodes
541
- cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
542
- return_code, out = run_command_for_value(
543
- cmd_total_node_num, 'Count total nodes', args
544
- )
545
- if return_code != 0:
546
- xpk_exit(1)
547
- # 1.2MiB per VM or 4GiB (whichever is greater).
548
- new_memory_limit = (
549
- f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
550
- )
551
- yml_string = kueue_controller_manager_yml.format(
552
- memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
553
- )
554
- tmp = write_tmp_file(yml_string)
555
- command = f'kubectl apply -f {str(tmp)}'
556
-
557
- task = 'Updating Kueue Controller Manager resources'
558
- return_code = run_command_with_updates_retry(command, task, args)
559
- if return_code != 0:
560
- xpk_print(f'{task} returned ERROR {return_code}')
561
- return return_code
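For reference, the controller-manager memory sizing in update_kueue_resources_if_necessary above budgets 1.2 Mi per node with a 4096 Mi floor. A minimal standalone sketch of that calculation (constants and formula come straight from the deleted module; the helper name is chosen for illustration):

```python
import math

# Constants as defined in the deleted xpk/core/kueue.py above.
MEMORY_SIZE_PER_VM = 1.2      # Mi of kueue-controller-manager memory per node
MIN_MEMORY_LIMIT_SIZE = 4096  # floor of 4096 Mi (4 GiB)

def kueue_controller_memory_limit(node_count: int) -> str:
    """Memory limit applied to the kueue-controller-manager Deployment."""
    return f"{max(math.ceil(node_count * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi"

print(kueue_controller_memory_limit(100))   # 4096Mi (the floor dominates)
print(kueue_controller_memory_limit(5000))  # 6000Mi (1.2 Mi per node)
```

Clusters up to roughly 3,400 nodes stay at the 4096 Mi floor; beyond that the per-node term takes over.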