xpk 0.17.1__py3-none-any.whl → 0.17.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/parser/core.py CHANGED
@@ -23,13 +23,9 @@ from .cluster import set_cluster_parser
23
23
  from .inspector import set_inspector_parser
24
24
  from .storage import set_storage_parser
25
25
  from .workload import set_workload_parsers
26
- from .batch import set_batch_parser
27
- from .job import set_job_parser
28
26
  from .info import set_info_parser
29
27
  from .kind import set_kind_parser
30
- from .shell import set_shell_parser
31
28
  from .version import set_version_parser
32
- from .run import set_run_parser
33
29
 
34
30
 
35
31
  def set_parser(parser: argparse.ArgumentParser):
@@ -54,20 +50,10 @@ def set_parser(parser: argparse.ArgumentParser):
54
50
  "info",
55
51
  help="Commands around listing kueue clusterqueues and localqueues.",
56
52
  )
57
- batch_parser = xpk_subcommands.add_parser(
58
- "batch",
59
- help="commands around running batch job",
60
- )
61
- job_parser = xpk_subcommands.add_parser(
62
- "job", help="commands around listing, cancelling and investigating jobs"
63
- )
64
53
  kind_parser = xpk_subcommands.add_parser(
65
54
  "kind",
66
55
  help="commands around Kind cluster management",
67
56
  )
68
- shell_parser = xpk_subcommands.add_parser(
69
- "shell", help="Commands around configuring and using interactive shell."
70
- )
71
57
  version_parser = xpk_subcommands.add_parser(
72
58
  "version", help="Command to get xpk version"
73
59
  )
@@ -76,11 +62,6 @@ def set_parser(parser: argparse.ArgumentParser):
76
62
  "config", help="Commands to set and retrieve values from xpk config."
77
63
  )
78
64
 
79
- run_parser = xpk_subcommands.add_parser(
80
- "run",
81
- help="Command to run parallel jobs",
82
- )
83
-
84
65
  def default_subcommand_function(
85
66
  _args,
86
67
  ) -> int: # args is unused, so pylint: disable=invalid-name
@@ -96,14 +77,10 @@ def set_parser(parser: argparse.ArgumentParser):
96
77
  parser.print_help()
97
78
  cluster_parser.print_help()
98
79
  workload_parser.print_help()
99
- batch_parser.print_help()
100
80
  info_parser.print_help()
101
- job_parser.print_help()
102
- shell_parser.print_help()
103
81
  version_parser.print_help()
104
82
  kind_parser.print_help()
105
83
  config_parser.print_help()
106
- run_parser.print_help()
107
84
 
108
85
  storage_parser.print_help()
109
86
  return 0
@@ -111,25 +88,17 @@ def set_parser(parser: argparse.ArgumentParser):
111
88
  parser.set_defaults(func=default_subcommand_function)
112
89
  workload_parser.set_defaults(func=default_subcommand_function)
113
90
  cluster_parser.set_defaults(func=default_subcommand_function)
114
- batch_parser.set_defaults(func=default_subcommand_function)
115
91
  info_parser.set_defaults(func=default_subcommand_function)
116
- job_parser.set_defaults(func=default_subcommand_function)
117
92
  kind_parser.set_defaults(func=default_subcommand_function)
118
- shell_parser.set_defaults(func=default_subcommand_function)
119
93
  storage_parser.set_defaults(func=default_subcommand_function)
120
94
  version_parser.set_defaults(func=default_subcommand_function)
121
95
  config_parser.set_defaults(func=default_subcommand_function)
122
- run_parser.set_defaults(func=default_subcommand_function)
123
96
 
124
97
  set_workload_parsers(workload_parser=workload_parser)
125
98
  set_cluster_parser(cluster_parser=cluster_parser)
126
99
  set_inspector_parser(inspector_parser=inspector_parser)
127
- set_batch_parser(batch_parser=batch_parser)
128
100
  set_info_parser(info_parser=info_parser)
129
- set_job_parser(job_parser=job_parser)
130
101
  set_kind_parser(kind_parser=kind_parser)
131
- set_shell_parser(shell_parser=shell_parser)
132
102
  set_storage_parser(storage_parser=storage_parser)
133
103
  set_version_parser(version_parser=version_parser)
134
104
  set_config_parsers(config_parser=config_parser)
135
- set_run_parser(run_parser=run_parser)
xpk/utils/validation.py CHANGED
@@ -37,14 +37,6 @@ class SystemDependency(Enum):
37
37
  ' to install xpk prerequisites.'
38
38
  ),
39
39
  )
40
- KJOB = _SystemDependency(
41
- command='kubectl kjob --help',
42
- message=(
43
- '`kjobctl` not installed. Please follow'
44
- ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
45
- ' to install xpk prerequisites.'
46
- ),
47
- )
48
40
  GCLOUD = _SystemDependency(
49
41
  command='gcloud version',
50
42
  message=(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.17.1
3
+ Version: 0.17.3
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -20,34 +20,29 @@ xpk/blueprints/a4/config-map.yaml.tftpl,sha256=o6LeGIYUfFGyj3vj-8ztV5ildQ46QZVl7
20
20
  xpk/blueprints/a4/nccl-rdma-installer-a4.yaml,sha256=if3WOmNLVGTJIJHU76EWC1FyiIXDTRIXcwo4OsBxarQ,2113
21
21
  xpk/blueprints/a4/storage_crd.yaml,sha256=r4WFXnSJJ25EUF-t4Ljfbl-cJoSaiFiZkP8451eTub4,1260
22
22
  xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
23
- xpk/commands/batch.py,sha256=Cj1bDpzPMoPdhaKKrOJJLJ3JzRvJrCMn8huQoHHIZJI,4192
24
- xpk/commands/cluster.py,sha256=DtMiIYdYsciXldoWqAfxPIxl9Hc9kbYIj2LsdBER0PI,46172
25
- xpk/commands/cluster_gcluster.py,sha256=x26UqoT8RFX5T9ftQXPEL12HMnMFTi8lret16dnZCms,13970
26
- xpk/commands/cluster_gcluster_test.py,sha256=UcqTTkrQv-R753AtsQvinwgI2vqI6lMHPPEfHPS5e-4,6655
27
- xpk/commands/cluster_test.py,sha256=-7EjuOoGSZhdnLBNBNCMKM6laDYy02aPncbSfUYcrUs,24147
23
+ xpk/commands/cluster.py,sha256=5ebvHXe8Bi4haMd1RokJbjP0LemqXxC1EseqOLWIkGw,45659
24
+ xpk/commands/cluster_gcluster.py,sha256=Ig8jLjsiyFgw9U4BBEzDK2diA9m0STKQgz-uUTG_vYE,13731
25
+ xpk/commands/cluster_gcluster_test.py,sha256=s1wwkcdY4LTxmk_Tx5PKdh9pZmEoo1n8XlzybWalc0M,6165
26
+ xpk/commands/cluster_test.py,sha256=aMkwKrhoEuqElME16ztx5lwv4zT0z_xV0L3in1RaW6M,24017
28
27
  xpk/commands/common.py,sha256=p43sspD5RfYRj3Se_b-X0s0dbBs1PMI1qtySg6zZKKg,2706
29
28
  xpk/commands/config.py,sha256=L_zRpQTxMcSh6rxOT8gG263V6YGqzVoz4UxdWywTFdA,850
30
29
  xpk/commands/info.py,sha256=uhv5mPfgg9N-5JhQw4dT2jujL9ZC5kzGA18h9NFfm5A,7429
31
30
  xpk/commands/inspector.py,sha256=FPasKtGuEZKNXIQin4AG49clfD4b53NxXpWqBPZIIoE,12955
32
- xpk/commands/job.py,sha256=rPIfWvgm5mLz7K7YDLK721ZcUcg5OEmYVAPAtRtB5Ag,6718
33
- xpk/commands/kind.py,sha256=GNqsaoLInifFQ_ZGpbN_3xA8ExyeyOqBMdnoPV-PqYI,7813
34
- xpk/commands/kjob_common.py,sha256=bRaORiGVjPAdN0T3aRmbcQgXYe-EtjoVKePdWzQ5xU4,1928
31
+ xpk/commands/kind.py,sha256=ck4zaJh9kaNluEdBew9OVj4cZXjUYk_ElycqGLo6f7g,7355
35
32
  xpk/commands/managed_ml_diagnostics.py,sha256=87wmFbnYQY-kEpJfPo1Up53xM5P_P5wOlXczxHzxJjQ,6984
36
33
  xpk/commands/managed_ml_diagnostics_test.py,sha256=pQ1YUGMGRQFJYTS_1o9YyGUzYdLaBdA84LjbnncaeEo,3828
37
- xpk/commands/run.py,sha256=D0zgmnGeBLATphYhzQj29EScxrMmAKqPRhP6nfWuYcY,4085
38
- xpk/commands/shell.py,sha256=mRHMwm3Izzsue4bocekm82Rg_cPUaGMClSlvNzNXQ-o,4467
39
- xpk/commands/storage.py,sha256=kPViq6mrfGeAJwScdMs_kUJg-QxEO6SrEvyBbXhCzEI,11439
34
+ xpk/commands/storage.py,sha256=cSTJN9Mjvdsvk_Nk43kVdQFhp89nxWbanDsTOGZCkpQ,10708
40
35
  xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
41
36
  xpk/commands/workload.py,sha256=l99NRFLs7pXuaLdn5d-Pid-cZulKpB3FNus-HdNDtZw,31513
42
37
  xpk/commands/workload_test.py,sha256=iXTY7VR1KrlPZZyh1Zm0N946kIP1iV2Fnqx1NtOYDJU,7274
43
38
  xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
44
39
  xpk/core/capacity.py,sha256=_TyWayBkNU8fBpz1LTbCddEFZiZW5Qz-xmJnQMsXh0c,10534
45
40
  xpk/core/capacity_test.py,sha256=jZjMHTYlFLdAmBN1t9k29iABCSE5hlW0--q7QLDQpfQ,4330
46
- xpk/core/cluster.py,sha256=zAv46s-UB2r-I6cSkH7YzwAkGgD8Vxn7dJsXb_CMEQI,24062
41
+ xpk/core/cluster.py,sha256=3nl77I_MgQpBZsZSzsiQ_7IyFRzfLrYNRUL1gsSNhKU,24036
47
42
  xpk/core/cluster_private.py,sha256=RLi0C7bV0NEUXl6QKQzvUT0weN9EdqPvjuuOQsNO0DY,6868
48
43
  xpk/core/cluster_test.py,sha256=J4Wk7E--ik_IsWWzL_iWGWbx99Ih03m-0bs-uU7gGDg,5853
49
44
  xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
50
- xpk/core/config.py,sha256=L3iPFvzFCpW8IEAvlbkuEHYBYXmRTC0BAaR7I_5_Peo,5146
45
+ xpk/core/config.py,sha256=7U8jI5oZcgV_UnOHSS3huUIlDmPNREM-ml0N1Y9IvGM,4612
51
46
  xpk/core/config_test.py,sha256=POSuofK0LFbNNygDAo2fjtKY4NMrRjUFeGcpBh9JOS4,3569
52
47
  xpk/core/docker_container.py,sha256=8hqWWNKtjf6dqCFRpfndTMGvN_NS6zhfBr7YuKfh7qo,7626
53
48
  xpk/core/docker_image.py,sha256=9vwqbb6Mc3C5ZEOph03WS-EWI5hxMYGGigqzIMkDTjE,6909
@@ -59,7 +54,6 @@ xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKg
59
54
  xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
60
55
  xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
61
56
  xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
62
- xpk/core/kjob.py,sha256=Ustta_ygXaacmgb1Av6QW4Epw0S_r-b-tjrMA6uNVj0,14240
63
57
  xpk/core/kueue_manager.py,sha256=JB8DcD-RFvBdC9Mk_DDCAkI2Km8W5-KMTRMVec06LlM,20010
64
58
  xpk/core/kueue_manager_test.py,sha256=FfBd1vninU_fcJ9wZev45-vpEsH12a9-XKysk_h4auo,22008
65
59
  xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
@@ -75,7 +69,7 @@ xpk/core/resources.py,sha256=dDsG_LOtcU17p1UKgOYyjdPxbMfqcb7pJ4SjfLDA6Os,9389
75
69
  xpk/core/scheduling.py,sha256=RMoei_HUs03rfrEC-HYk7ONzg9BRKwr59-KljCR2TMo,11560
76
70
  xpk/core/scheduling_test.py,sha256=iYnzXv_MjN743pa4zYAgRqb-6dB9nVPpLI7JP5S8M2I,14463
77
71
  xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
78
- xpk/core/system_characteristics.py,sha256=Tam8wjUz77E6jAJib-r0GsTBmdjo9uaEkXmIdWuzGO8,32844
72
+ xpk/core/system_characteristics.py,sha256=ZQbTbjaeT3Q12kmobz14U878w3FWnXDCetiLZQlVAdY,32127
79
73
  xpk/core/system_characteristics_test.py,sha256=sREN8u8bC0ze_q9hY3v-ZxC7so-_Ox1mt_DkIbUgHJ4,7477
80
74
  xpk/core/telemetry.py,sha256=R7IONNl5heMoNcOurfT3I34XJrBEODKVY88ONiDGuqE,7512
81
75
  xpk/core/telemetry_test.py,sha256=ll-B1ut9X-por17fpQnNb6hKrfyoZanMWRPbvqWrXss,8261
@@ -100,25 +94,21 @@ xpk/core/testing/__init__.py,sha256=PkV8D9WOtlJHH5AIxsQaKeIBcmupT_Ol_bwJgN6G2I8,
100
94
  xpk/core/testing/commands_tester.py,sha256=mQOSFggESeTdzqG4srAPV9ezmoeT90r22K58yAty9sE,4445
101
95
  xpk/core/testing/commands_tester_test.py,sha256=NnLWh7TJ9rKtb-DtB-vwkxvCe5wNtvUJ0f6sOa87Ht4,4023
102
96
  xpk/core/workload_decorators/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
103
- xpk/core/workload_decorators/rdma_decorator.py,sha256=isbgPnjdu2AT_Da1nVUIRoGE_qZ7jMDOKCgZOLq5r2A,4006
97
+ xpk/core/workload_decorators/rdma_decorator.py,sha256=02HVA_jSyzlVtSQnQj7aPdK03h7v5YyioBqEen6pbj0,3636
104
98
  xpk/core/workload_decorators/storage_decorator.py,sha256=DDYQVO1OKTLhveDOA4V6b2RWr4n0fbwHdnoFFmW7iaQ,2000
105
- xpk/core/workload_decorators/tcpx_decorator.py,sha256=6yvofTv6_XmRfI-nESZjGYeLmGrza1rWxeJGET0TqXU,6182
106
- xpk/core/workload_decorators/tcpx_decorator_test.py,sha256=iTBS3X_-VwA2oveNDjscduLtll0VOJyFRCp4xmsjg7w,8515
107
- xpk/core/workload_decorators/tcpxo_decorator.py,sha256=_nLX7tbnxhnS-xv4Jijd1JOP76V4LpNCfW3Np404Cqw,6537
99
+ xpk/core/workload_decorators/tcpx_decorator.py,sha256=cLOntH2ekBcPeiPW0sU3TRozSCpcTxgxpzncrMbRj44,5962
100
+ xpk/core/workload_decorators/tcpx_decorator_test.py,sha256=BmTWsFoBeLb9xhQh3kpqSiarkYax4bj2wLeZ9GrQzag,6089
101
+ xpk/core/workload_decorators/tcpxo_decorator.py,sha256=5SgL-7aTHclN7rvCGvEOjZoUixBmyjfuhVIUBFmneug,6124
108
102
  xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
109
- xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
110
103
  xpk/parser/cluster.py,sha256=U2T-Q4yS86PWeFLNfknYWDDzZfubCKqIhqasxKLmErI,31342
111
104
  xpk/parser/cluster_test.py,sha256=xzQEC3IeAMpwsbNbHLuaNKxR3iaZcm3z4m3i61G62d4,6581
112
- xpk/parser/common.py,sha256=w6u6rqCOO23572C99PV1N8Fsp-vTP0C7Kv5tdWGEQO8,11691
105
+ xpk/parser/common.py,sha256=sJYGjrn2YgFxelDCYB18s1R8Md8GpDcMQNoAezxDDIs,7257
113
106
  xpk/parser/common_test.py,sha256=_6Fm2pUF7h4K0G5qxGabXSYr4ng9ihOzlViE6oLQwQs,1557
114
107
  xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
115
- xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
108
+ xpk/parser/core.py,sha256=P2Dx3AbTlDoWnCCrMhVdr3Fs5FEzYFlmSiugkun1GL0,3623
116
109
  xpk/parser/info.py,sha256=UJohxVVWdt9IgUXoPsrVae2DN1BjAVGWrSN2ajrB8RQ,1860
117
110
  xpk/parser/inspector.py,sha256=hAPAZ2k9iSJgC1mjnz3rMleInsAQ8PmkyyUKFyBmsgY,1997
118
- xpk/parser/job.py,sha256=5RdE70rucGfrsn65l7Ho6RmO06mag1S0AO-3saVuXyw,4328
119
111
  xpk/parser/kind.py,sha256=sgPCqNVrgmFLcOBEbhlaphwVXxMh_opP9ntCq4KPePE,2682
120
- xpk/parser/run.py,sha256=oi_ksSyJ8Ooffe2EgoV_ecpmXEmNGVotjpIQH-HjufE,1481
121
- xpk/parser/shell.py,sha256=VC8p-kz9XjJZW9DXZ-rnv41XnRDRpQRFywHpB5j7tfc,1970
122
112
  xpk/parser/storage.py,sha256=0V1d1htsjoa-SuxOX_vNxz2Lg4Nue9CBe_H0bNS2Hv0,10270
123
113
  xpk/parser/storage_test.py,sha256=i_F9cuQXHRvUy4RJwbfuuI8ZVpTpkkY96sZ1GZ4dLPw,1494
124
114
  xpk/parser/validators.py,sha256=-NBZelvfwZRzjz-YUCreD8EzMLHll8PZM-d-MVm2PG4,1192
@@ -139,7 +129,6 @@ xpk/templates/kueue_sub_slicing_topology.yaml.j2,sha256=UXjpRFqCIcoebwcMeD9Lo4fe
139
129
  xpk/templates/kueue_super_slicing_topology.yaml.j2,sha256=4WkSfQ2A5-jnKWiHWj2WXlv4sQmAcfxzbJCW-cWUE8E,264
140
130
  xpk/templates/mtc-cpc.yaml,sha256=MPx75tog09kjRAvHoNOPCEobigQ17d7pYCUnZCevSDQ,340
141
131
  xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
142
- xpk/templates/volume_bundle.yaml,sha256=sqeag7GPWqGNQ5doZtO9IVAX_vKYRO73-aBE7waEtSY,129
143
132
  xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
144
133
  xpk/utils/console.py,sha256=AJWSyjuWyLjb7SYt8kPb0gw9N84EN9LbLxYCXjC-6Ds,2464
145
134
  xpk/utils/console_test.py,sha256=x1v7v9VrIZwAKH-eOzj1lAY4EsHxJ6ruhfEOzpssO6o,2944
@@ -158,13 +147,13 @@ xpk/utils/user_agent.py,sha256=1NMtixC1RIr_MwM5pJ0THQ0x1-fCQA92TFHjWAVZldw,1083
158
147
  xpk/utils/user_agent_test.py,sha256=lkv8LqzhlA1gXFVeBzoLwE1_iGnm8G9LzkkElMrIrx0,1774
159
148
  xpk/utils/user_input.py,sha256=kMdCcPWdkI31f1mJcMsNGda-xKyKxEerpSLpCqIWYPc,1503
160
149
  xpk/utils/user_input_test.py,sha256=xO34jkMoTAk5Cmw7yHTk-7YexzC2UZ6ajihV8lnlAyI,2666
161
- xpk/utils/validation.py,sha256=irL9579RbvwxiGn1t3zhhPo-0oHgdUPOSYsUuFqsDSM,3039
150
+ xpk/utils/validation.py,sha256=rE9LTkXJT7jIesodFb9pONL7ixhLqiQleyoaz7N39Dw,2765
162
151
  xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
163
152
  xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
164
153
  xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
165
- xpk-0.17.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
166
- xpk-0.17.1.dist-info/METADATA,sha256=7g7GYCSOZ1MjXNwXaMHto0yLF-MV2A2zwdfm98Qw2Eo,7930
167
- xpk-0.17.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
168
- xpk-0.17.1.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
169
- xpk-0.17.1.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
170
- xpk-0.17.1.dist-info/RECORD,,
154
+ xpk-0.17.3.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
155
+ xpk-0.17.3.dist-info/METADATA,sha256=ONK-6JpzJboT0wF60svxA4SKJTSqeQ4KNfgSvZ_kkDY,7930
156
+ xpk-0.17.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
157
+ xpk-0.17.3.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
158
+ xpk-0.17.3.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
159
+ xpk-0.17.3.dist-info/RECORD,,
xpk/commands/batch.py DELETED
@@ -1,144 +0,0 @@
1
- """
2
- Copyright 2024 Google LLC
3
-
4
- Licensed under the Apache License, Version 2.0 (the "License");
5
- you may not use this file except in compliance with the License.
6
- You may obtain a copy of the License at
7
-
8
- https://www.apache.org/licenses/LICENSE-2.0
9
-
10
- Unless required by applicable law or agreed to in writing, software
11
- distributed under the License is distributed on an "AS IS" BASIS,
12
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- See the License for the specific language governing permissions and
14
- limitations under the License.
15
- """
16
-
17
- import re
18
- from argparse import Namespace
19
-
20
- from ..core.cluster import (
21
- setup_k8s_service_accounts,
22
- get_cluster_credentials,
23
- )
24
- from ..core.commands import run_command_for_value
25
- from ..core.gcloud_context import add_zone_and_project
26
- from ..core.kjob import (
27
- AppProfileDefaults,
28
- JobTemplateDefaults,
29
- get_storage_annotations,
30
- prepare_kjob,
31
- )
32
- from ..core.kueue_manager import LOCAL_QUEUE_NAME
33
- from ..utils.console import xpk_exit, xpk_print
34
- from ..utils.execution_context import is_dry_run
35
- from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
36
- from .kind import set_local_cluster_command
37
- from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
38
-
39
-
40
- def batch(args: Namespace) -> None:
41
- """Run batch task.
42
- This function runs passed script in non-blocking manner.
43
- Args:
44
- args: user provided arguments for running the command.
45
- Returns:
46
- None
47
- """
48
- if should_validate_dependencies(args):
49
- validate_dependencies_list([
50
- SystemDependency.KUBECTL,
51
- SystemDependency.KJOB,
52
- SystemDependency.GCLOUD,
53
- ])
54
- if not args.kind_cluster:
55
- add_zone_and_project(args)
56
- get_cluster_credentials(args)
57
- else:
58
- set_cluster_command_code = set_local_cluster_command(args)
59
- if set_cluster_command_code != 0:
60
- xpk_exit(set_cluster_command_code)
61
-
62
- if not is_dry_run():
63
- err_code = prepare_kjob(args)
64
- if err_code > 0:
65
- xpk_exit(err_code)
66
- setup_k8s_service_accounts()
67
-
68
- submit_job(args)
69
-
70
-
71
- def submit_job(args: Namespace) -> None:
72
- cmd = (
73
- 'kubectl kjob create slurm'
74
- f' --profile {AppProfileDefaults.NAME.value}'
75
- f' --localqueue {LOCAL_QUEUE_NAME}'
76
- f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
77
- ' --first-node-ip'
78
- )
79
- cmd = add_gpu_networking_annotations_to_command(args, cmd)
80
- cmd = add_TAS_annotations_to_command(args, cmd)
81
-
82
- annotations = [] if is_dry_run() else get_storage_annotations(args)
83
- for annotation in annotations:
84
- cmd += f' --pod-template-annotation {annotation}'
85
-
86
- if args.ignore_unknown_flags:
87
- cmd += ' --ignore-unknown-flags'
88
-
89
- cmd += f' -- {args.script} --partition {LOCAL_QUEUE_NAME}'
90
-
91
- if args.array is not None:
92
- cmd += f' --array {args.array}'
93
-
94
- if args.cpus_per_task is not None:
95
- cmd += f' --cpus-per-task {args.cpus_per_task}'
96
-
97
- if args.gpus_per_task is not None:
98
- cmd += f' --gpus-per-task {args.gpus_per_task}'
99
-
100
- if args.mem is not None:
101
- cmd += f' --mem {args.mem}'
102
-
103
- if args.mem_per_task is not None:
104
- cmd += f' --mem-per-task {args.mem_per_task}'
105
-
106
- if args.mem_per_cpu is not None:
107
- cmd += f' --mem-per-cpu {args.mem_per_cpu}'
108
-
109
- if args.mem_per_gpu is not None:
110
- cmd += f' --mem-per-gpu {args.mem_per_gpu}'
111
-
112
- if args.nodes is not None:
113
- cmd += f' --nodes {args.nodes}'
114
-
115
- if args.ntasks is not None:
116
- cmd += f' --ntasks {args.ntasks}'
117
-
118
- if args.output is not None:
119
- cmd += f' --output {args.output}'
120
-
121
- if args.error is not None:
122
- cmd += f' --error {args.error}'
123
-
124
- if args.input is not None:
125
- cmd += f' --input {args.input}'
126
-
127
- if args.job_name is not None:
128
- cmd += f' --job-name {args.job_name}'
129
-
130
- if args.chdir is not None:
131
- cmd += f' --chdir {args.chdir}'
132
-
133
- if args.time is not None:
134
- cmd += f' --time {args.time}'
135
-
136
- return_code, return_value = run_command_for_value(cmd, 'submit job')
137
-
138
- if return_code != 0:
139
- xpk_print(f'Running batch job returned ERROR {return_code}')
140
- xpk_exit(return_code)
141
-
142
- m = re.match(r'job\.batch/([-a-z0-9]+)', return_value)
143
- if m:
144
- xpk_print(f'Job name: {m.group(1)}')
xpk/commands/job.py DELETED
@@ -1,244 +0,0 @@
1
- """
2
- Copyright 2024 Google LLC
3
-
4
- Licensed under the Apache License, Version 2.0 (the "License");
5
- you may not use this file except in compliance with the License.
6
- You may obtain a copy of the License at
7
-
8
- https://www.apache.org/licenses/LICENSE-2.0
9
-
10
- Unless required by applicable law or agreed to in writing, software
11
- distributed under the License is distributed on an "AS IS" BASIS,
12
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- See the License for the specific language governing permissions and
14
- limitations under the License.
15
- """
16
-
17
- import re
18
- import sys
19
-
20
- from ruamel.yaml import YAML
21
- from typing import cast
22
-
23
- from ..core.commands import run_command_for_value, run_command_with_updates
24
- from ..core.cluster import get_cluster_credentials
25
- from ..core.gcloud_context import add_zone_and_project
26
- from ..core.kjob import AppProfileDefaults
27
- from ..utils.console import xpk_exit, xpk_print
28
- from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
29
- from .kind import set_local_cluster_command
30
-
31
-
32
- JOBS_DRY_RUN_YAML = """
33
- items:
34
- - apiVersion: slurm.k8s.io/v1alpha1
35
- kind: SlurmJob
36
- metadata:
37
- annotations:
38
- kjobctl.x-k8s.io/script: echo hello
39
- creationTimestamp: '2024-04-29T12:00:00Z'
40
- labels:
41
- kjobctl.x-k8s.io/app-profile: default
42
- name: golden-job
43
- namespace: default
44
- spec:
45
- script: echo hello
46
- """
47
-
48
- PODS_DRY_RUN_RESULT = """
49
- foo-pod 2/2 Running 0 2d
50
- bar-pod 1/1 Evicted 0 1d
51
- """
52
-
53
-
54
- def job_info(args):
55
- """Run commands obtaining information about a job given by name.
56
-
57
- Args:
58
- args: user provided arguments for running the command.
59
-
60
- Returns:
61
- None
62
- """
63
- if should_validate_dependencies(args):
64
- validate_dependencies_list([
65
- SystemDependency.KUBECTL,
66
- SystemDependency.KJOB,
67
- SystemDependency.GCLOUD,
68
- ])
69
- job_name = args.name
70
-
71
- desc_command = f'kubectl-kjob describe slurm {job_name}'
72
- desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data')
73
- if desc_code != 0:
74
- xpk_print(f'Data info request returned ERROR {desc_code}')
75
- xpk_exit(desc_code)
76
-
77
- job_command = (
78
- 'kubectl-kjob list slurm -o yaml --field-selector'
79
- f' metadata.name=={job_name}'
80
- )
81
- job_code, job_text = run_command_for_value(
82
- job_command,
83
- 'Getting job info',
84
- dry_run_return_val=JOBS_DRY_RUN_YAML,
85
- )
86
- if job_code != 0:
87
- xpk_print(f'Job info request returned ERROR {job_code}')
88
- xpk_exit(job_code)
89
-
90
- pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
91
- pods_code, pods_text = run_command_for_value(
92
- pods_command,
93
- 'Getting pods list',
94
- dry_run_return_val=PODS_DRY_RUN_RESULT,
95
- )
96
- if pods_code != 0:
97
- xpk_print(f'Pods list request returned ERROR {pods_code}')
98
- xpk_exit(pods_code)
99
-
100
- yaml = YAML(typ='safe')
101
- job_yaml = yaml.load(job_text)['items'][0]
102
-
103
- output = {
104
- 'Job name': job_name,
105
- 'Script name': get_script_name(job_yaml),
106
- 'Profile': get_profile(job_yaml),
107
- 'Labels': job_yaml.get('metadata').get('labels', []),
108
- 'Mounts': get_mounts(job_yaml),
109
- 'Pods': get_pods(pods_text),
110
- 'Entrypoint environment variables template': get_kjob_env_vars(desc_text),
111
- }
112
-
113
- yaml.default_flow_style = False
114
- yaml.sort_base_mapping_type_on_output = False
115
- yaml.dump(output, sys.stdout)
116
-
117
-
118
- def get_profile(job_yaml: dict) -> str:
119
- containers: list[dict] = (
120
- job_yaml.get('spec', {})
121
- .get('template', {})
122
- .get('spec', {})
123
- .get('containers', [])
124
- )
125
- env_vars = next(iter(containers), {}).get('env', [])
126
- profile = next((x['value'] for x in env_vars if x['name'] == 'PROFILE'), '')
127
- return profile
128
-
129
-
130
- def get_mounts(job_yaml: dict) -> list[dict]:
131
- containers: list[dict] = (
132
- job_yaml.get('spec', {})
133
- .get('template', {})
134
- .get('spec', {})
135
- .get('containers', [])
136
- )
137
- mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
138
- return mounts
139
-
140
-
141
- def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
142
- regex = r'(SLURM_[A-Z_]*=.*)'
143
- search_res = re.findall(regex, job_desc_text)
144
- return search_res
145
-
146
-
147
- def get_pods(pods_text: str) -> list[dict[str, str]]:
148
- pods_lines = pods_text.strip().split('\n')
149
- pods_lines_tokenized = [line.split() for line in pods_lines]
150
- return [
151
- {
152
- 'Name': tokens[0],
153
- 'Status': tokens[2],
154
- }
155
- for tokens in pods_lines_tokenized
156
- ]
157
-
158
-
159
- def get_script_name(job_yaml: dict) -> str | None:
160
- return cast(
161
- str | None,
162
- job_yaml.get('metadata', {})
163
- .get('annotations', {})
164
- .get('kjobctl.x-k8s.io/script', ''),
165
- )
166
-
167
-
168
- def job_list(args) -> None:
169
- """Function around job list.
170
-
171
- Args:
172
- args: user provided arguments for running the command.
173
-
174
- Returns:
175
- None
176
- """
177
- if should_validate_dependencies(args):
178
- validate_dependencies_list([
179
- SystemDependency.KUBECTL,
180
- SystemDependency.KJOB,
181
- SystemDependency.GCLOUD,
182
- ])
183
- if not args.kind_cluster:
184
- add_zone_and_project(args)
185
- get_cluster_credentials(args)
186
- msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
187
- else:
188
- set_cluster_command_code = set_local_cluster_command(args)
189
- msg = 'Listing jobs:'
190
- if set_cluster_command_code != 0:
191
- xpk_exit(set_cluster_command_code)
192
-
193
- xpk_print(msg, flush=True)
194
-
195
- return_code = run_slurm_job_list_command()
196
- xpk_exit(return_code)
197
-
198
-
199
- def run_slurm_job_list_command() -> int:
200
- cmd = f'kubectl-kjob list slurm --profile {AppProfileDefaults.NAME.value}'
201
-
202
- return_code = run_command_with_updates(cmd, 'list jobs')
203
- if return_code != 0:
204
- xpk_print(f'Listing jobs returned ERROR {return_code}')
205
- return return_code
206
-
207
-
208
- def job_cancel(args) -> None:
209
- """Function around job cancel.
210
-
211
- Args:
212
- args: user provided arguments for running the command.
213
-
214
- Returns:
215
- None
216
- """
217
- if should_validate_dependencies(args):
218
- validate_dependencies_list([
219
- SystemDependency.KUBECTL,
220
- SystemDependency.KJOB,
221
- SystemDependency.GCLOUD,
222
- ])
223
-
224
- xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
225
- if not args.kind_cluster:
226
- add_zone_and_project(args)
227
- get_cluster_credentials(args)
228
- else:
229
- set_cluster_command_code = set_local_cluster_command(args)
230
- if set_cluster_command_code != 0:
231
- xpk_exit(set_cluster_command_code)
232
-
233
- return_code = run_slurm_job_delete_command(args)
234
- xpk_exit(return_code)
235
-
236
-
237
- def run_slurm_job_delete_command(args) -> int:
238
- list_of_jobs = ' '.join(args.name)
239
- cmd = f'kubectl-kjob delete slurm {list_of_jobs}'
240
-
241
- return_code = run_command_with_updates(cmd, 'delete job')
242
- if return_code != 0:
243
- xpk_print(f'Delete job request returned ERROR {return_code}')
244
- return return_code
@@ -1,60 +0,0 @@
1
- """
2
- Copyright 2025 Google LLC
3
-
4
- Licensed under the Apache License, Version 2.0 (the "License");
5
- you may not use this file except in compliance with the License.
6
- You may obtain a copy of the License at
7
-
8
- https://www.apache.org/licenses/LICENSE-2.0
9
-
10
- Unless required by applicable law or agreed to in writing, software
11
- distributed under the License is distributed on an "AS IS" BASIS,
12
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- See the License for the specific language governing permissions and
14
- limitations under the License.
15
- """
16
-
17
- from ..core.capacity import (
18
- B200_DEVICE_TYPE,
19
- H100_MEGA_DEVICE_TYPE,
20
- H200_DEVICE_TYPE,
21
- )
22
- from ..core.cluster import get_gpu_type_from_cluster
23
- from ..core.kjob import (
24
- get_a3mega_pod_template_annotations,
25
- get_a3ultra_pod_template_annotations,
26
- get_a4_pod_template_annotations,
27
- Kueue_TAS_annotation,
28
- )
29
- from .common import is_TAS_possible
30
- from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
31
-
32
-
33
- def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
34
- gpu_type = get_gpu_type_from_cluster(args)
35
-
36
- annotations: tuple
37
- if gpu_type == H100_MEGA_DEVICE_TYPE:
38
- annotations = get_a3mega_pod_template_annotations()
39
- elif gpu_type == H200_DEVICE_TYPE:
40
- annotations = get_a3ultra_pod_template_annotations()
41
- elif gpu_type == B200_DEVICE_TYPE:
42
- annotations = get_a4_pod_template_annotations()
43
- else:
44
- annotations = tuple()
45
-
46
- flags = [
47
- f" --pod-template-annotation {annotation} " for annotation in annotations
48
- ]
49
- cmd += "\\\n".join(flags)
50
-
51
- return cmd
52
-
53
-
54
- def add_TAS_annotations_to_command(args, cmd: str) -> str:
55
- system_characteristics = get_cluster_system_characteristics(args)
56
- capacity_type = get_cluster_capacity_type(args)
57
- if is_TAS_possible(system_characteristics, capacity_type):
58
- cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
59
-
60
- return cmd