xpk 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. integration/gcluster_a3mega_test.py +11 -0
  2. integration/gcluster_a3ultra_test.py +11 -0
  3. integration/gcluster_a4_test.py +11 -0
  4. xpk/commands/cluster.py +57 -21
  5. xpk/commands/cluster_gcluster.py +25 -5
  6. xpk/commands/cluster_gcluster_test.py +11 -2
  7. xpk/commands/cluster_test.py +233 -12
  8. xpk/commands/config.py +3 -5
  9. xpk/commands/kind.py +1 -1
  10. xpk/commands/storage.py +8 -10
  11. xpk/commands/workload.py +28 -12
  12. xpk/commands/workload_test.py +3 -3
  13. xpk/core/blueprint/blueprint_generator.py +70 -33
  14. xpk/core/blueprint/blueprint_test.py +9 -0
  15. xpk/core/capacity.py +46 -8
  16. xpk/core/capacity_test.py +32 -1
  17. xpk/core/cluster.py +37 -57
  18. xpk/core/cluster_test.py +95 -0
  19. xpk/core/commands.py +4 -10
  20. xpk/core/config.py +9 -2
  21. xpk/core/gcloud_context.py +18 -12
  22. xpk/core/gcloud_context_test.py +111 -1
  23. xpk/core/kjob.py +6 -9
  24. xpk/core/kueue_manager.py +192 -32
  25. xpk/core/kueue_manager_test.py +132 -4
  26. xpk/core/nodepool.py +21 -29
  27. xpk/core/nodepool_test.py +17 -15
  28. xpk/core/scheduling.py +16 -1
  29. xpk/core/scheduling_test.py +85 -6
  30. xpk/core/system_characteristics.py +77 -19
  31. xpk/core/system_characteristics_test.py +80 -5
  32. xpk/core/telemetry.py +263 -0
  33. xpk/core/telemetry_test.py +211 -0
  34. xpk/main.py +31 -13
  35. xpk/parser/cluster.py +48 -9
  36. xpk/parser/cluster_test.py +42 -3
  37. xpk/parser/workload.py +12 -0
  38. xpk/parser/workload_test.py +4 -4
  39. xpk/telemetry_uploader.py +29 -0
  40. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  41. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  42. xpk/utils/console.py +41 -10
  43. xpk/utils/console_test.py +106 -0
  44. xpk/utils/feature_flags.py +7 -1
  45. xpk/utils/file.py +4 -1
  46. xpk/utils/topology.py +4 -0
  47. xpk/utils/user_agent.py +35 -0
  48. xpk/utils/user_agent_test.py +44 -0
  49. xpk/utils/user_input.py +48 -0
  50. xpk/utils/user_input_test.py +92 -0
  51. xpk/utils/validation.py +0 -11
  52. xpk/utils/versions.py +31 -0
  53. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
  54. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
  55. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
  56. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
  57. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
  58. {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
xpk/core/telemetry_test.py ADDED
@@ -0,0 +1,211 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import pytest
18
+ import json
19
+ from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
20
+ from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
21
+ from ..utils.execution_context import set_dry_run
22
+ from ..utils.feature_flags import FeatureFlags
23
+ from pytest_mock import MockerFixture
24
+
25
+
26
+ @pytest.fixture(autouse=True)
27
+ def setup_mocks(mocker: MockerFixture):
28
+ mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
29
+ mocker.patch('time.time', return_value=0)
30
+ mocker.patch('platform.python_version', return_value='99.99.99')
31
+ mocker.patch('os.path.basename', return_value='xpk.py')
32
+ mocker.patch('os.path.abspath', return_value='/home/xpk_user')
33
+ set_dry_run(False)
34
+ xpk_config.set(CLIENT_ID_KEY, 'client_id')
35
+ yield
36
+ xpk_config.set(CLIENT_ID_KEY, None)
37
+
38
+
39
+ @pytest.mark.parametrize(
40
+ argnames='feature_flag,config_value,expected',
41
+ argvalues=[
42
+ (True, 'true', True),
43
+ (False, 'true', False),
44
+ (True, None, True),
45
+ (True, 'false', False),
46
+ ],
47
+ )
48
+ def test_should_send_telemetry_returns_correct_value(
49
+ feature_flag: bool, config_value: str, expected: bool
50
+ ):
51
+ xpk_config.set(SEND_TELEMETRY_KEY, config_value)
52
+ FeatureFlags.TELEMETRY_ENABLED = feature_flag
53
+ assert should_send_telemetry() is expected
54
+
55
+
56
+ def test_metrics_collector_generates_client_id_if_not_present():
57
+ xpk_config.set(CLIENT_ID_KEY, None)
58
+ MetricsCollector.log_start(command='test')
59
+ payload = json.loads(MetricsCollector.flush())
60
+ extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
61
+ assert extension_json['client_install_id'] is not None
62
+ assert len(extension_json['client_install_id']) > 0
63
+
64
+
65
+ def test_metrics_collector_logs_start_event_correctly():
66
+ MetricsCollector.log_start(command='test')
67
+ payload = json.loads(MetricsCollector.flush())
68
+ extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
69
+ assert extension_json == {
70
+ 'client_install_id': 'client_id',
71
+ 'console_type': 'XPK',
72
+ 'event_metadata': [
73
+ {'key': 'XPK_SESSION_ID', 'value': '321231'},
74
+ {'key': 'XPK_DRY_RUN', 'value': 'false'},
75
+ {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
76
+ {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
77
+ {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
78
+ {'key': 'XPK_COMMAND', 'value': 'test'},
79
+ ],
80
+ 'event_name': 'start',
81
+ 'event_type': 'commands',
82
+ 'release_version': 'v0.15.0',
83
+ }
84
+
85
+
86
+ def test_metrics_collector_generates_client_id_when_not_present():
87
+ xpk_config.set(CLIENT_ID_KEY, None)
88
+ MetricsCollector.log_start(command='test')
89
+ payload = json.loads(MetricsCollector.flush())
90
+ extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
91
+ assert extension_json['client_install_id'] is not None
92
+ assert len(extension_json['client_install_id']) > 0
93
+
94
+
95
+ def test_metrics_collector_logs_complete_event_correctly():
96
+ MetricsCollector.log_complete(exit_code=2)
97
+ payload = json.loads(MetricsCollector.flush())
98
+ extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
99
+ assert extension_json == {
100
+ 'client_install_id': 'client_id',
101
+ 'console_type': 'XPK',
102
+ 'event_metadata': [
103
+ {'key': 'XPK_SESSION_ID', 'value': '321231'},
104
+ {'key': 'XPK_DRY_RUN', 'value': 'false'},
105
+ {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
106
+ {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
107
+ {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
108
+ {'key': 'XPK_EXIT_CODE', 'value': '2'},
109
+ ],
110
+ 'event_name': 'complete',
111
+ 'event_type': 'commands',
112
+ 'release_version': 'v0.15.0',
113
+ }
114
+
115
+
116
+ def test_metrics_collector_logs_custom_event_correctly():
117
+ MetricsCollector.log_custom(
118
+ name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
119
+ )
120
+ payload = json.loads(MetricsCollector.flush())
121
+ extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
122
+ assert extension_json == {
123
+ 'client_install_id': 'client_id',
124
+ 'console_type': 'XPK',
125
+ 'event_metadata': [
126
+ {'key': 'XPK_SESSION_ID', 'value': '321231'},
127
+ {'key': 'XPK_DRY_RUN', 'value': 'false'},
128
+ {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
129
+ {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
130
+ {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
131
+ {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
132
+ ],
133
+ 'event_name': 'test',
134
+ 'event_type': 'custom',
135
+ 'release_version': 'v0.15.0',
136
+ }
137
+
138
+
139
+ def test_metrics_collector_logs_correct_envelope():
140
+ MetricsCollector.log_start(command='test')
141
+ MetricsCollector.log_custom(
142
+ name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
143
+ )
144
+ MetricsCollector.log_complete(exit_code=2)
145
+ payload = json.loads(MetricsCollector.flush())
146
+ assert payload['client_info'] == {'client_type': 'XPK'}
147
+ assert payload['log_source_name'] == 'CONCORD'
148
+ assert payload['request_time_ms'] == 0
149
+ assert len(payload['log_event']) == 3
150
+
151
+
152
+ def test_metrics_collector_does_not_flush_event_twice():
153
+ MetricsCollector.log_start(command='test')
154
+ MetricsCollector.flush()
155
+ MetricsCollector.log_start(command='version')
156
+ payload = json.loads(MetricsCollector.flush())
157
+ assert len(payload['log_event']) == 1
158
+
159
+
160
+ @pytest.mark.parametrize(
161
+ argnames='dry_run,expected', argvalues=[(False, 'false'), (True, 'true')]
162
+ )
163
+ def test_metrics_collector_logs_correct_dry_run_value(
164
+ dry_run: bool, expected: str
165
+ ):
166
+ set_dry_run(dry_run)
167
+ MetricsCollector.log_start(command='test')
168
+ payload = MetricsCollector.flush()
169
+ assert _get_metadata_value(payload, 'XPK_DRY_RUN') == expected
170
+
171
+
172
+ @pytest.mark.parametrize(
173
+ argnames='basename,expected',
174
+ argvalues=[
175
+ ('xpk', 'true'),
176
+ ('xpk.py', 'false'),
177
+ ],
178
+ )
179
+ def test_metrics_collectors_logs_correct_running_as_pip_value(
180
+ basename: str, expected: str, mocker: MockerFixture
181
+ ):
182
+ mocker.patch('os.path.basename', return_value=basename)
183
+ MetricsCollector.log_start(command='test')
184
+ payload = MetricsCollector.flush()
185
+ assert _get_metadata_value(payload, 'XPK_RUNNING_AS_PIP') == expected
186
+
187
+
188
+ @pytest.mark.parametrize(
189
+ argnames='abspath,expected',
190
+ argvalues=[
191
+ ('/site-packages/', 'false'),
192
+ ('/dist-packages/', 'false'),
193
+ ('/home/xpk_user', 'true'),
194
+ ],
195
+ )
196
+ def test_metrics_collectors_logs_correct_running_from_source_value(
197
+ abspath: str, expected: str, mocker: MockerFixture
198
+ ):
199
+ mocker.patch('os.path.abspath', return_value=abspath)
200
+ MetricsCollector.log_start(command='test')
201
+ payload = MetricsCollector.flush()
202
+ assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
203
+
204
+
205
+ def _get_metadata_value(payload_str: str, key: str) -> str | None:
206
+ payload = json.loads(payload_str)
207
+ metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
208
+ 'event_metadata'
209
+ ]
210
+ matching = (item['value'] for item in metadata if item['key'] == key)
211
+ return next(matching, None)
xpk/main.py CHANGED
@@ -32,11 +32,13 @@ Next Steps:
32
32
  """
33
33
 
34
34
  import argparse
35
+ import argcomplete
35
36
  import sys
36
37
 
37
38
  from .parser.core import set_parser
38
39
  from .core.updates import print_xpk_hello
39
- from .utils.console import xpk_print
40
+ from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
41
+ from .utils.console import xpk_print, exit_code_to_int
40
42
  from .utils.execution_context import set_context
41
43
  ################### Compatibility Check ###################
42
44
  # Check that the user runs the below version or greater.
@@ -59,19 +61,35 @@ if (
59
61
 
60
62
 
61
63
  def main() -> None:
62
- # Create top level parser for xpk command.
63
- parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
64
- set_parser(parser=parser)
64
+ try:
65
+ # Create top level parser for xpk command.
66
+ parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
67
+ set_parser(parser=parser)
68
+ argcomplete.autocomplete(parser)
65
69
 
66
- main_args = parser.parse_args()
67
- main_args.enable_ray_cluster = False
68
- set_context(
69
- dry_run_value='dry_run' in main_args and main_args.dry_run,
70
- quiet_value='quiet' in main_args and main_args.quiet,
71
- )
72
- print_xpk_hello()
73
- main_args.func(main_args)
74
- xpk_print('XPK Done.', flush=True)
70
+ main_args = parser.parse_args()
71
+ main_args.enable_ray_cluster = False
72
+ set_context(
73
+ dry_run_value='dry_run' in main_args and main_args.dry_run,
74
+ quiet_value=(
75
+ ('quiet' in main_args and main_args.quiet)
76
+ or ('force' in main_args and main_args.force)
77
+ ),
78
+ )
79
+ MetricsCollector.log_start(main_args.xpk_subcommands)
80
+ print_xpk_hello()
81
+ main_args.func(main_args)
82
+ xpk_print('XPK Done.', flush=True)
83
+ MetricsCollector.log_complete(0)
84
+ except SystemExit as e:
85
+ MetricsCollector.log_complete(exit_code_to_int(e.code))
86
+ raise
87
+ except:
88
+ MetricsCollector.log_complete(-1)
89
+ raise
90
+ finally:
91
+ if should_send_telemetry():
92
+ send_clearcut_payload(MetricsCollector.flush())
75
93
 
76
94
 
77
95
  if __name__ == '__main__':
xpk/parser/cluster.py CHANGED
@@ -26,7 +26,8 @@ from ..commands.cluster import (
26
26
  cluster_describe,
27
27
  cluster_list,
28
28
  )
29
- from ..commands.config import xpk_cfg
29
+ from ..core.config import xpk_config
30
+ from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
30
31
  from ..core.config import CFG_BUCKET_KEY
31
32
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
32
33
  from .common import add_shared_arguments, ParserOrArgumentGroup
@@ -103,6 +104,10 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
103
104
  type=str,
104
105
  default=None,
105
106
  help='The tpu type to use, v5litepod-16, etc.',
107
+ metavar='TPU_TYPE',
108
+ choices=get_system_characteristics_keys_by_accelerator_type(
109
+ [AcceleratorType.TPU]
110
+ ),
106
111
  )
107
112
  cluster_device_group.add_argument(
108
113
  '--device-type',
@@ -112,6 +117,8 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
112
117
  'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
113
118
  ' h100-80gb-8, n2-standard-32-4 etc.'
114
119
  ),
120
+ metavar='DEVICE_TYPE',
121
+ choices=get_system_characteristics_keys_by_accelerator_type(),
115
122
  )
116
123
 
117
124
  ### Optional arguments specific to "cluster create"
@@ -124,7 +131,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
124
131
  cluster_create_optional_arguments.add_argument(
125
132
  '--cluster-state-gcs-bucket',
126
133
  type=str,
127
- default=xpk_cfg.get(CFG_BUCKET_KEY),
134
+ default=xpk_config.get(CFG_BUCKET_KEY),
128
135
  help='The name of the bucket to store cluster state.',
129
136
  required=False,
130
137
  )
@@ -144,11 +151,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
144
151
  ),
145
152
  )
146
153
  if FeatureFlags.SUB_SLICING_ENABLED:
147
- cluster_create_optional_arguments.add_argument(
148
- '--sub-slicing',
149
- action='store_true',
150
- help='Whether to set up cluster to support sub-slicing',
151
- )
154
+ add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
152
155
 
153
156
  autoprovisioning_arguments = cluster_create_parser.add_argument_group(
154
157
  'Autoprovisioning Arguments',
@@ -209,6 +212,10 @@ def set_cluster_create_pathways_parser(
209
212
  type=str,
210
213
  default=None,
211
214
  help='The tpu type to use, v5litepod-16, etc.',
215
+ metavar='TPU_TYPE',
216
+ choices=get_system_characteristics_keys_by_accelerator_type(
217
+ [AcceleratorType.TPU]
218
+ ),
212
219
  )
213
220
 
214
221
  ### Optional arguments specific to "cluster create-pathways"
@@ -221,6 +228,10 @@ def set_cluster_create_pathways_parser(
221
228
  add_shared_cluster_create_optional_arguments(
222
229
  cluster_create_pathways_optional_arguments
223
230
  )
231
+ if FeatureFlags.SUB_SLICING_ENABLED:
232
+ add_cluster_create_sub_slicing_arguments(
233
+ cluster_create_pathways_optional_arguments
234
+ )
224
235
 
225
236
  autoprovisioning_arguments = (
226
237
  cluster_create_pathways_parser.add_argument_group(
@@ -287,6 +298,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
287
298
  default=None,
288
299
  help='The tpu type to use, v5litepod-16, etc.',
289
300
  required=True,
301
+ metavar='TPU_TYPE',
302
+ choices=get_system_characteristics_keys_by_accelerator_type(
303
+ [AcceleratorType.TPU]
304
+ ),
290
305
  )
291
306
  # TODO(bzmarke): Add --device-type to support GPU/CPU
292
307
  cluster_create_ray_required_arguments.add_argument(
@@ -350,7 +365,9 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
350
365
  )
351
366
  add_resource_limits(cluster_create_resource_limits)
352
367
 
353
- cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
368
+ cluster_create_ray_parser.set_defaults(
369
+ func=cluster_create_ray_cluster, sub_slicing=False
370
+ )
354
371
 
355
372
 
356
373
  def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
@@ -375,7 +392,7 @@ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
375
392
  cluster_delete_optional_arguments.add_argument(
376
393
  '--cluster-state-gcs-bucket',
377
394
  type=str,
378
- default=xpk_cfg.get(CFG_BUCKET_KEY),
395
+ default=xpk_config.get(CFG_BUCKET_KEY),
379
396
  help='The name of the bucket to store cluster state.',
380
397
  required=False,
381
398
  )
@@ -409,6 +426,10 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
409
426
  type=str,
410
427
  default=None,
411
428
  help='The tpu type to cache images on, v5litepod-16, etc.',
429
+ metavar='TPU_TYPE',
430
+ choices=get_system_characteristics_keys_by_accelerator_type(
431
+ [AcceleratorType.TPU]
432
+ ),
412
433
  )
413
434
  cluster_cacheimage_group.add_argument(
414
435
  '--device-type',
@@ -418,6 +439,8 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
418
439
  'The device type to cache images on (can be tpu or gpu),'
419
440
  ' v5litepod-16, h100-80gb-8, etc.'
420
441
  ),
442
+ metavar='DEVICE_TYPE',
443
+ choices=get_system_characteristics_keys_by_accelerator_type(),
421
444
  )
422
445
 
423
446
  ### Required arguments
@@ -508,6 +531,10 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
508
531
  type=str,
509
532
  default=None,
510
533
  help='The tpu type used on cluster, v5litepod-16, etc.',
534
+ metavar='TPU_TYPE',
535
+ choices=get_system_characteristics_keys_by_accelerator_type(
536
+ [AcceleratorType.TPU]
537
+ ),
511
538
  )
512
539
  cluster_adapt_device_group.add_argument(
513
540
  '--device-type',
@@ -517,6 +544,8 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
517
544
  'The device type used on cluster (can be tpu or gpu or cpu), eg.'
518
545
  ' h100-80gb-8, n2-standard-32-4 etc.'
519
546
  ),
547
+ metavar='DEVICE_TYPE',
548
+ choices=get_system_characteristics_keys_by_accelerator_type(),
520
549
  )
521
550
 
522
551
  cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
@@ -937,3 +966,13 @@ def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
937
966
  default=None,
938
967
  help='The CPU limit for the Kueue controller manager.',
939
968
  )
969
+
970
+
971
+ def add_cluster_create_sub_slicing_arguments(
972
+ parser_or_group: ParserOrArgumentGroup,
973
+ ):
974
+ parser_or_group.add_argument(
975
+ '--sub-slicing',
976
+ action='store_true',
977
+ help='Whether to set up cluster to support sub-slicing',
978
+ )
xpk/parser/cluster_test.py CHANGED
@@ -15,7 +15,7 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  import argparse
18
- from xpk.parser.cluster import set_cluster_create_parser
18
+ from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser
19
19
  import pytest
20
20
  from ..utils.feature_flags import FeatureFlags
21
21
 
@@ -49,7 +49,7 @@ def test_cluster_create_sub_slicing_is_false_by_default():
49
49
 
50
50
  set_cluster_create_parser(parser)
51
51
  args = parser.parse_args(
52
- ["--cluster", "test-cluster", "--tpu-type", "test-tpu"]
52
+ ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
53
53
  )
54
54
 
55
55
  assert args.sub_slicing is False
@@ -60,7 +60,46 @@ def test_cluster_create_sub_slicing_can_be_set():
60
60
 
61
61
  set_cluster_create_parser(parser)
62
62
  args = parser.parse_args(
63
- ["--cluster", "test-cluster", "--tpu-type", "test-tpu", "--sub-slicing"]
63
+ ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--sub-slicing"]
64
64
  )
65
65
 
66
66
  assert args.sub_slicing is True
67
+
68
+
69
+ def test_cluster_create_pathways_sub_slicing_is_hidden_with_flag_off():
70
+ FeatureFlags.SUB_SLICING_ENABLED = False
71
+ parser = argparse.ArgumentParser()
72
+
73
+ set_cluster_create_pathways_parser(parser)
74
+ help_str = parser.format_help()
75
+
76
+ assert "--sub-slicing" not in help_str
77
+
78
+
79
+ def test_cluster_create_pathways_sub_slicing_can_be_set():
80
+ parser = argparse.ArgumentParser()
81
+
82
+ set_cluster_create_pathways_parser(parser)
83
+ args = parser.parse_args(
84
+ ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--sub-slicing"]
85
+ )
86
+
87
+ assert args.sub_slicing is True
88
+
89
+
90
+ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
91
+ parser = argparse.ArgumentParser()
92
+
93
+ set_cluster_create_ray_parser(parser)
94
+ args = parser.parse_args([
95
+ "--cluster",
96
+ "test-cluster",
97
+ "--tpu-type",
98
+ "tpu7x-2",
99
+ "--ray-version",
100
+ "19.32.0",
101
+ ])
102
+ help_str = parser.format_help()
103
+
104
+ assert args.sub_slicing is False
105
+ assert "--sub-slicing" not in help_str
xpk/parser/workload.py CHANGED
@@ -25,6 +25,7 @@ from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
25
25
  from .common import add_shared_arguments
26
26
  from .validators import directory_path_type, name_type
27
27
  from ..utils.feature_flags import FeatureFlags
28
+ from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType, SUB_SLICING_TOPOLOGIES
28
29
 
29
30
 
30
31
  def set_workload_parsers(workload_parser: ArgumentParser):
@@ -123,6 +124,10 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
123
124
  type=str,
124
125
  default=None,
125
126
  help='The tpu type to use, v5litepod-16, etc.',
127
+ metavar='TPU_TYPE',
128
+ choices=get_system_characteristics_keys_by_accelerator_type(
129
+ [AcceleratorType.TPU]
130
+ ),
126
131
  )
127
132
  workload_device_group.add_argument(
128
133
  '--device-type',
@@ -132,6 +137,8 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
132
137
  'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
133
138
  ' h100-80gb-8, n2-standard-32-4 etc.'
134
139
  ),
140
+ metavar='DEVICE_TYPE',
141
+ choices=get_system_characteristics_keys_by_accelerator_type(),
135
142
  )
136
143
 
137
144
  workload_create_parser_optional_arguments.add_argument(
@@ -285,6 +292,10 @@ def set_workload_create_pathways_parser(
285
292
  type=str,
286
293
  default=None,
287
294
  help='The tpu type to use, v5litepod-16, etc.',
295
+ metavar='TPU_TYPE',
296
+ choices=get_system_characteristics_keys_by_accelerator_type(
297
+ [AcceleratorType.TPU]
298
+ ),
288
299
  )
289
300
 
290
301
  ### "workload create-pathways" Optional arguments, specific to Pathways
@@ -665,6 +676,7 @@ def add_shared_workload_create_optional_arguments(args_parsers):
665
676
  type=str,
666
677
  help='Sub-slicing topology to use.',
667
678
  required=False,
679
+ choices=SUB_SLICING_TOPOLOGIES,
668
680
  )
669
681
 
670
682
 
xpk/parser/workload_test.py CHANGED
@@ -56,7 +56,7 @@ def test_workload_create_sub_slicing_topology_is_none_by_default():
56
56
  "--workload",
57
57
  "test",
58
58
  "--tpu-type",
59
- "test-tpu",
59
+ "tpu7x-2",
60
60
  ])
61
61
 
62
62
  assert args.sub_slicing_topology is None
@@ -74,9 +74,9 @@ def test_workload_create_sub_slicing_topology_can_be_set():
74
74
  "--workload",
75
75
  "test",
76
76
  "--tpu-type",
77
- "test-tpu",
77
+ "tpu7x-8",
78
78
  "--sub-slicing-topology",
79
- "2x2",
79
+ "2x4",
80
80
  ])
81
81
 
82
- assert args.sub_slicing_topology is "2x2"
82
+ assert args.sub_slicing_topology is "2x4"
xpk/telemetry_uploader.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import sys
18
+ import os
19
+ import requests
20
+ import json
21
+
22
+ file_path = sys.argv[1]
23
+ if os.path.exists(file_path):
24
+ with open(file_path, mode="r", encoding="utf-8") as file:
25
+ kwargs = json.load(file)
26
+ response = requests.request(**kwargs)
27
+ print(f"Telemetry upload finished with {response.status_code} status code")
28
+
29
+ os.remove(file_path)
xpk/templates/kueue_gke_default_topology.yaml.j2 CHANGED
@@ -1,4 +1,4 @@
1
- apiVersion: kueue.x-k8s.io/v1alpha1
1
+ apiVersion: kueue.x-k8s.io/v1beta1
2
2
  kind: Topology
3
3
  metadata:
4
4
  name: "gke-default"
xpk/templates/kueue_sub_slicing_topology.yaml.j2 CHANGED
@@ -4,11 +4,6 @@ metadata:
4
4
  name: {{ sub_slice_topology_name }}
5
5
  spec:
6
6
  levels:
7
- - nodeLabel: "cloud.google.com/gke-tpu-slice-16x16-id"
8
- - nodeLabel: "cloud.google.com/gke-tpu-slice-8x16-id"
9
- - nodeLabel: "cloud.google.com/gke-tpu-slice-8x8-id"
10
- - nodeLabel: "cloud.google.com/gke-tpu-slice-4x8-id"
11
- - nodeLabel: "cloud.google.com/gke-tpu-slice-4x4-id"
12
- - nodeLabel: "cloud.google.com/gke-tpu-slice-2x4-id"
13
- - nodeLabel: "cloud.google.com/gke-tpu-slice-2x2-id"
14
- - nodeLabel: "kubernetes.io/hostname"
7
+ {% for level in levels %}
8
+ - nodeLabel: "{{level}}"
9
+ {% endfor %}