xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (91):
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
@@ -14,16 +14,20 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import json
17
18
  from argparse import Namespace
18
19
  from dataclasses import dataclass
19
20
  from typing import Any
20
21
  from unittest.mock import MagicMock, patch
21
22
  import pytest
22
23
 
23
- from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command
24
+ from xpk.core.telemetry import MetricsCollector
25
+ from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
26
+ from xpk.core.capacity import CapacityType
24
27
  from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
25
28
  from xpk.core.testing.commands_tester import CommandsTester
26
29
  from xpk.utils.feature_flags import FeatureFlags
30
+ from xpk.utils.versions import ReleaseChannel
27
31
 
28
32
 
29
33
  @dataclass
@@ -31,9 +35,34 @@ class _Mocks:
31
35
  common_print_mock: MagicMock
32
36
  commands_print_mock: MagicMock
33
37
  commands_get_reservation_deployment_type: MagicMock
38
+ commands_get_pathways_machine_types: MagicMock
34
39
  commands_tester: CommandsTester
35
40
 
36
41
 
42
+ @dataclass
43
+ class _ClusterCreateMocks:
44
+ """Holds all the mocked dependencies for the cluster_create function."""
45
+
46
+ get_all_clusters_programmatic: MagicMock
47
+ get_gke_server_config: MagicMock
48
+ get_gke_control_plane_version: MagicMock
49
+ get_system_characteristics: MagicMock
50
+ authorize_private_cluster_access_if_necessary: MagicMock
51
+ update_coredns_if_necessary: MagicMock
52
+ get_cluster_credentials: MagicMock
53
+ setup_k8s_env: MagicMock
54
+ get_gke_node_pool_version: MagicMock
55
+ run_gke_node_pool_create_command: MagicMock
56
+ create_cluster_configmaps: MagicMock
57
+ set_jobset_on_cluster: MagicMock
58
+ get_cluster_location: MagicMock
59
+ install_kjob: MagicMock
60
+ xpk_exit: MagicMock
61
+ update_jobset_resources_if_necessary: MagicMock
62
+ _install_kueue: MagicMock
63
+ set_pathways_job_on_cluster: MagicMock
64
+
65
+
37
66
  @pytest.fixture
38
67
  def mocks(mocker) -> _Mocks:
39
68
  common_print_mock = mocker.patch(
@@ -47,15 +76,23 @@ def mocks(mocker) -> _Mocks:
47
76
  'xpk.commands.cluster.get_reservation_deployment_type',
48
77
  return_value='DENSE',
49
78
  )
79
+ commands_get_pathways_machine_types = mocker.patch(
80
+ 'xpk.commands.cluster.get_pathways_machine_types',
81
+ return_value=(0, []),
82
+ )
50
83
  return _Mocks(
51
84
  common_print_mock=common_print_mock,
52
85
  commands_get_reservation_deployment_type=commands_get_reservation_deployment_type,
53
86
  commands_print_mock=commands_print_mock,
87
+ commands_get_pathways_machine_types=commands_get_pathways_machine_types,
54
88
  commands_tester=CommandsTester(
55
89
  mocker,
56
90
  run_command_with_updates_path=(
57
91
  'xpk.commands.cluster.run_command_with_updates'
58
92
  ),
93
+ run_command_for_value_path=(
94
+ 'xpk.commands.cluster.run_command_for_value'
95
+ ),
59
96
  ),
60
97
  )
61
98
 
@@ -65,6 +102,10 @@ def construct_args(**kwargs: Any) -> Namespace:
65
102
  project='project',
66
103
  zone='us-central1-a',
67
104
  reservation='',
105
+ on_demand=False,
106
+ tpu_type=None,
107
+ device_type=None,
108
+ spot=False,
68
109
  default_pool_cpu_machine_type='test-machine-type',
69
110
  cluster='test-cluster',
70
111
  default_pool_cpu_num_nodes='100',
@@ -72,6 +113,7 @@ def construct_args(**kwargs: Any) -> Namespace:
72
113
  gke_version='',
73
114
  private=False,
74
115
  authorized_networks=None,
116
+ pathways_gce_machine_type='n2-standard-64',
75
117
  enable_pathways=False,
76
118
  enable_ray_cluster=False,
77
119
  enable_workload_identity=False,
@@ -87,11 +129,97 @@ def construct_args(**kwargs: Any) -> Namespace:
87
129
  memory_limit='100Gi',
88
130
  cpu_limit=100,
89
131
  cluster_cpu_machine_type='',
132
+ create_vertex_tensorboard=False,
133
+ enable_autoprovisioning=False,
134
+ sub_slicing_topology='2x2x2',
135
+ use_vertex_tensorboard=False,
136
+ env_file='',
137
+ env=None,
138
+ use_pathways=False,
139
+ debug_dump_gcs=False,
140
+ storage='',
141
+ restart_on_exit_codes=None,
142
+ ttl_seconds_after_finished=0,
143
+ max_restarts=1,
144
+ priority=0,
145
+ termination_grace_period_seconds=0,
146
+ docker_image_pull_secret='',
147
+ managed_mldiagnostics=False,
148
+ output_manifest_file='',
90
149
  )
91
150
  args_dict.update(kwargs)
92
151
  return Namespace(**args_dict)
93
152
 
94
153
 
154
+ @pytest.fixture
155
+ def cluster_create_mocks(mocker) -> _ClusterCreateMocks:
156
+ """Mocks all dependencies for the cluster_create function."""
157
+ # This fixture patches all the functions called by cluster_create, allowing
158
+ # tests to focus on specific logic paths without executing external commands
159
+ # or complex sub-functions. Each mock can be configured within the test
160
+ # itself if a specific return value or behavior is needed.
161
+ return _ClusterCreateMocks(
162
+ get_all_clusters_programmatic=mocker.patch(
163
+ 'xpk.commands.cluster.get_all_clusters_programmatic',
164
+ return_value=([], 0),
165
+ ),
166
+ get_gke_server_config=mocker.patch(
167
+ 'xpk.commands.cluster.get_gke_server_config',
168
+ return_value=(0, MagicMock()),
169
+ ),
170
+ get_gke_control_plane_version=mocker.patch(
171
+ 'xpk.commands.cluster.get_gke_control_plane_version'
172
+ ),
173
+ get_system_characteristics=mocker.patch(
174
+ 'xpk.commands.cluster.get_system_characteristics',
175
+ return_value=(TPU_TEST_SYSTEM, 0),
176
+ ),
177
+ authorize_private_cluster_access_if_necessary=mocker.patch(
178
+ 'xpk.commands.cluster.authorize_private_cluster_access_if_necessary',
179
+ return_value=0,
180
+ ),
181
+ update_coredns_if_necessary=mocker.patch(
182
+ 'xpk.commands.cluster.update_coredns_if_necessary', return_value=0
183
+ ),
184
+ get_cluster_credentials=mocker.patch(
185
+ 'xpk.commands.cluster.get_cluster_credentials', return_value=0
186
+ ),
187
+ setup_k8s_env=mocker.patch('xpk.commands.cluster.setup_k8s_env'),
188
+ get_gke_node_pool_version=mocker.patch(
189
+ 'xpk.commands.cluster.get_gke_node_pool_version',
190
+ return_value=(0, '1.2.3'),
191
+ ),
192
+ run_gke_node_pool_create_command=mocker.patch(
193
+ 'xpk.commands.cluster.run_gke_node_pool_create_command',
194
+ return_value=0,
195
+ ),
196
+ create_cluster_configmaps=mocker.patch(
197
+ 'xpk.commands.cluster.create_cluster_configmaps', return_value=0
198
+ ),
199
+ set_jobset_on_cluster=mocker.patch(
200
+ 'xpk.commands.cluster.set_jobset_on_cluster', return_value=0
201
+ ),
202
+ get_cluster_location=mocker.patch(
203
+ 'xpk.commands.cluster.get_cluster_location',
204
+ return_value='us-central1',
205
+ ),
206
+ install_kjob=mocker.patch(
207
+ 'xpk.commands.cluster.install_kjob', return_value=0
208
+ ),
209
+ xpk_exit=mocker.patch('xpk.commands.cluster.xpk_exit'),
210
+ update_jobset_resources_if_necessary=mocker.patch(
211
+ 'xpk.commands.cluster.update_jobset_resources_if_necessary',
212
+ return_value=0,
213
+ ),
214
+ _install_kueue=mocker.patch(
215
+ 'xpk.commands.cluster._install_kueue', return_value=0
216
+ ),
217
+ set_pathways_job_on_cluster=mocker.patch(
218
+ 'xpk.commands.cluster.set_pathways_job_on_cluster', return_value=0
219
+ ),
220
+ )
221
+
222
+
95
223
  GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
96
224
  'l4-1'
97
225
  ]
@@ -106,7 +234,7 @@ TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
106
234
  def test_validate_cluster_create_args_for_correct_args_pass(
107
235
  mocks: _Mocks,
108
236
  ):
109
- args = Namespace()
237
+ args = construct_args()
110
238
 
111
239
  _validate_cluster_create_args(args, GPU_TEST_SYSTEM)
112
240
 
@@ -185,6 +313,64 @@ def test_validate_cluster_create_args_for_invalid_reservation(
185
313
  )
186
314
 
187
315
 
316
+ def test_validate_cluster_create_args_for_enable_pathways_set_to_false(
317
+ mocks: _Mocks,
318
+ ):
319
+ args = construct_args(enable_pathways=False)
320
+ mocks.commands_get_pathways_machine_types.return_value = (1, [])
321
+
322
+ _validate_cluster_create_args(args, TPU_TEST_SYSTEM)
323
+
324
+ assert mocks.commands_print_mock.call_count == 0
325
+
326
+
327
+ def test_validate_cluster_create_args_for_errored_pathways_machine_types_retrieval(
328
+ mocks: _Mocks,
329
+ ):
330
+ args = construct_args(enable_pathways=True)
331
+ mocks.commands_get_pathways_machine_types.return_value = (1, [])
332
+
333
+ with pytest.raises(SystemExit):
334
+ _validate_cluster_create_args(args, TPU_TEST_SYSTEM)
335
+
336
+ assert mocks.commands_print_mock.call_count == 1
337
+ assert 'Unable to retrieve' in mocks.commands_print_mock.call_args[0][0]
338
+
339
+
340
+ def test_validate_cluster_create_args_for_invalid_pathways_machine_type(
341
+ mocks: _Mocks,
342
+ ):
343
+ args = construct_args(
344
+ enable_pathways=True, pathways_gce_machine_type='n2-standard-32'
345
+ )
346
+ mocks.commands_get_pathways_machine_types.return_value = (
347
+ 0,
348
+ ['n2-standard-64'],
349
+ )
350
+
351
+ with pytest.raises(SystemExit):
352
+ _validate_cluster_create_args(args, TPU_TEST_SYSTEM)
353
+
354
+ assert mocks.commands_print_mock.call_count == 2
355
+ assert 'Available machine types' in mocks.commands_print_mock.call_args[0][0]
356
+
357
+
358
+ def test_validate_cluster_create_args_for_valid_pathways_machine_type(
359
+ mocks: _Mocks,
360
+ ):
361
+ args = construct_args(
362
+ enable_pathways=True, pathways_gce_machine_type='n2-standard-32'
363
+ )
364
+ mocks.commands_get_pathways_machine_types.return_value = (
365
+ 0,
366
+ ['n2-standard-32'],
367
+ )
368
+
369
+ _validate_cluster_create_args(args, TPU_TEST_SYSTEM)
370
+
371
+ assert mocks.commands_print_mock.call_count == 0
372
+
373
+
188
374
  @patch('xpk.commands.cluster.KueueManager.install_or_upgrade')
189
375
  def test_install_kueue_returns_kueue_installation_code(
190
376
  mock_kueue_manager_install: MagicMock,
@@ -209,6 +395,7 @@ def test_run_gke_cluster_create_command_specifies_custom_cluster_arguments_last(
209
395
  ),
210
396
  gke_control_plane_version='1.2.3',
211
397
  system=TPU_TEST_SYSTEM,
398
+ release_channel=ReleaseChannel.STABLE,
212
399
  )
213
400
 
214
401
  assert result == 0
@@ -226,12 +413,16 @@ def test_run_gke_cluster_create_command_without_gke_version_does_not_have_no_aut
226
413
  args=construct_args(gke_version=''),
227
414
  gke_control_plane_version='1.2.3',
228
415
  system=TPU_TEST_SYSTEM,
416
+ release_channel=ReleaseChannel.RAPID,
229
417
  )
230
418
 
231
419
  assert result == 0
232
420
  mocks.commands_tester.assert_command_not_run(
233
421
  'clusters create', ' --no-enable-autoupgrade'
234
422
  )
423
+ mocks.commands_tester.assert_command_run(
424
+ 'clusters create', ' --release-channel=rapid'
425
+ )
235
426
 
236
427
 
237
428
  def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag(
@@ -241,24 +432,179 @@ def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag
241
432
  args=construct_args(gke_version='1.2.3'),
242
433
  gke_control_plane_version='1.2.3',
243
434
  system=TPU_TEST_SYSTEM,
435
+ release_channel=ReleaseChannel.REGULAR,
244
436
  )
245
437
 
246
438
  assert result == 0
247
439
  mocks.commands_tester.assert_command_run(
248
- 'clusters create', ' --no-enable-autoupgrade'
440
+ 'clusters create', '--release-channel=regular', ' --no-enable-autoupgrade'
249
441
  )
250
442
 
251
443
 
252
- def test_run_gke_cluster_create_command_with_gpu_system_has_no_enable_autoupgrade(
444
+ def test_run_gke_cluster_create_command_with_lustre_runs_correct_command(
253
445
  mocks: _Mocks,
254
446
  ):
255
447
  result = run_gke_cluster_create_command(
256
- args=construct_args(gke_version=''),
448
+ args=construct_args(
449
+ enable_lustre_csi_driver=True, enable_legacy_lustre_port=False
450
+ ),
257
451
  gke_control_plane_version='1.2.3',
258
- system=GPU_TEST_SYSTEM,
452
+ system=TPU_TEST_SYSTEM,
453
+ release_channel=ReleaseChannel.REGULAR,
454
+ )
455
+
456
+ assert result == 0
457
+ commands = mocks.commands_tester.get_matching_commands('clusters create')
458
+ assert len(commands) == 1
459
+ command = commands[0]
460
+ assert (
461
+ '--addons=LustreCsiDriver' in command
462
+ and '--enable-legacy-lustre-port' not in command
463
+ )
464
+
465
+
466
+ def test_run_gke_cluster_create_command_with_lustre_legacy_port_adds_correct_flag(
467
+ mocks: _Mocks,
468
+ ):
469
+ result = run_gke_cluster_create_command(
470
+ args=construct_args(
471
+ enable_lustre_csi_driver=True, enable_legacy_lustre_port=True
472
+ ),
473
+ gke_control_plane_version='1.2.3',
474
+ system=TPU_TEST_SYSTEM,
475
+ release_channel=ReleaseChannel.REGULAR,
259
476
  )
260
477
 
261
478
  assert result == 0
262
479
  mocks.commands_tester.assert_command_run(
263
- 'clusters create', ' --no-enable-autoupgrade'
480
+ 'clusters create',
481
+ '--enable-legacy-lustre-port',
482
+ '--addons=LustreCsiDriver',
483
+ )
484
+
485
+
486
+ def test_log_cluster_create_telemetry_does_not_log_when_feature_flag_is_disabled():
487
+ FeatureFlags.TELEMETRY_ENABLED = False
488
+ _log_cluster_create_telemetry(construct_args())
489
+ events = json.loads(MetricsCollector.flush())['log_event']
490
+ assert len(events) == 0
491
+
492
+
493
+ def test_log_cluster_create_telemetry_logs_correct_event_when_tpu_type_is_provided(
494
+ mocker: MagicMock,
495
+ ):
496
+ FeatureFlags.TELEMETRY_ENABLED = True
497
+ mocker.patch(
498
+ 'xpk.commands.cluster.get_capacity_type',
499
+ return_value=(CapacityType.SPOT, 0),
264
500
  )
501
+ _log_cluster_create_telemetry(construct_args(device_type='test-device-type'))
502
+ event = json.loads(MetricsCollector.flush())['log_event'][0]
503
+ payload = json.loads(event['source_extension_json'])
504
+ event_metadata = payload['event_metadata']
505
+ assert payload['event_name'] == 'cluster_create'
506
+ assert (
507
+ _get_event_metadata_value_by_key(
508
+ event_metadata,
509
+ 'XPK_ZONE',
510
+ )
511
+ == 'us-central1-a'
512
+ )
513
+ assert (
514
+ _get_event_metadata_value_by_key(
515
+ event_metadata,
516
+ 'XPK_SYSTEM_CHARACTERISTICS',
517
+ )
518
+ == 'test-device-type'
519
+ )
520
+ assert (
521
+ _get_event_metadata_value_by_key(
522
+ event_metadata,
523
+ 'XPK_PROVISIONING_MODE',
524
+ )
525
+ == 'spot'
526
+ )
527
+
528
+
529
+ def test_log_cluster_create_telemetry_logs_correct_event_when_device_type_is_provided(
530
+ mocker: MagicMock,
531
+ ):
532
+ FeatureFlags.TELEMETRY_ENABLED = True
533
+ mocker.patch(
534
+ 'xpk.commands.cluster.get_capacity_type',
535
+ return_value=(CapacityType.SPOT, 0),
536
+ )
537
+ _log_cluster_create_telemetry(construct_args(tpu_type='test-tpu-type'))
538
+ event = json.loads(MetricsCollector.flush())['log_event'][0]
539
+ payload = json.loads(event['source_extension_json'])
540
+ event_metadata = payload['event_metadata']
541
+ assert payload['event_name'] == 'cluster_create'
542
+ assert (
543
+ _get_event_metadata_value_by_key(
544
+ event_metadata,
545
+ 'XPK_ZONE',
546
+ )
547
+ == 'us-central1-a'
548
+ )
549
+ assert (
550
+ _get_event_metadata_value_by_key(
551
+ event_metadata,
552
+ 'XPK_SYSTEM_CHARACTERISTICS',
553
+ )
554
+ == 'test-tpu-type'
555
+ )
556
+ assert (
557
+ _get_event_metadata_value_by_key(
558
+ event_metadata,
559
+ 'XPK_PROVISIONING_MODE',
560
+ )
561
+ == 'spot'
562
+ )
563
+
564
+
565
+ def _get_event_metadata_value_by_key(
566
+ event_metadata: list[dict[str, str]], key: str
567
+ ) -> str | None:
568
+ return next(
569
+ (meta['value'] for meta in event_metadata if meta['key'] == key),
570
+ None,
571
+ )
572
+
573
+
574
+ @pytest.mark.parametrize(
575
+ 'gke_version_arg, expected_channel, expected_version',
576
+ [
577
+ (None, ReleaseChannel.RAPID, '1.2.4'), # No version, should use RAPID
578
+ (
579
+ '1.2.3',
580
+ ReleaseChannel.REGULAR,
581
+ '1.2.3',
582
+ ), # Version provided, should use REGULAR
583
+ ],
584
+ )
585
+ def test_cluster_create_calls_run_command_with_correct_channel_and_version(
586
+ gke_version_arg,
587
+ expected_channel,
588
+ expected_version,
589
+ mocks: _Mocks,
590
+ cluster_create_mocks: _ClusterCreateMocks,
591
+ ):
592
+ """
593
+ Verifies that cluster_create calls run_gke_cluster_create_command with the correct
594
+ release channel and GKE version based on whether a version is provided.
595
+ """
596
+ cluster_create_mocks.get_gke_control_plane_version.return_value = (
597
+ 0,
598
+ expected_version,
599
+ )
600
+
601
+ args = construct_args(gke_version=gke_version_arg)
602
+ cluster_create(args)
603
+
604
+ expected_command_parts = [
605
+ 'clusters create',
606
+ f'--cluster-version={expected_version}',
607
+ f'--release-channel={expected_channel.value.lower()}',
608
+ ]
609
+
610
+ mocks.commands_tester.assert_command_run(*expected_command_parts)
xpk/commands/config.py CHANGED
@@ -14,16 +14,14 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..core.config import XpkConfig
17
+ from ..core.config import get_config as get_xpk_config
18
18
  from ..utils.console import xpk_print
19
19
 
20
- xpk_cfg = XpkConfig()
21
-
22
20
 
23
21
  def set_config(args):
24
- xpk_cfg.set(args.set_config_args[0], args.set_config_args[1])
22
+ get_xpk_config().set(args.set_config_args[0], args.set_config_args[1])
25
23
 
26
24
 
27
25
  def get_config(args):
28
- value = xpk_cfg.get(args.get_config_key[0])
26
+ value = get_xpk_config().get(args.get_config_key[0])
29
27
  xpk_print(value)
xpk/commands/inspector.py CHANGED
@@ -18,7 +18,7 @@ from ..core.cluster import get_cluster_credentials
18
18
  from ..core.commands import run_command_for_value
19
19
  from ..core.gcloud_context import add_zone_and_project, get_cluster_location
20
20
  from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
21
- from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
21
+ from ..core.resources import ConfigMapType, get_config_map_name
22
22
  from ..utils.console import xpk_exit, xpk_print
23
23
  from ..utils.file import append_tmp_file, write_tmp_file
24
24
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -162,14 +162,16 @@ def inspector(args) -> None:
162
162
  (
163
163
  (
164
164
  'kubectl get configmap'
165
- f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
165
+ f' {get_config_map_name(args.cluster, ConfigMapType.METADATA)} -o'
166
+ ' yaml'
166
167
  ),
167
168
  'GKE: Cluster Metadata ConfigMap Details',
168
169
  ),
169
170
  (
170
171
  (
171
172
  'kubectl get configmap'
172
- f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
173
+ f' {get_config_map_name(args.cluster, ConfigMapType.RESOURCES)} -o'
174
+ ' yaml'
173
175
  ),
174
176
  'GKE: Cluster Resources ConfigMap Details',
175
177
  ),
xpk/commands/kind.py CHANGED
@@ -30,6 +30,7 @@ from ..core.storage import install_storage_crd
30
30
  from ..core.system_characteristics import (
31
31
  SystemCharacteristics,
32
32
  AcceleratorType,
33
+ DockerPlatform,
33
34
  )
34
35
  from ..utils.console import (xpk_exit, xpk_print)
35
36
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -97,9 +98,10 @@ def cluster_create(args) -> None:
97
98
  AcceleratorType.CPU,
98
99
  'kind',
99
100
  supports_sub_slicing=False,
101
+ docker_platform=DockerPlatform.ARM,
100
102
  )
101
103
 
102
- kueue_manager = KueueManager()
104
+ kueue_manager = KueueManager(project='', zone='')
103
105
  kueue_manager.install_or_upgrade(
104
106
  KueueConfig(
105
107
  system,