xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/capacity.py CHANGED
@@ -29,6 +29,8 @@ H100_DEVICE_TYPE = 'h100-80gb-8'
29
29
  H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
30
30
  H200_DEVICE_TYPE = 'h200-141gb-8'
31
31
  B200_DEVICE_TYPE = 'b200-8'
32
+ GB200_DEVICE_TYPE = 'gb200-4'
33
+ GB200_DEVICE_TYPE_NOLSSD = 'gb200-4-no-ssd'
32
34
  RESERVATION_CONFIG_KEY = 'reservation_id'
33
35
 
34
36
 
@@ -115,9 +117,12 @@ def get_reservation_maintenance_interval(
115
117
  Returns:
116
118
  0 if successful and 1 otherwise.
117
119
  """
120
+ reservation_project, reservation_name = get_reservation_project_and_name(
121
+ reservation, project
122
+ )
118
123
  command = (
119
- f'gcloud beta compute reservations describe {reservation}'
120
- f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
124
+ f'gcloud beta compute reservations describe {reservation_name}'
125
+ f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
121
126
  )
122
127
  return_code, output = run_command_for_value(
123
128
  command, 'Get reservation maintenance interval'
@@ -139,9 +144,12 @@ def get_reservation_placement_policy(
139
144
  Returns:
140
145
  0 if successful and 1 otherwise.
141
146
  """
147
+ reservation_project, reservation_name = get_reservation_project_and_name(
148
+ reservation, project
149
+ )
142
150
  command = (
143
- f'gcloud beta compute reservations describe {reservation}'
144
- f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
151
+ f'gcloud beta compute reservations describe {reservation_name}'
152
+ f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
145
153
  )
146
154
  return_code, output = run_command_for_value(
147
155
  command, 'Get reservation placement policy'
@@ -156,9 +164,12 @@ def get_reservation_deployment_type(
156
164
  reservation: str, zone: str, project: str
157
165
  ) -> str:
158
166
  """Get reservation deployment type."""
167
+ reservation_project, reservation_name = get_reservation_project_and_name(
168
+ reservation, project
169
+ )
159
170
  command = (
160
- f'gcloud beta compute reservations describe {reservation}'
161
- f' --project={project} --zone={zone} --format="value(deploymentType)"'
171
+ f'gcloud beta compute reservations describe {reservation_name}'
172
+ f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
162
173
  )
163
174
  return_code, output = run_command_for_value(
164
175
  command, 'Get reservation deployment type', dry_run_return_val='DENSE'
@@ -178,9 +189,12 @@ def verify_reservation_exists(args) -> int:
178
189
  Returns:
179
190
  0 if successful and 1 otherwise.
180
191
  """
192
+ reservation_project, reservation_name = get_reservation_project_and_name(
193
+ args.reservation, args.project
194
+ )
181
195
  command = (
182
- f'gcloud beta compute reservations describe {args.reservation}'
183
- f' --project={args.project} --zone={args.zone}'
196
+ f'gcloud beta compute reservations describe {reservation_name}'
197
+ f' --project={reservation_project} --zone={args.zone}'
184
198
  )
185
199
  return_code = run_command_with_updates(command, 'Describe reservation')
186
200
  if return_code != 0:
@@ -264,3 +278,29 @@ def get_capacity_node_selectors_from_capacity_type(
264
278
  )
265
279
  return_code = 1
266
280
  return node_selector, return_code
281
+
282
+
283
def get_reservation_project_and_name(
    reservation_name_or_path: str, cluster_project: str
) -> tuple[str, str]:
  """Get the reservation project and name.

  A value without any '/' is treated as a reservation name that belongs to the
  cluster project. Any other value must be a full reservation path; if it is
  malformed, an error is printed and xpk_exit(1) is called.

  Args:
    reservation_name_or_path: either reservation name or reservation path in
      format projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
    cluster_project: the cluster project

  Returns:
    Tuple with reservation project and reservation name.
  """
  if '/' not in reservation_name_or_path:
    return cluster_project, reservation_name_or_path

  reservation_parts = reservation_name_or_path.split('/')
  is_valid_path = (
      len(reservation_parts) == 4
      and reservation_parts[0] == 'projects'
      and reservation_parts[2] == 'reservations'
      # Reject empty components (e.g. 'projects//reservations/name'), which
      # would otherwise be returned as an empty project or reservation name.
      and reservation_parts[1] != ''
      and reservation_parts[3] != ''
  )
  if not is_valid_path:
    xpk_print('Unable to parse reservation: ', reservation_name_or_path)
    xpk_exit(1)
  return reservation_parts[1], reservation_parts[3]
xpk/core/capacity_test.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  import pytest
18
18
  from unittest.mock import MagicMock, patch
19
- from .capacity import get_reservation_deployment_type
19
+ from .capacity import get_reservation_deployment_type, get_reservation_project_and_name
20
20
 
21
21
 
22
22
  @patch('xpk.core.capacity.xpk_print')
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
48
48
  reservation='reservation', zone='zone', project='project'
49
49
  )
50
50
  assert result == 'DENSE'
51
+
52
+
53
def test_get_reservation_project_and_name_parses_local_reservation():
  """A bare reservation name resolves to the cluster project."""
  resolved = get_reservation_project_and_name(
      reservation_name_or_path='test-reservation',
      cluster_project='cluster-project',
  )
  resolved_project, resolved_name = resolved

  assert resolved_project == 'cluster-project'
  assert resolved_name == 'test-reservation'
60
+
61
+
62
def test_get_reservation_project_and_name_parses_shared_reservation():
  """A full reservation path yields the project and name embedded in it."""
  shared_reservation_path = (
      'projects/reservation-project/reservations/test-reservation'
  )

  resolved = get_reservation_project_and_name(
      shared_reservation_path, 'cluster-project'
  )
  resolved_project, resolved_name = resolved

  assert resolved_project == 'reservation-project'
  assert resolved_name == 'test-reservation'
70
+
71
+
72
@patch('xpk.core.capacity.xpk_print')
def test_get_reservation_project_and_name_fails_for_invalid_reservation(
    xpk_print: MagicMock, mocker
):
  """A malformed reservation path exits and reports a parse error."""
  malformed_path = 'invalid/reservation'

  with pytest.raises(SystemExit):
    get_reservation_project_and_name(malformed_path, 'cluster-project')

  # Checked after the `with` block so the assertion runs once the exit
  # has been raised and captured.
  first_call_args = xpk_print.mock_calls[0].args
  assert 'Unable to parse reservation' in first_call_args[0]
xpk/core/cluster.py CHANGED
@@ -22,7 +22,7 @@ from kubernetes import config
22
22
  from kubernetes.client.exceptions import ApiException
23
23
 
24
24
  from ..utils.console import xpk_exit, xpk_print
25
- from .capacity import B200_DEVICE_TYPE, H100_DEVICE_TYPE, H200_DEVICE_TYPE
25
+ from .capacity import H200_DEVICE_TYPE
26
26
  from .commands import (
27
27
  run_command_for_value,
28
28
  run_command_with_updates,
@@ -34,16 +34,11 @@ from .gcloud_context import (
34
34
  zone_to_region,
35
35
  )
36
36
  from .resources import get_cluster_system_characteristics
37
- from .system_characteristics import SystemCharacteristics
37
+ from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
38
38
 
39
39
  JOBSET_VERSION = 'v0.8.0'
40
40
  PATHWAYS_JOB_VERSION = 'v0.1.4'
41
- INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
42
- INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
43
- INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
44
- CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
45
41
  NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
46
- MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
47
42
 
48
43
  DEFAULT_NAMESPACE = 'default'
49
44
  XPK_SA = 'xpk-sa'
@@ -118,12 +113,12 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
118
113
  Returns:
119
114
  0 if successful and 1 otherwise.
120
115
  """
121
- if system.device_type == H100_DEVICE_TYPE:
122
- command = f'kubectl apply -f {INSTALLER_NCCL_TCPX}'
123
- elif system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
124
- command = f'kubectl apply -f {INSTALLER_NCCL_RDMA}'
125
- else:
126
- command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
116
+ nccl_installer = (
117
+ system.gpu_config.nccl_installer
118
+ if system.gpu_config and system.gpu_config.nccl_installer
119
+ else INSTALLER_NCCL_TCPXO
120
+ )
121
+ command = f'kubectl apply -f {nccl_installer}'
127
122
 
128
123
  return_code = run_command_with_updates(
129
124
  command, 'Install NCCL Plugin On Cluster'
@@ -135,35 +130,6 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
135
130
  )
136
131
  return 1
137
132
 
138
- if system.device_type == H100_DEVICE_TYPE:
139
- command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
140
-
141
- return_code = run_command_with_updates(
142
- command, 'Install NCCL Config On Cluster'
143
- )
144
-
145
- if return_code != 0:
146
- xpk_print(
147
- f'Install NCCL Config On Cluster request returned ERROR {return_code}'
148
- )
149
- return 1
150
-
151
- return 0
152
-
153
-
154
- def disable_mglru_on_cluster() -> int:
155
- """Disable MGLRU on the cluster.
156
-
157
- Returns:
158
- 0 if successful and 1 otherwise.
159
- """
160
- command = f'kubectl apply -f {MGLRU_DISABLE}'
161
- return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
162
-
163
- if return_code != 0:
164
- xpk_print('Disablig MGLRU On Cluster request returned ERROR')
165
- return 1
166
-
167
133
  return 0
168
134
 
169
135
 
@@ -309,10 +275,11 @@ def update_cluster_with_lustre_driver_if_necessary(args) -> int:
309
275
  Returns:
310
276
  0 if successful and error code otherwise.
311
277
  """
312
- if is_driver_enabled_on_cluster(
313
- args, driver='lustreCsiDriver'
314
- ) and is_driver_enabled_on_cluster(
315
- args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
278
+ if is_driver_enabled_on_cluster(args, driver='lustreCsiDriver') and (
279
+ not args.enable_legacy_lustre_port
280
+ or is_driver_enabled_on_cluster(
281
+ args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
282
+ )
316
283
  ):
317
284
  return 0
318
285
  cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
@@ -621,9 +588,13 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
621
588
  """
622
589
  command = (
623
590
  'gcloud container clusters update'
624
- f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
591
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
625
592
  ' --quiet'
626
593
  )
594
+ if args.enable_legacy_lustre_port:
595
+ command += ' --enable-legacy-lustre-port'
596
+ else:
597
+ command += ' --update-addons=LustreCsiDriver=ENABLED'
627
598
  xpk_print(
628
599
  'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
629
600
  )
@@ -729,78 +700,58 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
729
700
  return 0
730
701
 
731
702
 
732
- def test_and_retry_credentials_with_dns_logic(args) -> int:
733
- """Tests kubectl credentials and retries with default settings if a DNS error is found.
703
+ def get_cluster_credentials(args) -> int:
704
+ """Run cluster configuration command to set the kubectl config.
734
705
 
735
706
  Args:
736
707
  args: user provided arguments for running the command.
737
708
 
738
709
  Returns:
739
- 0 if credentials are valid after retrying, 1 otherwise.
710
+ 0 if successful and 1 otherwise.
740
711
  """
741
-
742
- xpk_print('Testing credentials with kubectl...')
743
- kubectl_command = 'kubectl get pods'
744
- kubectl_return_code, kubectl_output = run_command_for_value(
745
- kubectl_command, 'kubectl get pods'
746
- )
747
- if kubectl_return_code == 0:
748
- xpk_print('Credentials test succeeded.')
749
- return 0
750
-
751
- dns_endpoint_error = (
752
- 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
753
- ' is disabled'
754
- )
755
- if dns_endpoint_error not in kubectl_output:
756
- xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
757
- xpk_exit(kubectl_return_code)
758
- xpk_print(
759
- 'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
760
- ' flag...'
761
- )
762
-
763
712
  location = get_cluster_location(args.project, args.cluster, args.zone)
764
- without_dns_command = (
765
- 'gcloud container clusters get-credentials'
766
- f' {args.cluster} --location={location}'
767
- f' --project={args.project} &&'
768
- ' kubectl config view && kubectl config set-context --current'
769
- ' --namespace=default'
770
- )
771
- return_code = run_command_with_updates(
772
- without_dns_command, 'get-credentials to cluster', verbose=False
713
+
714
+ return_code = _get_credentials(
715
+ project=args.project,
716
+ cluster=args.cluster,
717
+ location=location,
718
+ dns_endpoint=True,
773
719
  )
774
720
  if return_code != 0:
775
- xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
776
- xpk_exit(return_code)
777
- return 0
778
-
721
+ return return_code
722
+
723
+ if not _are_credentials_valid():
724
+ xpk_print('Detected error. Retrying without --dns-endpoint flag...')
725
+ return_code = _get_credentials(
726
+ project=args.project,
727
+ cluster=args.cluster,
728
+ location=location,
729
+ dns_endpoint=False,
730
+ )
731
+ if return_code != 0:
732
+ return return_code
779
733
 
780
- def get_cluster_credentials(args) -> int:
781
- """Run cluster configuration command to set the kubectl config.
734
+ xpk_print('Finished get-credentials and kubectl setup.')
735
+ return 0
782
736
 
783
- Args:
784
- args: user provided arguments for running the command.
785
737
 
786
- Returns:
787
- 0 if successful and 1 otherwise.
788
- """
789
- location = get_cluster_location(args.project, args.cluster, args.zone)
738
+ def _get_credentials(
739
+ project: str, cluster: str, location: str, dns_endpoint: bool
740
+ ) -> int:
741
+ dns_endpoint_arg = '--dns-endpoint' if dns_endpoint else ''
790
742
  command = (
791
743
  'gcloud container clusters get-credentials'
792
- f' {args.cluster} --location={location} --dns-endpoint'
793
- f' --project={args.project} && kubectl config view && kubectl config'
744
+ f' {cluster} --location={location} {dns_endpoint_arg}'
745
+ f' --project={project} && kubectl config view && kubectl config'
794
746
  ' set-context --current --namespace=default'
795
747
  )
796
- task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
797
- return_code = run_command_with_updates_retry(command, task, verbose=False)
748
+ task = f'get-credentials-dns-endpoint to cluster {cluster}'
749
+ return run_command_with_updates(command, task, verbose=False)
798
750
 
799
- if return_code != 0:
800
- xpk_print(f'{task} returned ERROR {return_code}')
801
- xpk_exit(return_code)
802
-
803
- return_code = test_and_retry_credentials_with_dns_logic(args)
804
- xpk_print('Finished get-credentials and kubectl setup.')
805
751
 
806
- return return_code
752
def _are_credentials_valid() -> bool:
  """Probe the current kubectl credentials by running `kubectl get pods`.

  Returns:
    True if the probe command exits with return code 0, False otherwise.
  """
  probe_return_code = run_command_with_updates(
      'kubectl get pods', 'Test kubectl credentials'
  )
  credentials_ok = probe_return_code == 0
  return credentials_ok
@@ -0,0 +1,170 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import pytest
18
+ from .testing.commands_tester import CommandsTester
19
+ from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
20
+ from pytest_mock import MockerFixture
21
+
22
+
23
@pytest.fixture(autouse=True)
def commands_tester(mocker: MockerFixture) -> CommandsTester:
  """Build a CommandsTester bound to the command runners of xpk.core.cluster."""
  patched_runner_paths = {
      "run_command_for_value_path": "xpk.core.cluster.run_command_for_value",
      "run_command_with_updates_path": (
          "xpk.core.cluster.run_command_with_updates"
      ),
  }
  return CommandsTester(mocker=mocker, **patched_runner_paths)
30
+
31
+
32
@pytest.fixture(autouse=True)
def mock_location(mocker: MockerFixture):
  """Patch xpk.core.cluster.get_cluster_location to return `us-central1`."""
  location_lookup_path = "xpk.core.cluster.get_cluster_location"
  mocker.patch(location_lookup_path, return_value="us-central1")
37
+
38
+
39
@pytest.fixture(autouse=True)
def command_args(mocker: MockerFixture):
  """Provide a mock args object with cluster, project and zone preset."""
  args_stub = mocker.Mock(cluster="cluster", project="project", zone="zone")
  return args_stub
42
+
43
+
44
def test_get_cluster_credentials_returns_1_when_retrieval_command_fails(
    commands_tester: CommandsTester, command_args
):
  """A failing get-credentials command makes the function return 1."""
  failing_result = (1, "")
  commands_tester.set_result_for_command(
      failing_result, "gcloud container clusters get-credentials"
  )

  return_code = get_cluster_credentials(command_args)

  assert return_code == 1
51
+
52
+
53
def test_get_cluster_credentials_returns_0_when_retrieval_succeeds(
    commands_tester: CommandsTester, command_args
):
  """A successful get-credentials command makes the function return 0."""
  successful_result = (0, "")
  commands_tester.set_result_for_command(
      successful_result, "gcloud container clusters get-credentials"
  )

  return_code = get_cluster_credentials(command_args)

  assert return_code == 0
60
+
61
+
62
def test_get_cluster_credentials_does_not_retry_with_dns_when_retrieval_succeeds(
    commands_tester: CommandsTester, command_args
):
  """No get-credentials call without dns-endpoint happens when kubectl works."""
  commands_tester.set_result_for_command(
      (0, ""), "gcloud container clusters get-credentials --dns-endpoint"
  )
  commands_tester.set_result_for_command((0, ""), "kubectl get pods")

  get_cluster_credentials(command_args)

  credential_commands = commands_tester.get_matching_commands(
      "gcloud container clusters get-credentials"
  )
  non_dns_endpoint_commands = []
  for executed in credential_commands:
    if "dns-endpoint" not in executed:
      non_dns_endpoint_commands.append(executed)
  assert len(non_dns_endpoint_commands) == 0
78
+
79
+
80
def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
    commands_tester: CommandsTester, command_args
):
  """A failing kubectl probe triggers one get-credentials call without dns-endpoint."""
  commands_tester.set_result_for_command(
      (0, ""), "gcloud container clusters get-credentials --dns-endpoint"
  )
  commands_tester.set_result_for_command((1, ""), "kubectl get pods")

  get_cluster_credentials(command_args)

  credential_commands = commands_tester.get_matching_commands(
      "gcloud container clusters get-credentials"
  )
  non_dns_endpoint_commands = []
  for executed in credential_commands:
    if "dns-endpoint" not in executed:
      non_dns_endpoint_commands.append(executed)
  assert len(non_dns_endpoint_commands) == 1
96
+
97
+
98
def test_update_cluster_with_lustre_driver_if_necessary_with_default_port_runs_correct_checks(
    commands_tester: CommandsTester, command_args
):
  """Without the legacy port option only the driver-enabled check is run."""
  describe_result = (0, "True")
  commands_tester.set_result_for_command(
      describe_result, "gcloud container clusters describe"
  )
  command_args.enable_legacy_lustre_port = None

  update_cluster_with_lustre_driver_if_necessary(command_args)

  driver_enabled_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
  )
  executed_commands = commands_tester.get_matching_commands()
  assert executed_commands == [driver_enabled_check]
114
+
115
+
116
def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_correct_checks(
    commands_tester: CommandsTester, command_args
):
  """With the legacy port option both the driver and legacy-port checks are run."""
  describe_result = (0, "True")
  commands_tester.set_result_for_command(
      describe_result, "gcloud container clusters describe"
  )
  command_args.enable_legacy_lustre_port = True

  update_cluster_with_lustre_driver_if_necessary(command_args)

  driver_enabled_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
  )
  legacy_port_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enableLegacyLustrePort)"'
  )
  executed_commands = commands_tester.get_matching_commands()
  assert executed_commands == [driver_enabled_check, legacy_port_check]
139
+
140
+
141
def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
    commands_tester: CommandsTester, command_args
):
  """Without the legacy port option the update enables the LustreCsiDriver addon."""
  update_result = (0, "")
  commands_tester.set_result_for_command(
      update_result, "gcloud container clusters update"
  )
  command_args.enable_legacy_lustre_port = None

  update_gke_cluster_with_lustre_driver_enabled(command_args)

  expected_update_command = (
      "gcloud container clusters update cluster --project=project"
      " --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED"
  )
  executed_commands = commands_tester.get_matching_commands()
  assert executed_commands == [expected_update_command]
155
+
156
+
157
def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
    commands_tester: CommandsTester, command_args
):
  """With the legacy port option the update passes --enable-legacy-lustre-port."""
  update_result = (0, "")
  commands_tester.set_result_for_command(
      update_result, "gcloud container clusters update"
  )
  command_args.enable_legacy_lustre_port = True

  update_gke_cluster_with_lustre_driver_enabled(command_args)

  expected_update_command = (
      "gcloud container clusters update cluster --project=project"
      " --location=us-central1 --quiet --enable-legacy-lustre-port"
  )
  executed_commands = commands_tester.get_matching_commands()
  assert executed_commands == [expected_update_command]
xpk/core/commands.py CHANGED
@@ -195,16 +195,13 @@ def run_command_with_updates(command, task, verbose=True) -> int:
195
195
  return_code = child.poll()
196
196
  if return_code is None:
197
197
  xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
198
- time.sleep(1)
199
- i += 1
198
+ time.sleep(10)
199
+ i += 10
200
200
  else:
201
201
  xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
202
202
  return return_code
203
203
  else:
204
- xpk_print(
205
- f'Task: `{task}` is implemented by `{command}`, hiding output unless'
206
- ' there is an error.'
207
- )
204
+ xpk_print(f'Task: `{task}` is implemented by `{command}`')
208
205
  try:
209
206
  subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
210
207
  except subprocess.CalledProcessError as e:
@@ -277,10 +274,7 @@ def run_command_for_value(
277
274
  return return_code, f'{out_str}\n{err_str}'
278
275
  else:
279
276
  if not quiet:
280
- xpk_print(
281
- f'Task: `{task}` is implemented by `{command}`, hiding output unless'
282
- ' there is an error.'
283
- )
277
+ xpk_print(f'Task: `{task}` is implemented by `{command}`')
284
278
  try:
285
279
  output = subprocess.check_output(
286
280
  command,