xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. integration/gcluster_a3mega_test.py +11 -0
  2. integration/gcluster_a3ultra_test.py +11 -0
  3. integration/gcluster_a4_test.py +11 -0
  4. xpk/commands/cluster.py +57 -21
  5. xpk/commands/cluster_gcluster.py +25 -5
  6. xpk/commands/cluster_gcluster_test.py +11 -2
  7. xpk/commands/cluster_test.py +233 -12
  8. xpk/commands/config.py +3 -5
  9. xpk/commands/kind.py +1 -1
  10. xpk/commands/storage.py +8 -10
  11. xpk/commands/workload.py +28 -11
  12. xpk/commands/workload_test.py +3 -3
  13. xpk/core/blueprint/blueprint_generator.py +70 -33
  14. xpk/core/blueprint/blueprint_test.py +9 -0
  15. xpk/core/capacity.py +46 -8
  16. xpk/core/capacity_test.py +32 -1
  17. xpk/core/cluster.py +37 -57
  18. xpk/core/cluster_test.py +95 -0
  19. xpk/core/commands.py +4 -10
  20. xpk/core/config.py +9 -2
  21. xpk/core/gcloud_context.py +18 -12
  22. xpk/core/gcloud_context_test.py +111 -1
  23. xpk/core/kjob.py +6 -9
  24. xpk/core/kueue_manager.py +192 -32
  25. xpk/core/kueue_manager_test.py +132 -4
  26. xpk/core/nodepool.py +21 -29
  27. xpk/core/nodepool_test.py +17 -15
  28. xpk/core/scheduling.py +16 -1
  29. xpk/core/scheduling_test.py +85 -6
  30. xpk/core/system_characteristics.py +77 -19
  31. xpk/core/system_characteristics_test.py +80 -5
  32. xpk/core/telemetry.py +263 -0
  33. xpk/core/telemetry_test.py +211 -0
  34. xpk/main.py +31 -13
  35. xpk/parser/cluster.py +48 -9
  36. xpk/parser/cluster_test.py +42 -3
  37. xpk/parser/workload.py +12 -0
  38. xpk/parser/workload_test.py +4 -4
  39. xpk/telemetry_uploader.py +29 -0
  40. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  41. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  42. xpk/utils/console.py +41 -10
  43. xpk/utils/console_test.py +106 -0
  44. xpk/utils/feature_flags.py +7 -1
  45. xpk/utils/file.py +4 -1
  46. xpk/utils/topology.py +4 -0
  47. xpk/utils/user_agent.py +35 -0
  48. xpk/utils/user_agent_test.py +44 -0
  49. xpk/utils/user_input.py +48 -0
  50. xpk/utils/user_input_test.py +92 -0
  51. xpk/utils/validation.py +0 -11
  52. xpk/utils/versions.py +31 -0
  53. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
  54. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
  55. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
  56. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
  57. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
  58. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
xpk/core/blueprint/blueprint_generator.py CHANGED
@@ -19,10 +19,13 @@ import shutil
  from typing import Optional

  from ruamel import yaml
+ from packaging.version import parse

  from ...utils.console import xpk_exit, xpk_print
+ from ...utils.versions import ReleaseChannel
  from ...utils.file import ensure_directory_exists

+
  from ..capacity import (
      H100_DEVICE_TYPE,
      B200_DEVICE_TYPE,
@@ -84,6 +87,8 @@ class BlueprintGenerator:
        region: str,
        zone: str,
        auth_cidr: str,
+       cluster_version: str,
+       release_channel: ReleaseChannel,
        prefix: str = "",
        num_nodes: int = 2,
        pods_ip_cidr_range: str = "10.4.0.0/14",
@@ -142,11 +147,17 @@ class BlueprintGenerator:
          },
      )

+     sanitized_version = cluster_version.replace("-", "+", 1)
+     version = parse(sanitized_version)
+     version_prefix = f"{version.major}.{version.minor}"
      gke_cluster = DeploymentModule(
          id="gke_cluster",
          source="modules/scheduler/gke-cluster",
          use=[primary_vpc_name, gpu_subnets_name],
          settings={
+             "release_channel": release_channel.value,
+             "version_prefix": version_prefix,
+             "min_master_version": cluster_version,
              "prefix_with_deployment_name": False,
              "name_suffix": cluster_name,
              "enable_private_endpoint": False,
@@ -171,6 +182,16 @@ class BlueprintGenerator:
          },
          outputs=["instructions"],
      )
+     if release_channel != ReleaseChannel.RAPID:
+       gke_cluster.set_setting(
+           "maintenance_exclusions",
+           [{
+               "name": "no-minor-or-node-upgrades-indefinite",
+               "start_time": "2024-12-01T00:00:00Z",
+               "end_time": "2026-01-16T00:00:00Z",
+               "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+           }],
+       )

      group_placement_0 = DeploymentModule(
          id="group_placement_0",
@@ -215,6 +236,9 @@ class BlueprintGenerator:
      else:
        a3_megagpu_pool_0.update_settings({"static_node_count": num_nodes})

+     if release_channel == ReleaseChannel.RAPID:
+       a3_megagpu_pool_0.set_setting("auto_upgrade", True)
+
      set_placement_policy = capacity_type != CapacityType.SPOT
      workload = DeploymentModule(
          id="workload_component_install",
@@ -391,6 +415,8 @@ class BlueprintGenerator:
        zone: str,
        auth_cidr: str,
        system_node_pool_machine_type: str,
+       cluster_version: str,
+       release_channel: ReleaseChannel,
        reservation: Optional[str | None] = None,
        gcs_bucket: Optional[str | None] = None,
        num_nodes: int = 2,
@@ -480,28 +506,19 @@ class BlueprintGenerator:
            },
          },
      )
+
+     sanitized_version = cluster_version.replace("-", "+", 1)
+     version = parse(sanitized_version)
+     version_prefix = f"{version.major}.{version.minor}"
      cluster_id = f"{cluster_name}-a3-ultragpu-cluster"
      a3_ultra_cluster = DeploymentModule(
          id=cluster_id,
          source="modules/scheduler/gke-cluster",
          use=[net_0_id],
          settings={
-             "release_channel": (
-                 "UNSPECIFIED"
-                 if capacity_type == CapacityType.FLEX_START
-                 else "RAPID"
-             ),
-             "version_prefix": "1.32.",
-             "maintenance_exclusions": (
-                 []
-                 if capacity_type == CapacityType.FLEX_START
-                 else [{
-                     "name": "no-minor-or-node-upgrades-indefinite",
-                     "start_time": "2024-12-01T00:00:00Z",
-                     "end_time": "2025-12-22T00:00:00Z",
-                     "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
-                 }]
-             ),
+             "release_channel": release_channel.value,
+             "version_prefix": version_prefix,
+             "min_cluster_version": cluster_version,
              "prefix_with_deployment_name": False,
              "name_suffix": cluster_name,
              "system_node_pool_machine_type": system_node_pool_machine_type,
@@ -537,6 +554,17 @@ class BlueprintGenerator:
          },
          outputs=["instructions"],
      )
+     if release_channel != ReleaseChannel.RAPID:
+       a3_ultra_cluster.set_setting(
+           "maintenance_exclusions",
+           [{
+               "name": "no-minor-or-node-upgrades-indefinite",
+               "start_time": "2024-12-01T00:00:00Z",
+               "end_time": "2026-01-16T00:00:00Z",
+               "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+           }],
+       )
+
      system, _ = get_system_characteristics_by_device_type(a3ultra_device_type)
      if system is None:
        xpk_print(
@@ -584,6 +612,9 @@ class BlueprintGenerator:
      else:
        gpu_pool.update_settings({"static_node_count": num_nodes})

+     if release_channel == ReleaseChannel.RAPID:
+       gpu_pool.set_setting("auto_upgrade", True)
+
      workload_manager_install_id = "workload-manager-install"
      workload_manager_install = DeploymentModule(
          id=workload_manager_install_id,
@@ -674,6 +705,8 @@ class BlueprintGenerator:
        zone: str,
        auth_cidr: str,
        system_node_pool_machine_type: str,
+       cluster_version: str,
+       release_channel: ReleaseChannel,
        reservation: Optional[str | None] = None,
        gcs_bucket: Optional[str | None] = None,
        num_nodes: int = 2,
@@ -761,12 +794,19 @@ class BlueprintGenerator:
            },
          },
      )
+
+     sanitized_version = cluster_version.replace("-", "+", 1)
+     version = parse(sanitized_version)
+     version_prefix = f"{version.major}.{version.minor}"
      cluster_id = f"{cluster_name}-a4-cluster"
      a4_cluster = DeploymentModule(
          id=cluster_id,
          source="modules/scheduler/gke-cluster",
          use=[net_0_id],
          settings={
+             "release_channel": release_channel.value,
+             "version_prefix": version_prefix,
+             "min_cluster_version": cluster_version,
              "system_node_pool_machine_type": system_node_pool_machine_type,
              "system_node_pool_node_count": {
                  "total_min_nodes": system_node_pool_min_node_count,
@@ -791,25 +831,20 @@ class BlueprintGenerator:
                  " alias_ip_range=[]}],"
                  f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
              ),
-             "version_prefix": "1.32.",
-             "release_channel": (
-                 "UNSPECIFIED"
-                 if capacity_type == CapacityType.FLEX_START
-                 else "RAPID"
-             ),
-             "maintenance_exclusions": (
-                 []
-                 if capacity_type == CapacityType.FLEX_START
-                 else [{
-                     "name": "no-minor-or-node-upgrades-indefinite",
-                     "start_time": "2024-12-01T00:00:00Z",
-                     "end_time": "2025-12-22T00:00:00Z",
-                     "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
-                 }]
-             ),
          },
          outputs=["instructions"],
      )
+     if release_channel != ReleaseChannel.RAPID:
+       a4_cluster.set_setting(
+           "maintenance_exclusions",
+           [{
+               "name": "no-minor-or-node-upgrades-indefinite",
+               "start_time": "2024-12-01T00:00:00Z",
+               "end_time": "2026-01-16T00:00:00Z",
+               "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+           }],
+       )
+
      system, _ = get_system_characteristics_by_device_type(a4_device_type)
      if system is None:
        xpk_print(
@@ -859,6 +894,9 @@ class BlueprintGenerator:
      else:
        gpu_pool.update_settings({"static_node_count": num_nodes})

+     if release_channel == ReleaseChannel.RAPID:
+       gpu_pool.set_setting("auto_upgrade", True)
+
      workload_manager_install_id = "workload-manager-install"
      workload_manager_install = DeploymentModule(
          id=workload_manager_install_id,
@@ -1019,7 +1057,6 @@ class BlueprintGenerator:
        "enable_flex_start": True,
        "enable_queued_provisioning": True,
        "autoscaling_total_min_nodes": 0,
-       "release_channel": "UNSPECIFIED",
        "auto_repair": False,
        "auto_upgrade": False,
      }
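
All three blueprint builders (A3 Mega, A3 Ultra, A4) now derive the GKE version settings from the caller-supplied cluster_version and release_channel instead of hard-coding "1.32." and a RAPID/UNSPECIFIED channel. A minimal sketch of the version handling, assuming a hypothetical GKE-style version string (the value below is illustrative, not taken from this diff):

# Sketch only: how the sanitize/parse step treats a GKE-style version string.
from packaging.version import parse

cluster_version = "1.33.2-gke.1240000"  # hypothetical example value
# "-gke.N" is not a valid PEP 440 release suffix; swapping the first "-" for "+"
# turns it into a local version label that packaging can parse.
sanitized_version = cluster_version.replace("-", "+", 1)
version = parse(sanitized_version)  # Version('1.33.2+gke.1240000')
version_prefix = f"{version.major}.{version.minor}"  # "1.33"

The computed prefix feeds version_prefix, while the full string is passed through as min_master_version (A3 Mega) or min_cluster_version (A3 Ultra, A4).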
xpk/core/blueprint/blueprint_test.py CHANGED
@@ -22,6 +22,7 @@ import ruamel.yaml
  from xpk.core.blueprint.blueprint_definitions import Blueprint
  from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
  from xpk.core.capacity import CapacityType
+ from xpk.utils.versions import ReleaseChannel

  yaml = ruamel.yaml.YAML()

@@ -60,6 +61,8 @@ def test_generate_a3_mega_blueprint():
      reservation="test-reservation",
      capacity_type=CapacityType.RESERVATION,
      system_node_pool_min_node_count=5,
+     release_channel=ReleaseChannel.RAPID,
+     cluster_version="1.2.3",
  )

  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
@@ -99,6 +102,8 @@ def test_generate_a3_mega_spot_blueprint():
      auth_cidr="10.0.0.0/32",
      capacity_type=CapacityType.SPOT,
      system_node_pool_min_node_count=5,
+     release_channel=ReleaseChannel.RAPID,
+     cluster_version="1.2.3",
  )

  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
@@ -135,6 +140,8 @@ def test_generate_a3_ultra_blueprint():
      capacity_type=CapacityType.RESERVATION,
      gcs_bucket="test-bucket",
      prefix="testdir",
+     release_channel=ReleaseChannel.RAPID,
+     cluster_version="1.2.3",
  )
  with open(a3_ultra_yaml_test_path, encoding="utf-8") as stream:
    ctk_yaml = yaml.load(stream)
@@ -180,6 +187,8 @@ def test_generate_a4_blueprint():
      capacity_type=CapacityType.RESERVATION,
      gcs_bucket="test-bucket",
      prefix="testdir",
+     release_channel=ReleaseChannel.RAPID,
+     cluster_version="1.2.3",
  )
  with open(a4_yaml_test_path, encoding="utf-8") as stream:
    ctk_yaml = yaml.load(stream)
xpk/core/capacity.py CHANGED
@@ -115,9 +115,12 @@ def get_reservation_maintenance_interval(
  Returns:
    0 if successful and 1 otherwise.
  """
+   reservation_project, reservation_name = get_reservation_project_and_name(
+       reservation, project
+   )
  command = (
-     f'gcloud beta compute reservations describe {reservation}'
-     f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
+     f'gcloud beta compute reservations describe {reservation_name}'
+     f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
  )
  return_code, output = run_command_for_value(
      command, 'Get reservation maintenance interval'
@@ -139,9 +142,12 @@ def get_reservation_placement_policy(
  Returns:
    0 if successful and 1 otherwise.
  """
+   reservation_project, reservation_name = get_reservation_project_and_name(
+       reservation, project
+   )
  command = (
-     f'gcloud beta compute reservations describe {reservation}'
-     f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
+     f'gcloud beta compute reservations describe {reservation_name}'
+     f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
  )
  return_code, output = run_command_for_value(
      command, 'Get reservation placement policy'
@@ -156,9 +162,12 @@ def get_reservation_deployment_type(
      reservation: str, zone: str, project: str
  ) -> str:
    """Get reservation deployment type."""
+   reservation_project, reservation_name = get_reservation_project_and_name(
+       reservation, project
+   )
  command = (
-     f'gcloud beta compute reservations describe {reservation}'
-     f' --project={project} --zone={zone} --format="value(deploymentType)"'
+     f'gcloud beta compute reservations describe {reservation_name}'
+     f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
  )
  return_code, output = run_command_for_value(
      command, 'Get reservation deployment type', dry_run_return_val='DENSE'
@@ -178,9 +187,12 @@ def verify_reservation_exists(args) -> int:
  Returns:
    0 if successful and 1 otherwise.
  """
+   reservation_project, reservation_name = get_reservation_project_and_name(
+       args.reservation, args.project
+   )
  command = (
-     f'gcloud beta compute reservations describe {args.reservation}'
-     f' --project={args.project} --zone={args.zone}'
+     f'gcloud beta compute reservations describe {reservation_name}'
+     f' --project={reservation_project} --zone={args.zone}'
  )
  return_code = run_command_with_updates(command, 'Describe reservation')
  if return_code != 0:
@@ -264,3 +276,29 @@ def get_capacity_node_selectors_from_capacity_type(
      )
      return_code = 1
  return node_selector, return_code
+
+
+ def get_reservation_project_and_name(
+     reservation_name_or_path: str, cluster_project: str
+ ) -> tuple[str, str]:
+   """Get the reservation project and name.
+
+   Args:
+     reservation_name_or_path: either reservation name or reservation path in format
+       projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
+     cluster_project: the cluster project
+
+   Returns:
+     Tuple with reservation project and reservation name.
+   """
+   if '/' not in reservation_name_or_path:
+     return cluster_project, reservation_name_or_path
+   reservation_parts = reservation_name_or_path.split('/')
+   if (
+       len(reservation_parts) != 4
+       or reservation_parts[0] != 'projects'
+       or reservation_parts[2] != 'reservations'
+   ):
+     xpk_print('Unable to parse reservation: ', reservation_name_or_path)
+     xpk_exit(1)
+   return reservation_parts[1], reservation_parts[3]
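
The reservation helpers above now accept either a bare reservation name or a full shared-reservation path, and the gcloud describe commands use the owning project rather than always the cluster project. A short sketch of the new parsing, with hypothetical project and reservation names:

# Sketch only; the names below are hypothetical.
from xpk.core.capacity import get_reservation_project_and_name

# Bare name: the cluster project is reused.
assert get_reservation_project_and_name("my-res", "cluster-proj") == (
    "cluster-proj",
    "my-res",
)

# Shared reservation path: the owning project comes from the path, so the
# describe command runs with --project=res-proj.
assert get_reservation_project_and_name(
    "projects/res-proj/reservations/my-res", "cluster-proj"
) == ("res-proj", "my-res")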
xpk/core/capacity_test.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.

  import pytest
  from unittest.mock import MagicMock, patch
- from .capacity import get_reservation_deployment_type
+ from .capacity import get_reservation_deployment_type, get_reservation_project_and_name


  @patch('xpk.core.capacity.xpk_print')
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
      reservation='reservation', zone='zone', project='project'
  )
  assert result == 'DENSE'
+
+
+ def test_get_reservation_project_and_name_parses_local_reservation():
+   project, name = get_reservation_project_and_name(
+       'test-reservation', 'cluster-project'
+   )
+
+   assert project == 'cluster-project'
+   assert name == 'test-reservation'
+
+
+ def test_get_reservation_project_and_name_parses_shared_reservation():
+   project, name = get_reservation_project_and_name(
+       'projects/reservation-project/reservations/test-reservation',
+       'cluster-project',
+   )
+
+   assert project == 'reservation-project'
+   assert name == 'test-reservation'
+
+
+ @patch('xpk.core.capacity.xpk_print')
+ def test_get_reservation_project_and_name_fails_for_invalid_reservation(
+     xpk_print: MagicMock, mocker
+ ):
+   with pytest.raises(SystemExit):
+     get_reservation_project_and_name(
+         'invalid/reservation',
+         'cluster-project',
+     )
+   assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]
xpk/core/cluster.py CHANGED
@@ -729,78 +729,58 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
  return 0


- def test_and_retry_credentials_with_dns_logic(args) -> int:
-   """Tests kubectl credentials and retries with default settings if a DNS error is found.
+ def get_cluster_credentials(args) -> int:
+   """Run cluster configuration command to set the kubectl config.

  Args:
    args: user provided arguments for running the command.

  Returns:
-     0 if credentials are valid after retrying, 1 otherwise.
+     0 if successful and 1 otherwise.
  """
-
-   xpk_print('Testing credentials with kubectl...')
-   kubectl_command = 'kubectl get pods'
-   kubectl_return_code, kubectl_output = run_command_for_value(
-       kubectl_command, 'kubectl get pods'
-   )
-   if kubectl_return_code == 0:
-     xpk_print('Credentials test succeeded.')
-     return 0
-
-   dns_endpoint_error = (
-       'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
-       ' is disabled'
-   )
-   if dns_endpoint_error not in kubectl_output:
-     xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
-     xpk_exit(kubectl_return_code)
-   xpk_print(
-       'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
-       ' flag...'
-   )
-
  location = get_cluster_location(args.project, args.cluster, args.zone)
-   without_dns_command = (
-       'gcloud container clusters get-credentials'
-       f' {args.cluster} --location={location}'
-       f' --project={args.project} &&'
-       ' kubectl config view && kubectl config set-context --current'
-       ' --namespace=default'
-   )
-   return_code = run_command_with_updates(
-       without_dns_command, 'get-credentials to cluster', verbose=False
+
+   return_code = _get_credentials(
+       project=args.project,
+       cluster=args.cluster,
+       location=location,
+       dns_endpoint=True,
  )
  if return_code != 0:
-     xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
-     xpk_exit(return_code)
-   return 0
-
+     return return_code
+
+   if not _are_credentials_valid():
+     xpk_print('Detected error. Retrying without --dns-endpoint flag...')
+     return_code = _get_credentials(
+         project=args.project,
+         cluster=args.cluster,
+         location=location,
+         dns_endpoint=False,
+     )
+     if return_code != 0:
+       return return_code

- def get_cluster_credentials(args) -> int:
-   """Run cluster configuration command to set the kubectl config.
+   xpk_print('Finished get-credentials and kubectl setup.')
+   return 0

-   Args:
-     args: user provided arguments for running the command.

-   Returns:
-     0 if successful and 1 otherwise.
-   """
-   location = get_cluster_location(args.project, args.cluster, args.zone)
+ def _get_credentials(
+     project: str, cluster: str, location: str, dns_endpoint: bool
+ ) -> int:
+   dns_endpoint_arg = '--dns-endpoint' if dns_endpoint else ''
  command = (
      'gcloud container clusters get-credentials'
-       f' {args.cluster} --location={location} --dns-endpoint'
-       f' --project={args.project} && kubectl config view && kubectl config'
+       f' {cluster} --location={location} {dns_endpoint_arg}'
+       f' --project={project} && kubectl config view && kubectl config'
      ' set-context --current --namespace=default'
  )
-   task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
-   return_code = run_command_with_updates_retry(command, task, verbose=False)
+   task = f'get-credentials-dns-endpoint to cluster {cluster}'
+   return run_command_with_updates(command, task, verbose=False)

-   if return_code != 0:
-     xpk_print(f'{task} returned ERROR {return_code}')
-     xpk_exit(return_code)

-   return_code = test_and_retry_credentials_with_dns_logic(args)
-   xpk_print('Finished get-credentials and kubectl setup.')
-
-   return return_code
+ def _are_credentials_valid() -> bool:
+   kubectl_command = 'kubectl get pods'
+   kubectl_return_code = run_command_with_updates(
+       kubectl_command, 'Test kubectl credentials'
+   )
+   return kubectl_return_code == 0
xpk/core/cluster_test.py ADDED
@@ -0,0 +1,95 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import pytest
+ from .testing.commands_tester import CommandsTester
+ from .cluster import get_cluster_credentials
+ from pytest_mock import MockerFixture
+
+
+ @pytest.fixture(autouse=True)
+ def commands_tester(mocker: MockerFixture) -> CommandsTester:
+   return CommandsTester(
+       mocker=mocker,
+       run_command_for_value_path="xpk.core.cluster.run_command_for_value",
+       run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
+   )
+
+
+ @pytest.fixture(autouse=True)
+ def mock_location(mocker: MockerFixture):
+   mocker.patch(
+       "xpk.core.cluster.get_cluster_location", return_value="us-central1"
+   )
+
+
+ @pytest.fixture(autouse=True)
+ def command_args(mocker: MockerFixture):
+   return mocker.Mock(cluster="cluster", project="project", zone="zone")
+
+
+ def test_get_cluster_credentials_returns_1_when_retrieval_command_fails(
+     commands_tester: CommandsTester, command_args
+ ):
+   commands_tester.set_result_for_command(
+       (1, ""), "gcloud container clusters get-credentials"
+   )
+   assert get_cluster_credentials(command_args) == 1
+
+
+ def test_get_cluster_credentials_returns_0_when_retrieval_succeeds(
+     commands_tester: CommandsTester, command_args
+ ):
+   commands_tester.set_result_for_command(
+       (0, ""), "gcloud container clusters get-credentials"
+   )
+   assert get_cluster_credentials(command_args) == 0
+
+
+ def test_get_cluster_credentials_does_not_retry_with_dns_when_retrieval_succeeds(
+     commands_tester: CommandsTester, command_args
+ ):
+   commands_tester.set_result_for_command(
+       (0, ""), "gcloud container clusters get-credentials --dns-endpoint"
+   )
+   commands_tester.set_result_for_command((0, ""), "kubectl get pods")
+   get_cluster_credentials(command_args)
+   non_dns_endpoint_commands = [
+       c
+       for c in commands_tester.get_matching_commands(
+           "gcloud container clusters get-credentials"
+       )
+       if "dns-endpoint" not in c
+   ]
+   assert len(non_dns_endpoint_commands) == 0
+
+
+ def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
+     commands_tester: CommandsTester, command_args
+ ):
+   commands_tester.set_result_for_command(
+       (0, ""), "gcloud container clusters get-credentials --dns-endpoint"
+   )
+   commands_tester.set_result_for_command((1, ""), "kubectl get pods")
+   get_cluster_credentials(command_args)
+   non_dns_endpoint_commands = [
+       c
+       for c in commands_tester.get_matching_commands(
+           "gcloud container clusters get-credentials"
+       )
+       if "dns-endpoint" not in c
+   ]
+   assert len(non_dns_endpoint_commands) == 1
xpk/core/commands.py CHANGED
@@ -195,16 +195,13 @@ def run_command_with_updates(command, task, verbose=True) -> int:
      return_code = child.poll()
      if return_code is None:
        xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
-         time.sleep(1)
-         i += 1
+         time.sleep(10)
+         i += 10
      else:
        xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
        return return_code
  else:
-     xpk_print(
-         f'Task: `{task}` is implemented by `{command}`, hiding output unless'
-         ' there is an error.'
-     )
+     xpk_print(f'Task: `{task}` is implemented by `{command}`')
    try:
      subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
@@ -277,10 +274,7 @@ def run_command_for_value(
      return return_code, f'{out_str}\n{err_str}'
  else:
    if not quiet:
-       xpk_print(
-           f'Task: `{task}` is implemented by `{command}`, hiding output unless'
-           ' there is an error.'
-       )
+       xpk_print(f'Task: `{task}` is implemented by `{command}`')
    try:
      output = subprocess.check_output(
          command,
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
  from ..utils.console import xpk_print

  # This is the version for XPK PyPI package
- __version__ = 'v0.14.3'
+ __version__ = 'v0.15.0'
  XPK_CURRENT_VERSION = __version__
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

@@ -30,6 +30,8 @@ CONFIGS_KEY = 'configs'
  CFG_BUCKET_KEY = 'cluster-state-gcs-bucket'
  CLUSTER_NAME_KEY = 'cluster-name'
  PROJECT_KEY = 'project-id'
+ CLIENT_ID_KEY = 'client-id'
+ SEND_TELEMETRY_KEY = 'send-telemetry'
  ZONE_KEY = 'zone'
  KJOB_BATCH_IMAGE = 'batch-image'
  KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
@@ -45,6 +47,8 @@ DEFAULT_KEYS = [
    CFG_BUCKET_KEY,
    CLUSTER_NAME_KEY,
    PROJECT_KEY,
+   CLIENT_ID_KEY,
+   SEND_TELEMETRY_KEY,
    ZONE_KEY,
    GKE_ENDPOINT_KEY,
    DEPENDENCIES_KEY,
@@ -82,7 +86,7 @@ class XpkConfig:
      with open(self._config, encoding='utf-8', mode='w') as stream:
        yaml.dump(config_yaml, stream)

-   def set(self, key: str, value: str) -> None:
+   def set(self, key: str, value: str | None) -> None:
      if key not in self._allowed_keys:
        xpk_print(f'Key {key} is not an allowed xpk config key.')
        return
@@ -114,3 +118,6 @@ class XpkConfig:
        return None
      val: dict[str, str] = config_yaml[CONFIGS_KEY]
      return val
+
+
+ xpk_config = XpkConfig()
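
config.py also gains two telemetry-related keys and a shared module-level XpkConfig instance. A minimal usage sketch, assuming only the set() API shown above (the written values are hypothetical placeholders):

# Sketch only; values are hypothetical.
from xpk.core.config import CLIENT_ID_KEY, SEND_TELEMETRY_KEY, xpk_config

# Both keys are listed in DEFAULT_KEYS, so set() accepts them; set() now also
# tolerates a None value.
xpk_config.set(CLIENT_ID_KEY, "example-client-id")
xpk_config.set(SEND_TELEMETRY_KEY, "true")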