xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py CHANGED

@@ -15,8 +15,8 @@ limitations under the License.
 """
 
 from typing import List
-from ..utils.console import get_user_input, xpk_print
-from ..utils.topology import get_topology_product, is_topology_valid
+from ..utils.console import ask_for_user_consent, xpk_print
+from .scheduling import get_placement_policy_name, is_placement_policy_supported
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
     H100_MEGA_DEVICE_TYPE,
@@ -28,10 +28,9 @@ from .capacity import (
 from .commands import run_command_for_value, run_commands
 from .gcloud_context import GkeServerConfig, get_cluster_location, zone_to_region
 from .resources import (
-    CLUSTER_CONFIGMAP_YAML,
-    CLUSTER_RESOURCES_CONFIGMAP,
+    ConfigMapType,
     check_cluster_resources,
-    create_or_update_cluster_configmap,
+    update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
 
@@ -110,6 +109,7 @@ def run_gke_node_pool_create_command(
       existing_node_pool_names, args.cluster, desired_node_pool_count
   )
 
+  node_pools_to_delete = []
   node_pools_to_remain = []
   delete_commands = []
   delete_task_names = []
@@ -186,14 +186,10 @@
   # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
   # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
   if delete_commands:
-    will_delete = True
-    if node_pools_to_delete and not args.force:
-      will_delete = get_user_input(
-          f'Planning to delete {len(node_pools_to_delete)} node pools including'
-          f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
-          ' (no):\n'
-      )
-    if not will_delete:
+    if node_pools_to_delete and not ask_for_user_consent(
+        f'Planning to delete {len(node_pools_to_delete)} node pools including'
+        f' {node_pools_to_delete}. \nDo you wish to delete?'
+    ):
       xpk_print(
           'You have requested to not delete the existing nodepools in the'
           ' cluster. There will be no change to the cluster.'
@@ -215,18 +211,15 @@
 
   # Enable Workload Identity on existing Nodepools
   if update_WI_commands:
-    will_update_WI = True
-    if node_pools_to_update_WI and not args.force:
-      will_update_WI = get_user_input(
-          'Planning to enable Workload Identity Federation on'
-          f' {len(node_pools_to_update_WI)} existing node pools including'
-          f' {node_pools_to_update_WI}.This immediately enables Workload'
-          ' Identity Federation for GKE for any workloads running in the node'
-          ' pool. Also, xpk does not support disabling Workload Identity on'
-          ' clusters that have it enabled already \nDo you wish to update: y'
-          ' (yes) / n (no):\n'
-      )
-    if not will_update_WI:
+    will_update_WI = not node_pools_to_update_WI or ask_for_user_consent(
+        'Planning to enable Workload Identity Federation on'
+        f' {len(node_pools_to_update_WI)} existing node pools including'
+        f' {node_pools_to_update_WI}. This immediately enables Workload'
+        ' Identity Federation for GKE for any workloads running in the node'
+        ' pool. Also, xpk does not support disabling Workload Identity on'
+        ' clusters that have it enabled already \nDo you wish to update?'
+    )
+    if will_update_WI:
       for i, command in enumerate(update_WI_commands):
         xpk_print(
             f'To complete {update_WI_task_names[i]} we are executing {command}'
@@ -253,22 +246,23 @@
     )
   else:
     resources_data = f'{device_type}: "0"'
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_yml = CLUSTER_CONFIGMAP_YAML.format(
-      args=args, name=resources_configmap_name, data=resources_data
+  return_code = update_cluster_configmap(
+      cluster_name=args.cluster,
+      config_map_type=ConfigMapType.RESOURCES,
+      data=resources_data,
   )
-  configmap_yml = {}
-  configmap_yml[resources_configmap_name] = resources_yml
-  return_code = create_or_update_cluster_configmap(configmap_yml)
   if return_code != 0:
     return 1
 
   placement_args = ''
-  if system.requires_workload_policy and is_topology_valid(system.topology):
-    placement_policy = (
-        f'{system.device_type}-{system.topology}-placement-policy'
+  if is_placement_policy_supported(system):
+    placement_policy = get_placement_policy_name(system)
+    ensure_resource_policy_exists(
+        resource_policy_name=placement_policy,
+        project=args.project,
+        zone=args.zone,
+        topology=system.topology,
     )
-    ensure_resource_policy_exists(placement_policy, args, system.topology)
     placement_args = f' --placement-policy={placement_policy}'
 
@@ -290,16 +284,16 @@
   )
   if system.accelerator_type == AcceleratorType.TPU:
     command += f' --node-version={gke_node_pool_version}'
-    topology_product = get_topology_product(system.topology)
     if capacity_type == CapacityType.FLEX_START:
       command += ' --num-nodes=0'
-    elif topology_product > 1:
+    else:
       command += f' --num-nodes={system.vms_per_slice}'
     command += (
        f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
    )
 
-    if topology_product > 1:
+    # --tpu-topology should not be set for single-host node pools
+    if system.vms_per_slice > 1:
       # --placement-type=COMPACT enables group placement policy which
       # is mutually exclusive with workload policy, --tpu-topology should
       # also not be passed when workload policy is used
@@ -319,7 +313,7 @@
     command += (
         ' --accelerator'
         f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
-        f' --no-enable-autoupgrade --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
+        f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
     )
     if device_type == H100_MEGA_DEVICE_TYPE:
       for i in range(1, 9):
@@ -595,14 +589,14 @@ def get_desired_node_pool_names(
 
 
 def ensure_resource_policy_exists(
-    resource_policy_name: str, args, topology: str
+    resource_policy_name: str, project: str, zone: str, topology: str
 ) -> None:
   return_code, _ = run_command_for_value(
       (
           'gcloud compute resource-policies describe'
          f' {resource_policy_name} '
-          f'--project={args.project} '
-          f'--region={zone_to_region(args.zone)}'
+          f'--project={project} '
+          f'--region={zone_to_region(zone)}'
      ),
      'Retrieve resource policy',
  )
@@ -613,7 +607,7 @@ def ensure_resource_policy_exists(
   return_code, _ = run_command_for_value(
       (
          'gcloud compute resource-policies create workload-policy'
-          f' {resource_policy_name} --project={args.project} --region={zone_to_region(args.zone)} --type=HIGH_THROUGHPUT'
+          f' {resource_policy_name} --project={project} --region={zone_to_region(zone)} --type=HIGH_THROUGHPUT'
          f' --accelerator-topology={topology}'
      ),
      'Create resource policy',
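
Two refactors recur through this file: the ad-hoc `get_user_input` prompt with a mutable `will_delete`/`will_update_WI` flag becomes a single `ask_for_user_consent` guard (the `args.force` check no longer appears at these call sites, so it has presumably moved inside the helper), and `ensure_resource_policy_exists` now takes `project`/`zone` strings instead of the whole argparse namespace. Below is a minimal standalone sketch of the consent pattern; `ask_for_user_consent` here is a stand-in, since the real helper lives in `xpk/utils/console.py` and is not shown in this diff:

```python
# Stand-in for xpk.utils.console.ask_for_user_consent (assumed behavior:
# appends the y/n suffix that the old call sites spelled out by hand).
def ask_for_user_consent(prompt: str) -> bool:
  return input(f'{prompt} y (yes) / n (no): ').strip().lower() in ('y', 'yes')


def maybe_delete(node_pools_to_delete: list[str]) -> bool:
  # New shape from the diff: the prompt is folded into one guard
  # expression, so no mutable will_delete flag is threaded through.
  if node_pools_to_delete and not ask_for_user_consent(
      f'Planning to delete {len(node_pools_to_delete)} node pools including'
      f' {node_pools_to_delete}. \nDo you wish to delete?'
  ):
    print('No change to the cluster.')
    return False
  return True  # caller proceeds with the real gcloud deletions
```

An empty `node_pools_to_delete` short-circuits the `and`, so the user is never prompted when there is nothing to delete, which is the same behavior the old `will_delete = True` default provided.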
xpk/core/nodepool_test.py CHANGED

@@ -20,7 +20,7 @@ from xpk.core.nodepool import (
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
 )
-from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
+from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 
 CLUSTER_NAME = "running-cucumber"
 
@@ -96,7 +96,12 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p
   mock = mocker.patch(
       "xpk.core.nodepool.run_command_for_value", return_value=(0, "")
   )
-  ensure_resource_policy_exists("resource-policy", args, "2x2x1")
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
   mock.assert_called_once()
 
 
@@ -108,7 +113,12 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
   mock = mocker.patch(
       "xpk.core.nodepool.run_command_for_value", side_effect=[(1, ""), (0, "")]
   )
-  ensure_resource_policy_exists("resource-policy", args, "2x2x1")
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
   assert mock.call_count == 2
   assert mock.call_args_list[0].args[1] == "Retrieve resource policy"
 
@@ -125,7 +135,12 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
       "xpk.core.nodepool.run_command_for_value",
       side_effect=[(1, ""), (1, "")],
   )
-  ensure_resource_policy_exists("resource-policy", args, "2x2x1")
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
 
 
 @pytest.fixture
@@ -145,22 +160,24 @@ def mock_nodepool_dependencies(mocker):
       "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
   )
   mocker.patch("xpk.core.nodepool.run_commands", return_value=0)
-  mocker.patch("xpk.core.nodepool.get_user_input", return_value=True)
-  mock_is_topology_valid = mocker.patch("xpk.core.nodepool.is_topology_valid")
+  mocker.patch("xpk.core.nodepool.ask_for_user_consent", return_value=True)
+  mock_is_placement_policy_supported = mocker.patch(
+      "xpk.core.nodepool.is_placement_policy_supported"
+  )
   mock_ensure_resource_policy = mocker.patch(
       "xpk.core.nodepool.ensure_resource_policy_exists"
   )
-  return mock_is_topology_valid, mock_ensure_resource_policy
+  return mock_is_placement_policy_supported, mock_ensure_resource_policy
 
 
 def test_placement_policy_created_for_gpu_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for GPUs with a valid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -170,13 +187,15 @@
   )
   system = SystemCharacteristics(
       topology="N/A",
-      vms_per_slice=1,
+      vms_per_slice=2,
       gke_accelerator="nvidia-h100-80gb",
       gce_machine_type="a3-highgpu-8g",
       chips_per_vm=8,
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -188,10 +207,10 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for GPUs with an invalid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = False
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -200,13 +219,15 @@
   )
   system = SystemCharacteristics(
       topology="N/A",
-      vms_per_slice=1,
+      vms_per_slice=2,
       gke_accelerator="nvidia-h100-80gb",
       gce_machine_type="a3-highgpu-8g",
       chips_per_vm=8,
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -218,10 +239,10 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for tpu7x with a valid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type="tpu7x-8",
       device_type=None,
@@ -232,7 +253,7 @@
   )
   system = SystemCharacteristics(
       topology="2x2x1",
-      vms_per_slice=1,
+      vms_per_slice=2,
       gke_accelerator="tpu7x",
       gce_machine_type="tpu7x-standard-4t",
       chips_per_vm=4,
@@ -240,6 +261,7 @@
       device_type="tpu7x-8",
       requires_workload_policy=True,
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -251,14 +273,14 @@ def test_placement_policy_not_created_for_non7x_tpu(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for non-tpu7x TPUs."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type="v6e",
       device_type=None,
-      num_slices=1,
+      num_slices=2,
       cluster="test-cluster",
       project="test-project",
       zone="us-central1-a",
@@ -272,6 +294,7 @@
       accelerator_type=AcceleratorType.TPU,
       device_type="v6e-4",
       supports_sub_slicing=True,
+      docker_platform=DockerPlatform.ARM,
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
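
One detail worth noting in these tests: `mocker.patch` must target the name where it is looked up, not where it is defined, which is why the fixture patches `xpk.core.nodepool.ask_for_user_consent` rather than the original in `xpk.utils.console`. A self-contained sketch (throwaway module names, not xpk code) of why patching the defining module would miss a `from ... import` reference:

```python
import sys
import types
from unittest import mock

# Fake "defining" module, playing the role of xpk/utils/console.py.
console = types.ModuleType('fake_console')
console.ask = lambda prompt: False
sys.modules['fake_console'] = console

# Fake "importing" module, mirroring `from ..utils.console import ...`
# in nodepool.py: the from-import copies the function reference.
nodepool = types.ModuleType('fake_nodepool')
nodepool.ask = console.ask
sys.modules['fake_nodepool'] = nodepool

# Patching the defining module leaves nodepool's copied reference intact...
with mock.patch('fake_console.ask', return_value=True):
  assert nodepool.ask('delete?') is False

# ...so tests patch the importing module, as the fixture above does.
with mock.patch('fake_nodepool.ask', return_value=True):
  assert nodepool.ask('delete?') is True
```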
xpk/core/pathways.py CHANGED

@@ -333,3 +333,26 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
       xpk_print(f'Delete Workload request returned ERROR {return_code}')
       return False
   return True
+
+
+def get_pathways_machine_types(
+    project: str, zone: str
+) -> tuple[int, list[str]]:
+  # Identify machine types with sufficient allocatable capacity to
+  # schedule the Pathways pod. This filter ensures the selected node
+  # is large enough to handle the control plane workload plus GKE
+  # system overhead.
+  min_memory_mb = 233 * 1024
+  command = (
+      'gcloud compute machine-types list --filter "guestCpus >= 49 AND memoryMb'
+      f' >= {min_memory_mb} AND zone = \'{zone}\'" --format="value(name)"'
+      f' --project={project}'
+  )
+  return_code, result = run_command_for_value(
+      command=command,
+      task='Retrieve available pathways machine types',
+      dry_run_return_val='n2-standard-64',
+  )
+  if return_code != 0:
+    return return_code, []
+  return 0, result.strip().splitlines()
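
A hedged usage sketch for the new helper (the project and zone values are placeholders): callers get back a `(return_code, machine_types)` tuple rather than an exception, so the error branch is explicit:

```python
from xpk.core.pathways import get_pathways_machine_types

return_code, machine_types = get_pathways_machine_types(
    project='my-project', zone='us-central1-a'
)
if return_code != 0:
  # The gcloud lookup failed; there are no candidate machine types.
  raise SystemExit(return_code)
# Names that pass the >= 49 vCPU and >= 233 * 1024 MB memory filter,
# e.g. 'n2-standard-64' (also the dry-run placeholder above).
print(machine_types)
```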
xpk/core/pathways_test.py ADDED

@@ -0,0 +1,57 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from unittest.mock import MagicMock
+from xpk.core.testing.commands_tester import CommandsTester
+from .pathways import get_pathways_machine_types
+
+
+@pytest.fixture(autouse=True)
+def commands_tester(mocker: MagicMock):
+  return CommandsTester(
+      mocker,
+      run_command_with_updates_path=(
+          "xpk.core.pathways.run_command_with_updates"
+      ),
+      run_command_for_value_path="xpk.core.pathways.run_command_for_value",
+  )
+
+
+def test_get_pathways_machine_types_when_command_fails_returns_failed_exit_code(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 1
+  assert machine_types == []
+
+
+def test_get_pathways_machine_types_when_command_suceeds_returns_machine_types(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (0, "abc\ncba"), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 0
+  assert machine_types == ["abc", "cba"]
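
These tests lean on the `(return_code, output)` tuple convention of xpk's command runners: failures come back as data, not exceptions, so `CommandsTester` can stub either branch with a canned tuple. A rough standalone sketch of that convention; `run_command_for_value_sketch` is illustrative only, not xpk's implementation (POSIX shell assumed):

```python
import subprocess


def run_command_for_value_sketch(command: str) -> tuple[int, str]:
  # Failures are returned rather than raised, so callers branch on the
  # code and tests can substitute canned (code, output) tuples.
  proc = subprocess.run(command, shell=True, capture_output=True, text=True)
  return proc.returncode, proc.stdout


rc, out = run_command_for_value_sketch("printf 'abc\\ncba\\n'")
assert rc == 0
assert out.strip().splitlines() == ['abc', 'cba']
```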
xpk/core/resources.py CHANGED

@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 from dataclasses import dataclass
+import os
 
 from ..utils.console import xpk_print
 from ..utils.file import write_tmp_file
@@ -30,9 +31,13 @@ from .capacity import (
 from .commands import run_command_for_value, run_commands
 from .config import XPK_CURRENT_VERSION
 from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
+from enum import Enum
+
+
+class ConfigMapType(Enum):
+  RESOURCES = 'resources-configmap'
+  METADATA = 'metadata-configmap'
 
-CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
-CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
 
 CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
 apiVersion: v1
@@ -50,7 +55,15 @@ class AutoprovisioningConfig:
   maximum_chips: int
 
 
-def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
+def get_config_map_name(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> str:
+  return f'{cluster_name}-{config_map_type.value}'
+
+
+def get_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.
 
   Args:
@@ -59,15 +72,17 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   Returns:
     key:value pairs stored in cluster ConfigMap.
   """
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
   command = (
       'kubectl get configmap'
-      f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
+      f' {config_map_name} -o=custom-columns="ConfigData:data"'
+      ' --no-headers=true'
   )
 
   return_code, return_value = run_command_for_value(
       command,
       'GKE Cluster Get ConfigMap',
-      dry_run_return_val='map[]',
+      dry_run_return_val=_get_dry_run_config_map_value(config_map_type),
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -89,9 +104,18 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   return config_map
 
 
+def _get_dry_run_config_map_value(config_map_type: ConfigMapType) -> str:
+  default_value = 'map[]'
+
+  if config_map_type == ConfigMapType.RESOURCES:
+    return os.getenv('DRY_RUN_RESOURCES_CONFIG_MAP', default_value)
+
+  return default_value
+
+
 def create_cluster_configmaps(
     args,
-    system,
+    system: SystemCharacteristics,
     tensorboard_config: dict,
     autoprovisioning_config: AutoprovisioningConfig | None,
 ) -> int:
@@ -127,9 +151,11 @@
   resources_data = (
       f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
   )
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+  resources_configmap_name = get_config_map_name(
+      args.cluster, ConfigMapType.RESOURCES
+  )
   resources_yml = CLUSTER_CONFIGMAP_YAML.format(
-      args=args, name=resources_configmap_name, data=resources_data
+      name=resources_configmap_name, data=resources_data
   )
   configmap_yml[resources_configmap_name] = resources_yml
 
@@ -148,15 +174,17 @@
   # Reservation ID if applicable.
   if capacity_type == CapacityType.RESERVATION:
     metadata += f'\n  {RESERVATION_CONFIG_KEY}: {args.reservation}'
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+  metadata_configmap_name = get_config_map_name(
+      args.cluster, ConfigMapType.METADATA
+  )
   metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
-      args=args, name=metadata_configmap_name, data=metadata
+      name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml)
+  return _create_or_update_cluster_configmap(configmap_yml)
 
 
-def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
+def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -187,7 +215,18 @@
   return 0
 
 
-def check_cluster_resources(args, system) -> tuple[bool, bool]:
+def update_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType, data: str
+) -> int:
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
+  yaml = CLUSTER_CONFIGMAP_YAML.format(name=config_map_name, data=data)
+  config_map_dict = {config_map_name: yaml}
+  return _create_or_update_cluster_configmap(config_map_dict)
+
+
+def check_cluster_resources(
+    args, system: SystemCharacteristics
+) -> tuple[bool, bool]:
   """Check if cluster has resources of a specified device_type/gke_accelerator.
   This check will be skipped if <args.cluster>-<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.
 
@@ -200,8 +239,9 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if resources in the cluster should be checked, False otherwise.
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_config_map = get_cluster_configmap(resources_configmap_name)
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -216,20 +256,35 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
 
 
 def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
-  """Get systemCharcteristics based on the cluster resources configMap
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns system characteristics
+    returns system characteristics, or None if not found.
+  """
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  return get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+
+
+def get_cluster_system_characteristics_from_config_map(
+    resources_config_map: dict[str, str] | None,
+) -> SystemCharacteristics | None:
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
+  Returns:
+    returns system characteristics, or None if not found.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(resources_configmap_name)
 
-  if cluster_config_map is None:
+  if resources_config_map is None:
     return None
 
-  for key in cluster_config_map:
+  for key in resources_config_map:
     system, result_code = get_system_characteristics_by_device_type(key)
     if result_code == 0:
       return system
@@ -238,20 +293,22 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
 
 
 def get_cluster_capacity_type(args) -> CapacityType | None:
-  """Get systemCharcteristics based on the cluster resources configMap
+  """Get CapacityType based on the cluster metadata configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns system characteristics
+    returns CapacityType, or None if not found.
   """
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  metadata_configmap_name = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
 
-  if cluster_config_map is None:
+  if metadata_configmap_name is None:
     return None
 
-  capacityValue = cluster_config_map.get('capacity_type')
+  capacityValue = metadata_configmap_name.get('capacity_type')
   if capacityValue is not None:
     return CapacityType[capacityValue.upper()]
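
The net effect of this file's refactor: ConfigMap names are derived in one place from an enum instead of being hand-assembled as `f'{cluster}-{SUFFIX}'` at each call site, and the write path funnels through `update_cluster_configmap` into the now-private `_create_or_update_cluster_configmap`. A minimal sketch of the naming scheme (enum values copied from the diff; the cluster name is a placeholder):

```python
from enum import Enum


class ConfigMapType(Enum):
  RESOURCES = 'resources-configmap'
  METADATA = 'metadata-configmap'


def get_config_map_name(cluster_name: str, config_map_type: ConfigMapType) -> str:
  # Single source of truth for the '<cluster>-<suffix>' convention.
  return f'{cluster_name}-{config_map_type.value}'


assert get_config_map_name('demo', ConfigMapType.RESOURCES) == 'demo-resources-configmap'
assert get_config_map_name('demo', ConfigMapType.METADATA) == 'demo-metadata-configmap'
```

Note also that `CLUSTER_CONFIGMAP_YAML.format(...)` no longer receives `args=args`, which is why every `.format` call site changed in the same release: the template presumably dropped its `{args...}` placeholders.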