xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. xpk/commands/batch.py +8 -8
  2. xpk/commands/cluster.py +19 -19
  3. xpk/commands/cluster_gcluster.py +2 -1
  4. xpk/commands/common.py +7 -3
  5. xpk/commands/info.py +12 -12
  6. xpk/commands/inspector.py +1 -1
  7. xpk/commands/job.py +42 -12
  8. xpk/commands/kjob_common.py +2 -1
  9. xpk/commands/storage.py +6 -3
  10. xpk/commands/workload.py +28 -15
  11. xpk/core/blueprint/blueprint_generator.py +7 -7
  12. xpk/core/blueprint/blueprint_test.py +218 -0
  13. xpk/core/capacity.py +3 -1
  14. xpk/core/cluster.py +14 -8
  15. xpk/core/cluster_private.py +8 -2
  16. xpk/core/commands.py +13 -10
  17. xpk/core/config.py +3 -4
  18. xpk/core/config_test.py +71 -0
  19. xpk/core/docker_image.py +14 -5
  20. xpk/core/docker_manager.py +1 -1
  21. xpk/core/docker_resources.py +10 -5
  22. xpk/core/filestore.py +7 -2
  23. xpk/core/gcloud_context.py +2 -2
  24. xpk/core/jobset.py +1 -1
  25. xpk/core/kjob.py +7 -3
  26. xpk/core/kueue.py +28 -8
  27. xpk/core/nap.py +5 -5
  28. xpk/core/network.py +1 -1
  29. xpk/core/nodepool.py +8 -3
  30. xpk/core/nodepool_test.py +82 -0
  31. xpk/core/pathways.py +6 -2
  32. xpk/core/ray.py +1 -1
  33. xpk/core/resources.py +18 -14
  34. xpk/core/scheduling.py +4 -0
  35. xpk/core/storage.py +14 -14
  36. xpk/core/system_characteristics.py +1 -1
  37. xpk/core/workload.py +11 -0
  38. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  39. xpk/core/workload_decorators/storage_decorator.py +2 -1
  40. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  41. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  42. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  43. xpk/core/workload_test.py +28 -0
  44. xpk/main.py +12 -10
  45. xpk/parser/cluster.py +110 -49
  46. xpk/parser/common.py +45 -36
  47. xpk/parser/storage.py +12 -13
  48. xpk/parser/workload.py +57 -39
  49. xpk/utils/console.py +2 -1
  50. xpk/utils/execution_context.py +28 -0
  51. xpk/utils/file.py +25 -10
  52. xpk/utils/network.py +4 -0
  53. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
  54. xpk-0.13.0.dist-info/RECORD +101 -0
  55. xpk-0.11.0.dist-info/RECORD +0 -95
  56. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
  57. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
  58. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
  59. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/core/kjob.py CHANGED
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
  from kubernetes.client.rest import ApiException

  from ..utils import templates
+ from ..utils.execution_context import is_dry_run
  from ..utils.console import xpk_exit, xpk_print
  from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
  from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
@@ -277,7 +278,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
  job_spec = rdma_decorator.decorate_kjob_template(job_spec)
  job_template_dict = yaml.safe_load(yml_string)
  job_template_dict["template"] = job_spec
- return yaml.dump(job_template_dict, sort_keys=False)
+ yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
+ return yaml_result


  def create_job_template_instance(
@@ -367,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
  def prepare_kjob(args: Namespace) -> int:
  system = get_cluster_system_characteristics(args)

- k8s_api_client = setup_k8s_env(args)
- storages = get_auto_mount_storages(k8s_api_client)
+ storages = []
+ if not is_dry_run():
+ k8s_api_client = setup_k8s_env(args)
+ storages = get_auto_mount_storages(k8s_api_client)

  service_account = ""
  if len(storages) > 0:
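Note: is_dry_run() comes from the new xpk/utils/execution_context.py module (+28 lines in the file list). A minimal sketch of what such a module could provide, assuming a simple module-level flag; apart from is_dry_run(), the names here are illustrative rather than the released implementation:

# Hypothetical sketch of xpk/utils/execution_context.py (only is_dry_run()
# is confirmed by the imports in this diff; the setter name is assumed).
_dry_run: bool = False


def set_dry_run(dry_run: bool) -> None:
  """Record whether the current xpk invocation runs with --dry-run."""
  global _dry_run
  _dry_run = dry_run


def is_dry_run() -> bool:
  """True when commands should be skipped or stubbed out instead of executed."""
  return _dry_run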
xpk/core/kueue.py CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
  KUEUE_VERSION = 'v0.12.2'
  CLUSTER_QUEUE_NAME = 'cluster-queue'
  LOCAL_QUEUE_NAME = 'multislice-queue'
- WAIT_FOR_KUEUE_TIMEOUT = '5m'
+ WAIT_FOR_KUEUE_TIMEOUT = '10m'
  MEMORY_SIZE_PER_VM = 1.2
  MIN_MEMORY_LIMIT_SIZE = 4096

@@ -89,6 +89,10 @@ metadata:
  name: dws-config
  spec:
  provisioningClassName: queued-provisioning.gke.io
+ podSetUpdates:
+ nodeSelector:
+ - key: autoscaling.gke.io/provisioning-request
+ valueFromProvisioningClassDetail: ResizeRequestName
  managedResources:
  - {managed_resource}
  ---
@@ -320,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
  return return_code


- def get_kueue_version(args) -> (int, str):
+ def get_kueue_version(args) -> tuple[int, str]:
  command = 'kubectl kueue version'
  task = 'Get kueue version on server'
  return_code, val = run_command_for_value(command, task, args)
@@ -432,6 +436,8 @@ def install_kueue_crs(
  cluster_hardware_name=cluster_hardware_name,
  resource_type=resource_type,
  total_chips=total_chips,
+ cpu_limit=args.cpu_limit,
+ memory_limit=args.memory_limit,
  )
  topology_label = ''
  if system.device_type in [
@@ -470,7 +476,7 @@ def install_kueue_crs(
  yml_string = topology_yaml + yml_string

  tmp = write_tmp_file(yml_string)
- command = f'kubectl apply -f {str(tmp.file.name)}'
+ command = f'kubectl apply -f {str(tmp)}'

  task = 'Applying Kueue Custom Resources'
  return_code = run_command_with_updates_retry(command, task, args)
@@ -480,7 +486,7 @@ def install_kueue_crs(


  def get_kueue_covered_resources_config(
- cluster_hardware_name, resource_type, total_chips
+ cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
  ) -> str:
  """Gets Kueue covered resources configuration.

@@ -493,17 +499,31 @@ def get_kueue_covered_resources_config(
  A string of Kueue covered resources configuration.
  """
  config_format = """
- - coveredResources: ["{resource_type}"]
+ - coveredResources: {resource_types}
  flavors:
  - name: {cluster_hardware_name}
  resources:
  - name: "{resource_type}"
- nominalQuota: {total_chips}
- """
+ nominalQuota: {total_chips}"""
+ resource_types = [resource_type]
+ if cpu_limit:
+ config_format = config_format + """
+ - name: "cpu"
+ nominalQuota: {cpu_limit}"""
+ resource_types.append('cpu')
+ if memory_limit:
+ config_format = config_format + """
+ - name: "memory"
+ nominalQuota: {memory_limit}"""
+ resource_types.append('memory')
+
  config_string = config_format.format(
  cluster_hardware_name=cluster_hardware_name,
+ resource_types=resource_types,
  resource_type=resource_type,
  total_chips=total_chips,
+ cpu_limit=cpu_limit,
+ memory_limit=memory_limit,
  )
  return config_string

@@ -532,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
  memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
  )
  tmp = write_tmp_file(yml_string)
- command = f'kubectl apply -f {str(tmp.file.name)}'
+ command = f'kubectl apply -f {str(tmp)}'

  task = 'Updating Kueue Controller Manager resources'
  return_code = run_command_with_updates_retry(command, task, args)
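For illustration, a hedged example of calling the extended get_kueue_covered_resources_config helper; the argument values below are hypothetical and the exact YAML layout comes from the template string above:

from xpk.core.kueue import get_kueue_covered_resources_config

# Hypothetical inputs; when cpu_limit or memory_limit is falsy the matching
# quota entry is omitted and coveredResources stays [resource_type].
config = get_kueue_covered_resources_config(
    cluster_hardware_name='2xv4-8',
    resource_type='google.com/tpu',
    total_chips=8,
    cpu_limit=480,
    memory_limit='2000Gi',
)
# coveredResources is rendered as ['google.com/tpu', 'cpu', 'memory'] and the
# flavor gains nominalQuota entries for cpu and memory next to the chip quota.
print(config)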
xpk/core/nap.py CHANGED
@@ -37,6 +37,7 @@ from .resources import (
  )
  from .scheduling import get_total_chips_requested_from_args
  from .system_characteristics import AcceleratorType, SystemCharacteristics
+ from typing import cast

  AUTOPROVISIONING_CONFIG_FILE = """
  management:
@@ -249,7 +250,7 @@ def create_autoprovisioning_config(
  zones=f'- {args.zone}',
  )
  autoprovisioning_config = AutoprovisioningConfig(
- config_filename=write_tmp_file(yml_string).name,
+ config_filename=write_tmp_file(yml_string),
  minimum_chips=minimum,
  maximum_chips=maximum,
  )
@@ -269,9 +270,6 @@ def is_autoprovisioning_enabled(
  bool is true if autoprovisioning is enabled, false otherwise.
  int of 0 if successful and 1 otherwise.
  """
- # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
- if args.use_pathways:
- return False, 0

  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -339,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
  )
  return node_selector_args, 1

- return_code, capacity_type_str = get_value_from_map(
+ return_code, optional_capacity_type_str = get_value_from_map(
  CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
  )
  if return_code != 0:
  return node_selector_args, return_code
+ # return_code==0 implies capacity_type is defined
+ capacity_type_str = cast(str, optional_capacity_type_str)

  if capacity_type_str == CapacityType.RESERVATION.name:
  return_code, args.reservation = get_value_from_map(
xpk/core/network.py CHANGED
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
  """
  yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
  tmp = write_tmp_file(yml_string)
- command = f'kubectl apply -f {str(tmp.file.name)}'
+ command = f'kubectl apply -f {str(tmp)}'

  return_code = run_command_with_updates(
  command, 'GKE Cluster Create Network Config', args
xpk/core/nodepool.py CHANGED
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
  )
  configmap_yml = {}
  configmap_yml[resources_configmap_name] = resources_yml
- return_code = create_or_update_cluster_configmap(configmap_yml)
+ return_code = create_or_update_cluster_configmap(
+ configmap_yml, args.dry_run
+ )
  if return_code != 0:
  return 1

@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
  f' --region={zone_to_region(args.zone)} --format="value(locations)"'
  )
  return_code, nodepool_zone = run_command_for_value(
- command, 'Get Node Pool Zone', args
+ command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
  )
  if return_code != 0:
  xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
  for i, command in enumerate(commands):
  xpk_print(f'To complete {task_names[i]} we are executing {command}')
  max_return_code = run_commands(
- commands, 'Update GKE node pools to default RAPID GKE version', task_names
+ commands,
+ 'Update GKE node pools to default RAPID GKE version',
+ task_names,
+ dry_run=args.dry_run,
  )
  if max_return_code != 0:
  xpk_print(
xpk/core/nodepool_test.py ADDED
@@ -0,0 +1,82 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from xpk.core.nodepool import get_desired_node_pool_names
+
+ CLUSTER_NAME = "running-cucumber"
+
+
+ def node_pool_name(number: int) -> str:
+ return f"{CLUSTER_NAME}-np-{number}"
+
+
+ def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+ result = get_desired_node_pool_names(
+ existing_node_pool_names=[node_pool_name(0)],
+ cluster_name=CLUSTER_NAME,
+ desired_node_pool_count=2,
+ )
+
+ expected_result = [node_pool_name(0), node_pool_name(1)]
+ assert set(result) == set(expected_result)
+
+
+ def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+ result = get_desired_node_pool_names(
+ existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+ cluster_name=CLUSTER_NAME,
+ desired_node_pool_count=1,
+ )
+
+ expected_result = [node_pool_name(0)]
+ assert set(result) == set(expected_result)
+
+
+ def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+ result = get_desired_node_pool_names(
+ existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+ cluster_name=CLUSTER_NAME,
+ desired_node_pool_count=3,
+ )
+
+ expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+ assert set(result) == set(expected_result)
+
+
+ def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+ result = get_desired_node_pool_names(
+ existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+ cluster_name=CLUSTER_NAME,
+ desired_node_pool_count=2,
+ )
+
+ expected_result = [node_pool_name(0), node_pool_name(3)]
+ assert set(result) == set(expected_result)
+
+
+ def test_compute_desired_node_pool_names_with_unknown_node_pools():
+ result = get_desired_node_pool_names(
+ existing_node_pool_names=[
+ "unknown-node-pool",
+ node_pool_name(0),
+ node_pool_name(3),
+ ],
+ cluster_name=CLUSTER_NAME,
+ desired_node_pool_count=2,
+ )
+
+ expected_result = [node_pool_name(0), node_pool_name(3)]
+ assert set(result) == set(expected_result)
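The tests above pin down the node-pool naming contract: pools named <cluster>-np-<i> are kept (lowest indices first when shrinking), unknown names are ignored, and missing indices are filled in when growing. A rough re-implementation consistent with these cases, for illustration only (not the actual xpk/core/nodepool.py code):

import re


def get_desired_node_pool_names_sketch(
    existing_node_pool_names: list[str],
    cluster_name: str,
    desired_node_pool_count: int,
) -> list[str]:
  """Hypothetical helper that satisfies the tests above."""
  pattern = re.compile(rf'^{re.escape(cluster_name)}-np-(\d+)$')
  # Consider only pools that already follow the <cluster>-np-<i> scheme.
  indices = sorted(
      int(m.group(1))
      for name in existing_node_pool_names
      if (m := pattern.match(name))
  )
  if len(indices) >= desired_node_pool_count:
    kept = indices[:desired_node_pool_count]
  else:
    # Fill up with the smallest unused indices, e.g. np-1 when np-0 and np-3
    # exist and three pools are desired.
    kept = list(indices)
    candidate = 0
    while len(kept) < desired_node_pool_count:
      if candidate not in kept:
        kept.append(candidate)
      candidate += 1
  return [f'{cluster_name}-np-{i}' for i in sorted(kept)]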
xpk/core/pathways.py CHANGED
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
  from ..core.gcloud_context import zone_to_region
  from ..core.nodepool import get_all_nodepools_programmatic
  from ..utils.console import xpk_exit, xpk_print
+ from ..utils.execution_context import is_dry_run
  from .system_characteristics import AcceleratorType, SystemCharacteristics


@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
  # Ensure the cluster and CPU nodepools were created with create-pathways
  all_node_pools = get_all_nodepools_programmatic(args)
  desired_pw_cpu_node_pools = {'cpu-np'}
- if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
+ if (
+ not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
+ and not is_dry_run()
+ ):
  xpk_print(
  'Cluster needs to be created with `xpk create-pathways` to run'
  ' Pathways workloads.'
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
  return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
  else:
  return_code = run_commands(
- commands, 'Delete Workload', task_names, batch=100
+ commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
  )

  if return_code != 0:
xpk/core/ray.py CHANGED
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
  )

  tmp = write_tmp_file(yml_string)
- command = f'kubectl apply -f {str(tmp.file.name)}'
+ command = f'kubectl apply -f {str(tmp)}'
  task = 'Applying RayCluster'
  retry_attempts = 1
  return_code = run_command_with_updates_retry(
xpk/core/resources.py CHANGED
@@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
  )

  return_code, return_value = run_command_for_value(
- command, 'GKE Cluster Get ConfigMap', args
+ command,
+ 'GKE Cluster Get ConfigMap',
+ args,
+ dry_run_return_val='map[]',
  )
  if return_code != 0:
  xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
  configs = return_value[4:-1].split(' ')

  for config in configs:
- key, value = config.strip().split(':')
- config_map[key] = value
+ parts = config.strip().split(':')
+ if len(parts) != 2:
+ continue
+ config_map[parts[0]] = parts[1]
  return config_map


@@ -108,13 +113,7 @@ def create_cluster_configmaps(
  device_type = system.device_type
  if system.accelerator_type == AcceleratorType['GPU']:
  resources_data = f'{device_type}: "{int(args.num_nodes)}"'
- elif (
- not args.enable_pathways
- and args.enable_autoprovisioning
- and autoprovisioning_config
- ):
- # Currently autoprovisioning is not supported with Pathways.
- # Auto provisioning will have variable topologies for a gke accelerator type.
+ elif args.enable_autoprovisioning and autoprovisioning_config:
  resources_data = (
  f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
  )
@@ -156,10 +155,12 @@ def create_cluster_configmaps(
  args=args, name=metadata_configmap_name, data=metadata
  )
  configmap_yml[metadata_configmap_name] = metadata_yml
- return create_or_update_cluster_configmap(configmap_yml)
+ return create_or_update_cluster_configmap(configmap_yml, args.dry_run)


- def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
+ def create_or_update_cluster_configmap(
+ configmap_yml: dict, dry_run: bool
+ ) -> int:
  """
  Args:
  configmap_yml: dict containing ConfigMap name and yml string.
@@ -171,13 +172,16 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
  task_names = []
  for configmap_name, yml_string in configmap_yml.items():
  tmp = write_tmp_file(yml_string)
- command = f'kubectl apply -f {str(tmp.file.name)}'
+ command = f'kubectl apply -f {str(tmp)}'
  commands.append(command)
  task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
  task_names.append(task_name)

  return_code = run_commands(
- commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names
+ commands,
+ 'GKE Cluster CreateOrUpdate ConfigMap(s)',
+ task_names,
+ dry_run=dry_run,
  )
  if return_code != 0:
  xpk_print(
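The parsing change above makes get_cluster_configmap tolerant of entries that do not split cleanly into key:value pairs (for example the empty string produced by the 'map[]' dry-run stub). A small standalone illustration of the same loop, using a made-up return_value:

# Illustrative only: return_value mimics the jsonpath '{.data}' output that the
# surrounding code slices with return_value[4:-1].
return_value = 'map[cpu-np:4 tpu-v4-8:2]'
config_map = {}
for config in return_value[4:-1].split(' '):
  parts = config.strip().split(':')
  if len(parts) != 2:  # skips '' from 'map[]' instead of raising ValueError
    continue
  config_map[parts[0]] = parts[1]
print(config_map)  # {'cpu-np': '4', 'tpu-v4-8': '2'}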
xpk/core/scheduling.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
  """

  from ..utils.console import xpk_print
+ from ..utils.execution_context import is_dry_run
  from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
  from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
  from .system_characteristics import (
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
  )
  return True

+ if is_dry_run():
+ return True
+
  # Check for gke accelerator type:
  missing_gke_accelerator_type = False
  if not cluster_config_map.get(system.gke_accelerator):
xpk/core/storage.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
  import os
  from argparse import Namespace
  from dataclasses import dataclass
- from typing import Any
+ from typing import Any, cast

  import ruamel.yaml
  from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
  Args:
  data: A dictionary containing the Storage resource definition.
  """
- metadata: k8s_client.V1ObjectMeta = data.get("metadata", {})
+ metadata = data.get("metadata", {})
  self.name = metadata.get("name")
  spec = data.get("spec", {})
- self.type: str = spec.get("type")
- self.auto_mount: bool = spec.get("auto_mount")
- self.mount_point: bool = spec.get("mount_point")
- self.readonly: bool = spec.get("readonly")
- self.manifest: str = spec.get("manifest")
- self.pvc: str = spec.get("pvc")
- self.pv: str = spec.get("pv")
- self.bucket: str = self._get_bucket()
+ self.type = spec.get("type")
+ self.auto_mount = spec.get("auto_mount")
+ self.mount_point = spec.get("mount_point")
+ self.readonly = spec.get("readonly")
+ self.manifest = spec.get("manifest")
+ self.pvc = spec.get("pvc")
+ self.pv = spec.get("pv")
+ self.bucket = self._get_bucket()

  def fields_as_list(self) -> list[str]:
  """
@@ -117,9 +117,9 @@ class Storage:
  return [
  self.name,
  self.type,
- self.auto_mount,
+ str(self.auto_mount),
  self.mount_point,
- self.readonly,
+ str(self.readonly),
  self.manifest,
  ]

@@ -133,7 +133,7 @@ class Storage:
  client = k8s_client.CoreV1Api()
  try:
  pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
- return pv.spec.csi.volume_handle
+ return cast(str, pv.spec.csi.volume_handle)
  except ApiException as e:
  xpk_print(
  f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
  client = k8s_client.CoreV1Api()
  try:
  pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
- return pv.spec.mount_options
+ return cast(list[str], pv.spec.mount_options)
  except ApiException as e:
  xpk_print(
  f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
xpk/core/system_characteristics.py CHANGED
@@ -55,7 +55,7 @@ class SystemCharacteristics:
  gke_accelerator: str
  gce_machine_type: str
  chips_per_vm: int
- accelerator_type: AcceleratorType # type: ignore
+ accelerator_type: int # TODO: use enums
  device_type: str

xpk/core/workload.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ import re
  from ..utils.console import xpk_exit, xpk_print
  from .commands import run_command_for_value
  from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
  xpk_print('Your workload did not complete successfully')
  return 125
  return 0
+
+
+ GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
+ """Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
+
+
+ def get_jobsets_list_gcp_link(project: str) -> str:
+ """Returns a link to Cloud Console JobSets list"""
+
+ return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
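For illustration, how these new module-level additions might be exercised; the sample names below are hypothetical:

from xpk.core.workload import GCP_NAME_FILTER_VALUE_REGEX, get_jobsets_list_gcp_link

# Only lowercase letters, digits and dashes satisfy the filter-value regex.
assert GCP_NAME_FILTER_VALUE_REGEX.fullmatch('my-workload-1') is not None
assert GCP_NAME_FILTER_VALUE_REGEX.fullmatch('Bad_Name!') is None

# Cloud Console link to the JobSets list for a (hypothetical) project.
print(get_jobsets_list_gcp_link('my-gcp-project'))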
xpk/core/workload_decorators/rdma_decorator.py CHANGED
@@ -18,7 +18,7 @@ import yaml
  from ...utils.yaml import literal_string


- def decorate_kjob_template(job_manifest) -> str:
+ def decorate_kjob_template(job_manifest: dict) -> dict:
  spec = (
  job_manifest.setdefault('spec', {})
  .setdefault('template', {})
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
  add_tolerations(job_manifest)
  update_gpu_containers(job_manifest)

- return yaml.dump(manifest, sort_keys=False)
+ yaml_str: str = yaml.dump(manifest, sort_keys=False)
+ return yaml_str


  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_decorators/storage_decorator.py CHANGED
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
  job_manifest = job['template']
  add_annotations(job_manifest, storages)
  add_volumes(job_manifest, storage_volumes)
- return yaml.dump(manifest, sort_keys=False)
+ yaml_result: str = yaml.dump(manifest, sort_keys=False)
+ return yaml_result


  def add_annotations(job_manifest, storages):
xpk/core/workload_decorators/tcpx_decorator.py CHANGED
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
  for job in manifest['spec']['replicatedJobs']:
  job_manifest = job['template']
  job_manifest = decorate_job(job_manifest)
- return yaml.dump(manifest, sort_keys=False)
+ yaml_str: str = yaml.dump(manifest, sort_keys=False)
+ return yaml_str


  def get_interfaces_annotation() -> dict:
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
  })
  volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
  volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+ volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
  volumes.append(
  {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
  )
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
  spec['initContainers'].append(tcpxo_daemon_container)


- def update_gpu_containers(job_manifest):
+ def update_gpu_containers(job_manifest) -> None:
  for container in job_manifest['spec']['template']['spec']['containers']:
  if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
  env: list = container.setdefault('env', [])