xpk 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff shows the content changes between two package versions as publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (46)
  1. xpk/commands/cluster.py +10 -11
  2. xpk/commands/cluster_gcluster.py +2 -1
  3. xpk/commands/common.py +3 -3
  4. xpk/commands/info.py +12 -12
  5. xpk/commands/job.py +12 -10
  6. xpk/commands/kjob_common.py +2 -1
  7. xpk/commands/storage.py +1 -1
  8. xpk/commands/workload.py +12 -6
  9. xpk/core/blueprint/blueprint_generator.py +7 -7
  10. xpk/core/blueprint/blueprint_test.py +218 -0
  11. xpk/core/capacity.py +3 -1
  12. xpk/core/cluster.py +9 -7
  13. xpk/core/cluster_private.py +5 -1
  14. xpk/core/commands.py +3 -3
  15. xpk/core/config.py +3 -4
  16. xpk/core/config_test.py +71 -0
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +1 -1
  19. xpk/core/filestore.py +7 -2
  20. xpk/core/gcloud_context.py +2 -2
  21. xpk/core/kjob.py +2 -1
  22. xpk/core/kueue.py +6 -2
  23. xpk/core/nap.py +4 -4
  24. xpk/core/nodepool_test.py +82 -0
  25. xpk/core/resources.py +1 -7
  26. xpk/core/storage.py +14 -14
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +11 -0
  29. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  30. xpk/core/workload_decorators/storage_decorator.py +2 -1
  31. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  32. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  33. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  34. xpk/core/workload_test.py +28 -0
  35. xpk/main.py +9 -10
  36. xpk/parser/cluster.py +67 -49
  37. xpk/parser/common.py +45 -36
  38. xpk/parser/storage.py +12 -13
  39. xpk/parser/workload.py +57 -39
  40. xpk/utils/console.py +2 -1
  41. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
  42. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/RECORD +46 -41
  43. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
  44. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py CHANGED
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
     0 if successful and 1 otherwise.
   """
   command = (
-      'kubectl apply --server-side -f'
-      f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
+      'kubectl apply --server-side --force-conflicts'
+      f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
   )
   task = f'Install Jobset on {args.cluster}'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
   )
   if err_code != 0:
     xpk_exit(err_code)
-  data = yaml.safe_load(val)
-  return data['items']  # pytype: disable=bad-return-type
+  data: dict[str, list[dict]] = yaml.safe_load(val)
+  return data['items']


 def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
@@ -445,7 +445,7 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
   args.project_number = project_id_to_project_number(args.project)

   config.load_kube_config()
-  return k8s_client.ApiClient()  # pytype: disable=bad-return-type
+  return k8s_client.ApiClient()


 def get_gpu_type_from_cluster(args) -> str:
@@ -817,9 +817,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   server_config_return_code, gke_server_config = get_gke_server_config(args)
   if server_config_return_code != 0:
     xpk_exit(server_config_return_code)
+  assert gke_server_config
+
   upgrade_master_return_code = upgrade_gke_control_plane_version(
       args,
-      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
+      gke_server_config.default_rapid_gke_version,
   )
   if upgrade_master_return_code > 0:
     xpk_print("Updating GKE cluster's control plane upgrade failed!")
@@ -828,7 +830,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   # Upgrade nodepools version after the master upgrade.
   node_pool_update_code = upgrade_gke_nodepools_version(
       args,
-      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
+      gke_server_config.default_rapid_gke_version,
   )
   if node_pool_update_code > 0:
     xpk_print('Upgrading nodepools version failed!')
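Note on the first hunk above: with kubectl server-side apply, --force-conflicts makes the apply take ownership of fields currently owned by another field manager instead of failing with a conflict, which matters when re-applying the JobSet manifests over an existing install. A minimal sketch of the resulting command string (the JOBSET_VERSION value here is a hypothetical placeholder, not the version xpk pins):

    JOBSET_VERSION = 'v0.8.0'  # hypothetical placeholder for illustration
    command = (
        'kubectl apply --server-side --force-conflicts'
        f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
    )
    print(command)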
xpk/core/cluster_private.py CHANGED
@@ -95,7 +95,11 @@ def add_current_machine_to_networks_if_needed(
         "Adding current machine's IP address to the authorized networks"
         ' failed!'
     )
-    return add_current_machine_to_networks_return_code, authorized_networks
+    return (
+        add_current_machine_to_networks_return_code,
+        False,
+        authorized_networks,
+    )

   return 0, is_current_machine_in_network, authorized_networks
xpk/core/commands.py CHANGED
@@ -274,9 +274,9 @@ def run_command_for_value(
   else:
     if not quiet:
       xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
-    out, err = child.communicate()
-    out, err = str(out, 'UTF-8'), str(err, 'UTF-8')
-    return return_code, f'{out}\n{err}'
+    out_bytes, err_bytes = child.communicate()
+    out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
+    return return_code, f'{out_str}\n{err_str}'
   else:
     if not quiet:
       xpk_print(
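The rename above avoids re-binding out and err from bytes to str, which re-types a variable mid-function and trips static checkers. A standalone sketch of the pattern (the byte values are stand-ins; child.communicate() actually returns a (bytes, bytes) pair):

    out_bytes, err_bytes = b'ok', b''  # stand-ins for child.communicate()
    out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
    print(f'{out_str}\n{err_str}')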
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.11.0'
+__version__ = 'v0.12.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

@@ -71,13 +71,12 @@ class XpkConfig:
     dir_path = '/'.join(self._config.split('/')[:-1])
     file.ensure_directory_exists(dir_path)

-    config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
     if not os.path.exists(self._config):
       return None

     with open(self._config, encoding='utf-8', mode='r') as stream:
       config_yaml: dict = yaml.load(stream)
-      return config_yaml
+    return config_yaml

   def _save_configs(self, config_yaml: dict) -> None:
     with open(self._config, encoding='utf-8', mode='w') as stream:
@@ -109,7 +108,7 @@ class XpkConfig:

   def get_all(
       self,
-  ) -> dict[str, dict[str, str] | str] | None:
+  ) -> dict[str, str] | None:
     config_yaml = self._open_configs()
     if config_yaml is None:
       return None
xpk/core/config_test.py ADDED
@@ -0,0 +1,71 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
+
+import os
+import pytest
+
+config_tmp_path = '/tmp/config/config.yaml'
+
+
+@pytest.fixture(name='_')
+def _():
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+  yield
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+
+
+def test_config(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('project-id', 'foo')
+  project_id = cfg.get('project-id')
+  assert project_id == 'foo'
+
+
+def test_config_get_all(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set(PROJECT_KEY, 'foo')
+  cfg.set(CLUSTER_NAME_KEY, 'bar')
+  cfg.set(ZONE_KEY, 'europe-west1-a')
+  cfg.set(CFG_BUCKET_KEY, 'cfg-bucket')
+
+  cfg_all = cfg.get_all()
+  assert cfg_all[PROJECT_KEY] == 'foo'
+  assert cfg_all[CLUSTER_NAME_KEY] == 'bar'
+  assert cfg_all[ZONE_KEY] == 'europe-west1-a'
+  assert cfg_all[CFG_BUCKET_KEY] == 'cfg-bucket'
+
+
+def test_config_get_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get(PROJECT_KEY)
+  assert val is None
+
+
+def test_config_get_all_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get_all()
+  assert not val
+
+
+def test_config_set_incorrect(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('foo', 'bar')
+  cfg_all = cfg.get_all()
+  assert not cfg_all
xpk/core/docker_manager.py CHANGED
@@ -30,7 +30,7 @@ import time
 DockerRunCommandExitCode = 135
 dockerBuildErrorCode = 134
 ctk_dockerfile_path = "Dockerfile"
-ctk_build_ref = "v1.57.1"
+ctk_build_ref = "v1.62.2"
 ctk_docker_image = "xpk-ctk"
 ctk_container_name = "xpk-ctk-container"
 gcloud_cfg_mount_path = "/root/.config/gcloud"
xpk/core/docker_resources.py CHANGED
@@ -72,7 +72,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   if system.accelerator_type == AcceleratorType['CPU']:
     return get_cpu_env(args, system)

-  return format_env_dict(args.env, system)  # pytype: disable=bad-return-type
+  return format_env_dict(args.env, system)


 def get_gpu_env(args, system) -> str:
xpk/core/filestore.py CHANGED
@@ -93,11 +93,12 @@ class FilestoreClient:

     for instance in instancesZonal:
       if instance.name == fullname_zonal:
-        return instance  # pytype: disable=bad-return-type
+        return instance

     for instance in instancesRegional:
       if instance.name == fullname_regional:
-        return instance  # pytype: disable=bad-return-type
+        return instance
+    return None

   def check_instance_exists(self) -> bool:
     """Check if Filestore instance exists"""
@@ -111,6 +112,7 @@ class FilestoreClient:
   def get_instance_location(self) -> str:
     """Get Filestore instance's location"""
     self.load_instance()
+    assert self.instance
     return str(self.instance.name.split("/")[3])

   def create_instance(
@@ -192,6 +194,7 @@ class FilestoreClient:

   def create_sc(self, name: str, network: str) -> dict:
     """Create a yaml representing filestore StorageClass."""
+    assert self.instance
     data = templates.load(FS_SC_PATH)
     data["metadata"]["name"] = get_storage_class_name(name)
     data["parameters"]["tier"] = self.instance.tier.name
@@ -202,6 +205,7 @@ class FilestoreClient:

   def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolume."""
+    assert self.instance
     data = templates.load(FS_PV_PATH)
     data["metadata"]["name"] = get_pv_name(name)
     data["spec"]["storageClassName"] = get_storage_class_name(name)
@@ -219,6 +223,7 @@ class FilestoreClient:

   def create_pvc(self, name: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolumeClaim."""
+    assert self.instance
     data = templates.load(FS_PVC_PATH)
     data["metadata"]["name"] = get_pvc_name(name)
     data["spec"]["accessModes"] = [access_mode]
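The assert self.instance lines added above narrow an Optional attribute for the type checker: load_instance() may leave self.instance unset, and the assert both documents and enforces that the code below it only runs once an instance was found. A minimal sketch of the narrowing pattern, with a plain string standing in for the Filestore instance object:

    from typing import Optional

    class Client:
      instance: Optional[str] = None  # stand-in for the Filestore instance

      def load_instance(self) -> None:
        self.instance = 'projects/p/locations/us-central1-a/instances/i'

      def get_instance_location(self) -> str:
        self.load_instance()
        assert self.instance  # narrows Optional[str] to str past this point
        return str(self.instance.split('/')[3])

    print(Client().get_instance_location())  # prints: us-central1-a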
xpk/core/gcloud_context.py CHANGED
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
   xpk_print(f'Working on {args.project} and {args.zone}')


-def zone_to_region(zone) -> str:
+def zone_to_region(zone: str) -> str:
   """Helper function converts zone name to region name.

   Args:
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
     The region name.
   """
   zone_terms = zone.split('-')
-  return zone_terms[0] + '-' + zone_terms[1]  # pytype: disable=bad-return-type
+  return zone_terms[0] + '-' + zone_terms[1]


 @dataclass
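Since zone_to_region keeps only the first two dash-separated terms of the zone name, a quick sanity check of the function exactly as written above:

    def zone_to_region(zone: str) -> str:
      # Same logic as the diff: region = first two dash-separated terms.
      zone_terms = zone.split('-')
      return zone_terms[0] + '-' + zone_terms[1]

    assert zone_to_region('europe-west1-a') == 'europe-west1'
    assert zone_to_region('us-central2-b') == 'us-central2'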
xpk/core/kjob.py CHANGED
@@ -277,7 +277,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
     job_spec = rdma_decorator.decorate_kjob_template(job_spec)
   job_template_dict = yaml.safe_load(yml_string)
   job_template_dict["template"] = job_spec
-  return yaml.dump(job_template_dict, sort_keys=False)
+  yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
+  return yaml_result


 def create_job_template_instance(
xpk/core/kueue.py CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
 KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
-WAIT_FOR_KUEUE_TIMEOUT = '5m'
+WAIT_FOR_KUEUE_TIMEOUT = '10m'
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096

@@ -89,6 +89,10 @@ metadata:
   name: dws-config
 spec:
   provisioningClassName: queued-provisioning.gke.io
+  podSetUpdates:
+    nodeSelector:
+    - key: autoscaling.gke.io/provisioning-request
+      valueFromProvisioningClassDetail: ResizeRequestName
   managedResources:
   - {managed_resource}
 ---
@@ -320,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
   return return_code


-def get_kueue_version(args) -> (int, str):
+def get_kueue_version(args) -> tuple[int, str]:
   command = 'kubectl kueue version'
   task = 'Get kueue version on server'
   return_code, val = run_command_for_value(command, task, args)
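The old annotation (int, str) is a tuple of types, not a type: it evaluates without error at definition time but means nothing to a type checker, while tuple[int, str] is the correct spelling. A minimal sketch with a hypothetical stand-in function:

    def get_version() -> tuple[int, str]:
      # (return_code, version string), mirroring get_kueue_version's shape.
      return 0, 'v0.12.2'

    code, version = get_version()
    assert code == 0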
xpk/core/nap.py CHANGED
@@ -37,6 +37,7 @@ from .resources import (
 )
 from .scheduling import get_total_chips_requested_from_args
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from typing import cast

 AUTOPROVISIONING_CONFIG_FILE = """
 management:
@@ -269,9 +270,6 @@ def is_autoprovisioning_enabled(
     bool is true if autoprovisioning is enabled, false otherwise.
     int of 0 if successful and 1 otherwise.
   """
-  # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
-  if args.use_pathways:
-    return False, 0

   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
   cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -339,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
     )
     return node_selector_args, 1

-  return_code, capacity_type_str = get_value_from_map(
+  return_code, optional_capacity_type_str = get_value_from_map(
       CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
   )
   if return_code != 0:
     return node_selector_args, return_code
+  # return_code==0 implies capacity_type is defined
+  capacity_type_str = cast(str, optional_capacity_type_str)

   if capacity_type_str == CapacityType.RESERVATION.name:
     return_code, args.reservation = get_value_from_map(
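The cast in the last hunk encodes an invariant the checker cannot see: a zero return code implies the mapped value exists. get_value_from_map's exact signature is not shown in this diff; the sketch below assumes it returns (0, value) on success and a non-zero code with None otherwise:

    from typing import cast

    def get_value_from_map(key: str, m: dict[str, str]) -> tuple[int, str | None]:
      # Hypothetical stand-in matching the pattern used above.
      return (0, m[key]) if key in m else (1, None)

    return_code, optional_value = get_value_from_map(
        'capacity_type', {'capacity_type': 'RESERVATION'}
    )
    if return_code == 0:
      value = cast(str, optional_value)  # return_code == 0 implies a value
      print(value)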
xpk/core/nodepool_test.py ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/resources.py CHANGED
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif (
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
xpk/core/storage.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast

 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata: k8s_client.V1ObjectMeta = data.get("metadata", {})
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type: str = spec.get("type")
-    self.auto_mount: bool = spec.get("auto_mount")
-    self.mount_point: bool = spec.get("mount_point")
-    self.readonly: bool = spec.get("readonly")
-    self.manifest: str = spec.get("manifest")
-    self.pvc: str = spec.get("pvc")
-    self.pv: str = spec.get("pv")
-    self.bucket: str = self._get_bucket()
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()

   def fields_as_list(self) -> list[str]:
     """
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]

@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
xpk/core/system_characteristics.py CHANGED
@@ -55,7 +55,7 @@ class SystemCharacteristics:
   gke_accelerator: str
   gce_machine_type: str
   chips_per_vm: int
-  accelerator_type: AcceleratorType  # type: ignore
+  accelerator_type: int  # TODO: use enums
   device_type: str

xpk/core/workload.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import re
 from ..utils.console import xpk_exit, xpk_print
 from .commands import run_command_for_value
 from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
     xpk_print('Your workload did not complete successfully')
     return 125
   return 0
+
+
+GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
+"""Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
+
+
+def get_jobsets_list_gcp_link(project: str) -> str:
+  """Returns a link to Cloud Console JobSets list"""
+
+  return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
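The new regex admits only lowercase letters, digits, and dashes. A quick check of what it accepts (fullmatch is used here for illustration; the diff does not show how the pattern is consumed):

    import re

    GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')

    assert GCP_NAME_FILTER_VALUE_REGEX.fullmatch('running-cucumber-np-0')
    assert GCP_NAME_FILTER_VALUE_REGEX.fullmatch('My_Workload') is None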
xpk/core/workload_decorators/rdma_decorator.py CHANGED
@@ -18,7 +18,7 @@ import yaml
 from ...utils.yaml import literal_string


-def decorate_kjob_template(job_manifest) -> str:
+def decorate_kjob_template(job_manifest: dict) -> dict:
   spec = (
       job_manifest.setdefault('spec', {})
       .setdefault('template', {})
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
     add_tolerations(job_manifest)
     update_gpu_containers(job_manifest)

-  return yaml.dump(manifest, sort_keys=False)
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_decorators/storage_decorator.py CHANGED
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
     job_manifest = job['template']
     add_annotations(job_manifest, storages)
     add_volumes(job_manifest, storage_volumes)
-  return yaml.dump(manifest, sort_keys=False)
+  yaml_result: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_result


 def add_annotations(job_manifest, storages):
xpk/core/workload_decorators/tcpx_decorator.py CHANGED
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
   for job in manifest['spec']['replicatedJobs']:
     job_manifest = job['template']
     job_manifest = decorate_job(job_manifest)
-  return yaml.dump(manifest, sort_keys=False)
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_annotation() -> dict:
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
   })
   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+  volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
   volumes.append(
       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
   )
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
   spec['initContainers'].append(tcpxo_daemon_container)


-def update_gpu_containers(job_manifest):
+def update_gpu_containers(job_manifest) -> None:
   for container in job_manifest['spec']['template']['spec']['containers']:
     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
       env: list = container.setdefault('env', [])
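For reference, the new tcpx-socket entry added in add_volumes serializes to the following pod-spec volume; a small sketch using the same yaml.dump call the decorators use:

    import yaml

    volume = {'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}}
    print(yaml.dump(volume, sort_keys=False))
    # name: tcpx-socket
    # hostPath:
    #   path: /run/tcpx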