xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. xpk/commands/cluster.py +270 -8
  2. xpk/commands/cluster_gcluster.py +2 -1
  3. xpk/commands/common.py +3 -3
  4. xpk/commands/info.py +12 -12
  5. xpk/commands/job.py +12 -10
  6. xpk/commands/kjob_common.py +2 -1
  7. xpk/commands/storage.py +1 -1
  8. xpk/commands/workload.py +12 -6
  9. xpk/core/blueprint/blueprint_generator.py +7 -7
  10. xpk/core/blueprint/blueprint_test.py +218 -0
  11. xpk/core/capacity.py +5 -3
  12. xpk/core/cluster.py +9 -7
  13. xpk/core/cluster_private.py +5 -1
  14. xpk/core/commands.py +3 -3
  15. xpk/core/config.py +3 -4
  16. xpk/core/config_test.py +71 -0
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +1 -1
  19. xpk/core/filestore.py +7 -2
  20. xpk/core/gcloud_context.py +2 -2
  21. xpk/core/jobset.py +1 -1
  22. xpk/core/kjob.py +2 -1
  23. xpk/core/kueue.py +12 -4
  24. xpk/core/nap.py +20 -6
  25. xpk/core/nodepool.py +52 -19
  26. xpk/core/nodepool_test.py +82 -0
  27. xpk/core/resources.py +1 -7
  28. xpk/core/scheduling.py +1 -1
  29. xpk/core/storage.py +14 -14
  30. xpk/core/system_characteristics.py +267 -1081
  31. xpk/core/workload.py +11 -0
  32. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  33. xpk/core/workload_decorators/storage_decorator.py +2 -1
  34. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  35. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  37. xpk/core/workload_test.py +28 -0
  38. xpk/main.py +9 -10
  39. xpk/parser/cluster.py +67 -49
  40. xpk/parser/common.py +45 -36
  41. xpk/parser/storage.py +12 -13
  42. xpk/parser/workload.py +57 -39
  43. xpk/utils/console.py +2 -1
  44. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
  45. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
  46. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
  47. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
  48. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
  49. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,218 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import os
18
+ import shutil
19
+
20
+ import ruamel.yaml
21
+
22
+ from xpk.core.blueprint.blueprint_definitions import Blueprint
23
+ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
24
+ from xpk.core.capacity import CapacityType
25
+
# Shared ruamel YAML handler used to read both the expected and the generated
# blueprints. Blueprint is registered with it -- presumably so documents
# carrying the Blueprint tag load as Blueprint objects (confirm against
# blueprint_definitions).
yaml = ruamel.yaml.YAML()
yaml.register_class(Blueprint)

# Expected blueprints that the generated output is compared against.
a3_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega.yaml"
a3_spot_yaml_test_path = (
    "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
)
a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"

# Auxiliary files the A3 Mega test expects to find in the blueprint directory.
config_map_filename = "config-map.yaml.tftpl"
kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"

# Scratch directory handed to BlueprintGenerator; reset by prepare_test().
tmp_test_dir = "/tmp/xpk_test"
37
+
38
+
def prepare_test():
  """Recreate the scratch directory so a test starts from an empty tree.

  Anything already present at `tmp_test_dir` is deleted first, then a fresh,
  empty directory is created in its place.
  """
  leftover_present = os.path.exists(tmp_test_dir)
  if leftover_present:
    shutil.rmtree(tmp_test_dir)
  os.mkdir(tmp_test_dir)
43
+
44
+
def test_generate_a3_mega_blueprint():
  """Check the A3 Mega blueprint generated for a reservation.

  The generated blueprint is compared field by field with the expected YAML
  at `a3_yaml_test_path`, and the config-map and Kueue template files must
  exist in the blueprint's directory.
  """
  prepare_test()
  blueprint_name = "xpk-gke-a3-megagpu"
  generation_kwargs = {
      "project_id": "foo",
      "cluster_name": "bar",
      "blueprint_name": blueprint_name,
      "prefix": "prefix",
      "region": "us-central1",
      "zone": "us-central1-c",
      "auth_cidr": "10.0.0.0/32",
      "reservation_placement_policy": {
          "type": "COMPACT",
          "name": "test-reservation-placement",
      },
      "reservation": "test-reservation",
      "capacity_type": CapacityType.RESERVATION,
      "system_node_pool_min_node_count": 5,
  }
  generator = BlueprintGenerator(tmp_test_dir)
  generated = generator.generate_a3_mega_blueprint(**generation_kwargs)

  assert generated.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")

  # Load the expected document first, then the freshly generated one.
  with open(a3_yaml_test_path, encoding="utf-8") as expected_stream:
    expected = yaml.load(expected_stream)
  with open(generated.blueprint_file, encoding="utf-8") as actual_stream:
    actual = yaml.load(actual_stream)

  assert expected.blueprint_name == actual.blueprint_name
  assert actual.terraform_backend_defaults is None
  assert expected.toolkit_modules_url == actual.toolkit_modules_url
  assert expected.toolkit_modules_version == actual.toolkit_modules_version
  assert expected.vars == actual.vars
  assert actual.deployment_groups == expected.deployment_groups

  # Both template files must sit next to the generated blueprint.
  for aux_filename in (config_map_filename, kueue_conf_filename):
    aux_path = os.path.join(
        tmp_test_dir, "prefix", blueprint_name, aux_filename
    )
    assert os.path.exists(aux_path)

  shutil.rmtree(tmp_test_dir)
92
+
93
+
def test_generate_a3_mega_spot_blueprint():
  """Check the A3 Mega blueprint generated for spot capacity.

  No reservation arguments are passed. The generated blueprint is compared
  field by field with the expected YAML at `a3_spot_yaml_test_path`.
  """
  prepare_test()
  blueprint_name = "xpk-gke-a3-megagpu"
  generation_kwargs = {
      "project_id": "foo",
      "cluster_name": "bar",
      "blueprint_name": blueprint_name,
      "prefix": "prefix",
      "region": "us-central1",
      "zone": "us-central1-c",
      "auth_cidr": "10.0.0.0/32",
      "capacity_type": CapacityType.SPOT,
      "system_node_pool_min_node_count": 5,
  }
  generator = BlueprintGenerator(tmp_test_dir)
  generated = generator.generate_a3_mega_blueprint(**generation_kwargs)

  assert generated.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")

  # Load the expected document first, then the freshly generated one.
  with open(a3_spot_yaml_test_path, encoding="utf-8") as expected_stream:
    expected = yaml.load(expected_stream)
  with open(generated.blueprint_file, encoding="utf-8") as actual_stream:
    actual = yaml.load(actual_stream)

  assert expected.blueprint_name == actual.blueprint_name
  assert actual.terraform_backend_defaults is None
  assert expected.toolkit_modules_url == actual.toolkit_modules_url
  assert expected.toolkit_modules_version == actual.toolkit_modules_version
  assert expected.vars == actual.vars
  assert actual.deployment_groups == expected.deployment_groups

  shutil.rmtree(tmp_test_dir)
126
+
127
+
def test_generate_a3_ultra_blueprint():
  """Check the A3 Ultra blueprint generated for a reservation with a bucket.

  The generated blueprint is compared with the expected YAML at
  `a3_ultra_yaml_test_path`, including the terraform backend defaults, and
  the two auxiliary manifests must exist in the blueprint's directory.
  """
  prepare_test()
  blueprint_name = "xpk-gke-a3-ultra"
  generation_kwargs = {
      "project_id": "foo",
      "cluster_name": "gke-a3-ultra",
      "blueprint_name": blueprint_name,
      "region": "us-central1",
      "zone": "us-central1-c",
      "auth_cidr": "10.0.0.0/32",
      "reservation": "test-reservation",
      "system_node_pool_machine_type": "e2-standard-16",
      "capacity_type": CapacityType.RESERVATION,
      "gcs_bucket": "test-bucket",
      "prefix": "testdir",
  }
  generator = BlueprintGenerator(tmp_test_dir)
  generated = generator.generate_a3_ultra_blueprint(**generation_kwargs)

  # Load the expected document first, then the freshly generated one.
  with open(a3_ultra_yaml_test_path, encoding="utf-8") as expected_stream:
    expected = yaml.load(expected_stream)
  with open(generated.blueprint_file, encoding="utf-8") as actual_stream:
    actual = yaml.load(actual_stream)

  assert expected.blueprint_name == actual.blueprint_name
  assert (
      expected.terraform_backend_defaults == actual.terraform_backend_defaults
  )
  assert expected.toolkit_modules_url == actual.toolkit_modules_url
  assert expected.toolkit_modules_version == actual.toolkit_modules_version
  assert actual.deployment_groups == expected.deployment_groups

  # Manifests expected alongside the generated blueprint.
  for manifest_name in ("mlgru-disable.yaml", "nccl-installer.yaml"):
    manifest_path = os.path.join(
        tmp_test_dir, "testdir", blueprint_name, manifest_name
    )
    assert os.path.exists(manifest_path)

  shutil.rmtree(tmp_test_dir)
171
+
172
+
def test_generate_a4_blueprint():
  """Check the A4 blueprint generated for a reservation with a bucket.

  The generated blueprint is compared with the expected YAML at
  `a4_yaml_test_path`, including the terraform backend defaults, and the
  storage CRD and NCCL RDMA installer manifests must exist in the
  blueprint's directory.
  """
  prepare_test()
  blueprint_name = "xpk-gke-a4"
  generation_kwargs = {
      "project_id": "foo",
      "cluster_name": "gke-a4",
      "blueprint_name": blueprint_name,
      "region": "us-central1",
      "zone": "us-central1-c",
      "auth_cidr": "10.0.0.0/32",
      "reservation": "test-reservation",
      "system_node_pool_machine_type": "e2-standard-16",
      "capacity_type": CapacityType.RESERVATION,
      "gcs_bucket": "test-bucket",
      "prefix": "testdir",
  }
  generator = BlueprintGenerator(tmp_test_dir)
  generated = generator.generate_a4_blueprint(**generation_kwargs)

  # Load the expected document first, then the freshly generated one.
  with open(a4_yaml_test_path, encoding="utf-8") as expected_stream:
    expected = yaml.load(expected_stream)
  with open(generated.blueprint_file, encoding="utf-8") as actual_stream:
    actual = yaml.load(actual_stream)

  assert expected.blueprint_name == actual.blueprint_name
  assert (
      expected.terraform_backend_defaults == actual.terraform_backend_defaults
  )
  assert expected.toolkit_modules_url == actual.toolkit_modules_url
  assert expected.toolkit_modules_version == actual.toolkit_modules_version
  assert actual.deployment_groups == expected.deployment_groups

  # Manifests expected alongside the generated blueprint.
  for manifest_name in ("storage_crd.yaml", "nccl-rdma-installer-a4.yaml"):
    manifest_path = os.path.join(
        tmp_test_dir, "testdir", blueprint_name, manifest_name
    )
    assert os.path.exists(manifest_path)

  shutil.rmtree(tmp_test_dir)
xpk/core/capacity.py CHANGED
@@ -195,10 +195,12 @@ def get_capacity_arguments_from_capacity_type(
195
195
  capacity_args = '--spot'
196
196
  case CapacityType.FLEX_START:
197
197
  capacity_args = (
198
- ' --flex-start --enable-queued-provisioning --enable-autoscaling'
198
+ ' --flex-start --enable-autoscaling'
199
199
  ' --location-policy=ANY --reservation-affinity=none'
200
200
  f' --no-enable-autorepair --max-nodes={max_nodes}'
201
201
  )
202
+ if args.num_slices <= 1:
203
+ capacity_args += ' --enable-queued-provisioning'
202
204
  case CapacityType.RESERVATION:
203
205
  capacity_args = (
204
206
  f'--reservation-affinity=specific --reservation={args.reservation}'
@@ -232,9 +234,9 @@ def get_capacity_node_selectors_from_capacity_type(
232
234
  case CapacityType.ON_DEMAND.name:
233
235
  node_selector = ''
234
236
  case CapacityType.FLEX_START.name:
235
- node_selector = 'cloud.google.com/gke-queued="true"'
237
+ node_selector = 'cloud.google.com/gke-queued: "true"'
236
238
  case CapacityType.SPOT.name:
237
- node_selector = 'cloud.google.com/gke-spot="true"'
239
+ node_selector = 'cloud.google.com/gke-spot: "true"'
238
240
  case CapacityType.RESERVATION.name:
239
241
  node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
240
242
  case _:
xpk/core/cluster.py CHANGED
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
62
62
  0 if successful and 1 otherwise.
63
63
  """
64
64
  command = (
65
- 'kubectl apply --server-side -f'
66
- f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
65
+ 'kubectl apply --server-side --force-conflicts'
66
+ f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
67
67
  )
68
68
  task = f'Install Jobset on {args.cluster}'
69
69
  return_code = run_command_with_updates_retry(command, task, args)
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
217
217
  )
218
218
  if err_code != 0:
219
219
  xpk_exit(err_code)
220
- data = yaml.safe_load(val)
221
- return data['items'] # pytype: disable=bad-return-type
220
+ data: dict[str, list[dict]] = yaml.safe_load(val)
221
+ return data['items']
222
222
 
223
223
 
224
224
  def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
@@ -445,7 +445,7 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
445
445
  args.project_number = project_id_to_project_number(args.project)
446
446
 
447
447
  config.load_kube_config()
448
- return k8s_client.ApiClient() # pytype: disable=bad-return-type
448
+ return k8s_client.ApiClient()
449
449
 
450
450
 
451
451
  def get_gpu_type_from_cluster(args) -> str:
@@ -817,9 +817,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
817
817
  server_config_return_code, gke_server_config = get_gke_server_config(args)
818
818
  if server_config_return_code != 0:
819
819
  xpk_exit(server_config_return_code)
820
+ assert gke_server_config
821
+
820
822
  upgrade_master_return_code = upgrade_gke_control_plane_version(
821
823
  args,
822
- gke_server_config.default_rapid_gke_version, # pytype: disable=attribute-error
824
+ gke_server_config.default_rapid_gke_version,
823
825
  )
824
826
  if upgrade_master_return_code > 0:
825
827
  xpk_print("Updating GKE cluster's control plane upgrade failed!")
@@ -828,7 +830,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
828
830
  # Upgrade nodepools version after the master upgrade.
829
831
  node_pool_update_code = upgrade_gke_nodepools_version(
830
832
  args,
831
- gke_server_config.default_rapid_gke_version, # pytype: disable=attribute-error
833
+ gke_server_config.default_rapid_gke_version,
832
834
  )
833
835
  if node_pool_update_code > 0:
834
836
  xpk_print('Upgrading nodepools version failed!')
@@ -95,7 +95,11 @@ def add_current_machine_to_networks_if_needed(
95
95
  "Adding current machine's IP address to the authorized networks"
96
96
  ' failed!'
97
97
  )
98
- return add_current_machine_to_networks_return_code, authorized_networks
98
+ return (
99
+ add_current_machine_to_networks_return_code,
100
+ False,
101
+ authorized_networks,
102
+ )
99
103
 
100
104
  return 0, is_current_machine_in_network, authorized_networks
101
105
 
xpk/core/commands.py CHANGED
@@ -274,9 +274,9 @@ def run_command_for_value(
274
274
  else:
275
275
  if not quiet:
276
276
  xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
277
- out, err = child.communicate()
278
- out, err = str(out, 'UTF-8'), str(err, 'UTF-8')
279
- return return_code, f'{out}\n{err}'
277
+ out_bytes, err_bytes = child.communicate()
278
+ out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
279
+ return return_code, f'{out_str}\n{err_str}'
280
280
  else:
281
281
  if not quiet:
282
282
  xpk_print(
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
22
22
  from ..utils.console import xpk_print
23
23
 
24
24
  # This is the version for XPK PyPI package
25
- __version__ = 'v0.10.1'
25
+ __version__ = 'v0.12.0'
26
26
  XPK_CURRENT_VERSION = __version__
27
27
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
28
28
 
@@ -71,13 +71,12 @@ class XpkConfig:
71
71
  dir_path = '/'.join(self._config.split('/')[:-1])
72
72
  file.ensure_directory_exists(dir_path)
73
73
 
74
- config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
75
74
  if not os.path.exists(self._config):
76
75
  return None
77
76
 
78
77
  with open(self._config, encoding='utf-8', mode='r') as stream:
79
78
  config_yaml: dict = yaml.load(stream)
80
- return config_yaml
79
+ return config_yaml
81
80
 
82
81
  def _save_configs(self, config_yaml: dict) -> None:
83
82
  with open(self._config, encoding='utf-8', mode='w') as stream:
@@ -109,7 +108,7 @@ class XpkConfig:
109
108
 
110
109
  def get_all(
111
110
  self,
112
- ) -> dict[str, dict[str, str] | str] | None:
111
+ ) -> dict[str, str] | None:
113
112
  config_yaml = self._open_configs()
114
113
  if config_yaml is None:
115
114
  return None
@@ -0,0 +1,71 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
18
+
19
+ import os
20
+ import pytest
21
+
# Location of the throwaway config file used by every test in this module.
config_tmp_path = '/tmp/config/config.yaml'


@pytest.fixture(name='_')
def _():
  """Ensure the temporary config file is absent before and after a test."""

  def discard_config_file():
    # Delete the file only when it is actually there.
    if os.path.exists(config_tmp_path):
      os.remove(config_tmp_path)

  discard_config_file()
  yield
  discard_config_file()
32
+
33
+
def test_config(_):
  """A value stored with `set` is returned unchanged by `get`."""
  key, stored_value = 'project-id', 'foo'
  config = XpkConfig(config_tmp_path)
  config.set(key, stored_value)
  assert config.get(key) == stored_value
39
+
40
+
def test_config_get_all(_):
  """Every key stored with `set` appears in `get_all` with its value."""
  entries = (
      (PROJECT_KEY, 'foo'),
      (CLUSTER_NAME_KEY, 'bar'),
      (ZONE_KEY, 'europe-west1-a'),
      (CFG_BUCKET_KEY, 'cfg-bucket'),
  )
  config = XpkConfig(config_tmp_path)
  for key, value in entries:
    config.set(key, value)

  stored = config.get_all()
  for key, value in entries:
    assert stored[key] == value
53
+
54
+
def test_config_get_empty(_):
  """`get` returns None when nothing has been stored."""
  config = XpkConfig(config_tmp_path)
  assert config.get(PROJECT_KEY) is None
59
+
60
+
def test_config_get_all_empty(_):
  """`get_all` returns a falsy result when nothing has been stored."""
  config = XpkConfig(config_tmp_path)
  assert not config.get_all()
65
+
66
+
def test_config_set_incorrect(_):
  """Setting the key 'foo' leaves `get_all` falsy.

  Presumably 'foo' is not an accepted configuration key, so the value is
  never stored -- confirm against XpkConfig.set.
  """
  config = XpkConfig(config_tmp_path)
  config.set('foo', 'bar')
  assert not config.get_all()
@@ -30,7 +30,7 @@ import time
30
30
  DockerRunCommandExitCode = 135
31
31
  dockerBuildErrorCode = 134
32
32
  ctk_dockerfile_path = "Dockerfile"
33
- ctk_build_ref = "v1.57.1"
33
+ ctk_build_ref = "v1.62.2"
34
34
  ctk_docker_image = "xpk-ctk"
35
35
  ctk_container_name = "xpk-ctk-container"
36
36
  gcloud_cfg_mount_path = "/root/.config/gcloud"
@@ -72,7 +72,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
72
72
  if system.accelerator_type == AcceleratorType['CPU']:
73
73
  return get_cpu_env(args, system)
74
74
 
75
- return format_env_dict(args.env, system) # pytype: disable=bad-return-type
75
+ return format_env_dict(args.env, system)
76
76
 
77
77
 
78
78
  def get_gpu_env(args, system) -> str:
xpk/core/filestore.py CHANGED
@@ -93,11 +93,12 @@ class FilestoreClient:
93
93
 
94
94
  for instance in instancesZonal:
95
95
  if instance.name == fullname_zonal:
96
- return instance # pytype: disable=bad-return-type
96
+ return instance
97
97
 
98
98
  for instance in instancesRegional:
99
99
  if instance.name == fullname_regional:
100
- return instance # pytype: disable=bad-return-type
100
+ return instance
101
+ return None
101
102
 
102
103
  def check_instance_exists(self) -> bool:
103
104
  """Check if Filestore instance exists"""
@@ -111,6 +112,7 @@ class FilestoreClient:
111
112
  def get_instance_location(self) -> str:
112
113
  """Get Filestore instance's location"""
113
114
  self.load_instance()
115
+ assert self.instance
114
116
  return str(self.instance.name.split("/")[3])
115
117
 
116
118
  def create_instance(
@@ -192,6 +194,7 @@ class FilestoreClient:
192
194
 
193
195
  def create_sc(self, name: str, network: str) -> dict:
194
196
  """Create a yaml representing filestore StorageClass."""
197
+ assert self.instance
195
198
  data = templates.load(FS_SC_PATH)
196
199
  data["metadata"]["name"] = get_storage_class_name(name)
197
200
  data["parameters"]["tier"] = self.instance.tier.name
@@ -202,6 +205,7 @@ class FilestoreClient:
202
205
 
203
206
  def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
204
207
  """Create a yaml representing filestore PersistentVolume."""
208
+ assert self.instance
205
209
  data = templates.load(FS_PV_PATH)
206
210
  data["metadata"]["name"] = get_pv_name(name)
207
211
  data["spec"]["storageClassName"] = get_storage_class_name(name)
@@ -219,6 +223,7 @@ class FilestoreClient:
219
223
 
220
224
  def create_pvc(self, name: str, access_mode: str) -> dict:
221
225
  """Create a yaml representing filestore PersistentVolumeClaim."""
226
+ assert self.instance
222
227
  data = templates.load(FS_PVC_PATH)
223
228
  data["metadata"]["name"] = get_pvc_name(name)
224
229
  data["spec"]["accessModes"] = [access_mode]
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
75
75
  xpk_print(f'Working on {args.project} and {args.zone}')
76
76
 
77
77
 
78
- def zone_to_region(zone) -> str:
78
+ def zone_to_region(zone: str) -> str:
79
79
  """Helper function converts zone name to region name.
80
80
 
81
81
  Args:
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
85
85
  The region name.
86
86
  """
87
87
  zone_terms = zone.split('-')
88
- return zone_terms[0] + '-' + zone_terms[1] # pytype: disable=bad-return-type
88
+ return zone_terms[0] + '-' + zone_terms[1]
89
89
 
90
90
 
91
91
  @dataclass
xpk/core/jobset.py CHANGED
@@ -81,7 +81,7 @@ spec:
81
81
  limits:
82
82
  memory: {memory_limit_size}
83
83
  requests:
84
- cpu: 500m
84
+ cpu: 1000m
85
85
  memory: 128Mi
86
86
  securityContext:
87
87
  allowPrivilegeEscalation: false
xpk/core/kjob.py CHANGED
@@ -277,7 +277,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
277
277
  job_spec = rdma_decorator.decorate_kjob_template(job_spec)
278
278
  job_template_dict = yaml.safe_load(yml_string)
279
279
  job_template_dict["template"] = job_spec
280
- return yaml.dump(job_template_dict, sort_keys=False)
280
+ yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
281
+ return yaml_result
281
282
 
282
283
 
283
284
  def create_job_template_instance(
xpk/core/kueue.py CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
43
43
  KUEUE_VERSION = 'v0.12.2'
44
44
  CLUSTER_QUEUE_NAME = 'cluster-queue'
45
45
  LOCAL_QUEUE_NAME = 'multislice-queue'
46
- WAIT_FOR_KUEUE_TIMEOUT = '5m'
46
+ WAIT_FOR_KUEUE_TIMEOUT = '10m'
47
47
  MEMORY_SIZE_PER_VM = 1.2
48
48
  MIN_MEMORY_LIMIT_SIZE = 4096
49
49
 
@@ -89,6 +89,10 @@ metadata:
89
89
  name: dws-config
90
90
  spec:
91
91
  provisioningClassName: queued-provisioning.gke.io
92
+ podSetUpdates:
93
+ nodeSelector:
94
+ - key: autoscaling.gke.io/provisioning-request
95
+ valueFromProvisioningClassDetail: ResizeRequestName
92
96
  managedResources:
93
97
  - {managed_resource}
94
98
  ---
@@ -244,14 +248,16 @@ spec:
244
248
  periodSeconds: 10
245
249
  resources:
246
250
  limits:
247
- cpu: 500m
251
+ cpu: 1000m
248
252
  memory: {memory_limit_size}
249
253
  requests:
250
- cpu: 500m
254
+ cpu: 1000m
251
255
  memory: 512Mi
252
256
  securityContext:
253
257
  allowPrivilegeEscalation: false
254
258
  volumeMounts:
259
+ - mountPath: /visibility
260
+ name: visibility
255
261
  - mountPath: /tmp/k8s-webhook-server/serving-certs
256
262
  name: cert
257
263
  readOnly: true
@@ -263,6 +269,8 @@ spec:
263
269
  serviceAccountName: kueue-controller-manager
264
270
  terminationGracePeriodSeconds: 10
265
271
  volumes:
272
+ - name: visibility
273
+ emptyDir: {{}}
266
274
  - name: cert
267
275
  secret:
268
276
  defaultMode: 420
@@ -316,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
316
324
  return return_code
317
325
 
318
326
 
319
- def get_kueue_version(args) -> (int, str):
327
+ def get_kueue_version(args) -> tuple[int, str]:
320
328
  command = 'kubectl kueue version'
321
329
  task = 'Get kueue version on server'
322
330
  return_code, val = run_command_for_value(command, task, args)
xpk/core/nap.py CHANGED
@@ -37,11 +37,14 @@ from .resources import (
37
37
  )
38
38
  from .scheduling import get_total_chips_requested_from_args
39
39
  from .system_characteristics import AcceleratorType, SystemCharacteristics
40
+ from typing import cast
40
41
 
41
42
  AUTOPROVISIONING_CONFIG_FILE = """
42
43
  management:
43
44
  autoRepair: true
44
45
  autoUpgrade: true
46
+ scopes:
47
+ - "https://www.googleapis.com/auth/devstorage.read_write"
45
48
  autoprovisioningLocations:
46
49
  {zones}
47
50
  {resource_limits}
@@ -106,6 +109,18 @@ def enable_autoprovisioning_on_cluster(
106
109
  xpk_print(f'{task} request returned ERROR {return_code}')
107
110
  return autoprovisioning_config, return_code
108
111
 
112
+ command = (
113
+ 'gcloud container clusters update'
114
+ f' {args.cluster} --project={args.project}'
115
+ f' --region={zone_to_region(args.zone)}'
116
+ ' --autoscaling-profile=optimize-utilization'
117
+ )
118
+ task = 'Update cluster with autoscaling-profile'
119
+ return_code = run_command_with_updates(command, task, args)
120
+ if return_code != 0:
121
+ xpk_print(f'{task} request returned ERROR {return_code}')
122
+ return autoprovisioning_config, return_code
123
+
109
124
  # Update created accelerator node pools to support autoprovisioning.
110
125
  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
111
126
  if return_code != 0:
@@ -171,11 +186,11 @@ def create_autoprovisioning_config(
171
186
  # is not controlled by NAP.
172
187
  cpu_limits = """
173
188
  minimum: 1
174
- maximum: 10000
189
+ maximum: 1000000
175
190
  """
176
191
  memory_limits = """
177
192
  minimum: 1
178
- maximum: 10000
193
+ maximum: 10000000
179
194
  """
180
195
 
181
196
  # By default, the maximum chips is set to be the current number of resources used
@@ -255,9 +270,6 @@ def is_autoprovisioning_enabled(
255
270
  bool is true if autoprovisioning is enabled, false otherwise.
256
271
  int of 0 if successful and 1 otherwise.
257
272
  """
258
- # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
259
- if args.use_pathways:
260
- return False, 0
261
273
 
262
274
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
263
275
  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -325,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
325
337
  )
326
338
  return node_selector_args, 1
327
339
 
328
- return_code, capacity_type_str = get_value_from_map(
340
+ return_code, optional_capacity_type_str = get_value_from_map(
329
341
  CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
330
342
  )
331
343
  if return_code != 0:
332
344
  return node_selector_args, return_code
345
+ # return_code==0 implies capacity_type is defined
346
+ capacity_type_str = cast(str, optional_capacity_type_str)
333
347
 
334
348
  if capacity_type_str == CapacityType.RESERVATION.name:
335
349
  return_code, args.reservation = get_value_from_map(