xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. xpk/commands/batch.py +8 -8
  2. xpk/commands/cluster.py +19 -19
  3. xpk/commands/cluster_gcluster.py +2 -1
  4. xpk/commands/common.py +7 -3
  5. xpk/commands/info.py +12 -12
  6. xpk/commands/inspector.py +1 -1
  7. xpk/commands/job.py +42 -12
  8. xpk/commands/kjob_common.py +2 -1
  9. xpk/commands/storage.py +6 -3
  10. xpk/commands/workload.py +28 -15
  11. xpk/core/blueprint/blueprint_generator.py +7 -7
  12. xpk/core/blueprint/blueprint_test.py +218 -0
  13. xpk/core/capacity.py +3 -1
  14. xpk/core/cluster.py +14 -8
  15. xpk/core/cluster_private.py +8 -2
  16. xpk/core/commands.py +13 -10
  17. xpk/core/config.py +3 -4
  18. xpk/core/config_test.py +71 -0
  19. xpk/core/docker_image.py +14 -5
  20. xpk/core/docker_manager.py +1 -1
  21. xpk/core/docker_resources.py +10 -5
  22. xpk/core/filestore.py +7 -2
  23. xpk/core/gcloud_context.py +2 -2
  24. xpk/core/jobset.py +1 -1
  25. xpk/core/kjob.py +7 -3
  26. xpk/core/kueue.py +28 -8
  27. xpk/core/nap.py +5 -5
  28. xpk/core/network.py +1 -1
  29. xpk/core/nodepool.py +8 -3
  30. xpk/core/nodepool_test.py +82 -0
  31. xpk/core/pathways.py +6 -2
  32. xpk/core/ray.py +1 -1
  33. xpk/core/resources.py +18 -14
  34. xpk/core/scheduling.py +4 -0
  35. xpk/core/storage.py +14 -14
  36. xpk/core/system_characteristics.py +1 -1
  37. xpk/core/workload.py +11 -0
  38. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  39. xpk/core/workload_decorators/storage_decorator.py +2 -1
  40. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  41. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  42. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  43. xpk/core/workload_test.py +28 -0
  44. xpk/main.py +12 -10
  45. xpk/parser/cluster.py +110 -49
  46. xpk/parser/common.py +45 -36
  47. xpk/parser/storage.py +12 -13
  48. xpk/parser/workload.py +57 -39
  49. xpk/utils/console.py +2 -1
  50. xpk/utils/execution_context.py +28 -0
  51. xpk/utils/file.py +25 -10
  52. xpk/utils/network.py +4 -0
  53. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
  54. xpk-0.13.0.dist-info/RECORD +101 -0
  55. xpk-0.11.0.dist-info/RECORD +0 -95
  56. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
  57. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
  58. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
  59. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/core/blueprint/blueprint_test.py ADDED
@@ -0,0 +1,218 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+
+import ruamel.yaml
+
+from xpk.core.blueprint.blueprint_definitions import Blueprint
+from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
+from xpk.core.capacity import CapacityType
+
+yaml = ruamel.yaml.YAML()
+
+yaml.register_class(Blueprint)
+
+a3_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega.yaml"
+a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
+a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
+a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
+config_map_filename = "config-map.yaml.tftpl"
+kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
+tmp_test_dir = "/tmp/xpk_test"
+
+
+def prepare_test():
+  if os.path.exists(tmp_test_dir):
+    shutil.rmtree(tmp_test_dir)
+  os.mkdir(tmp_test_dir)
+
+
+def test_generate_a3_mega_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation_placement_policy={
+          "type": "COMPACT",
+          "name": "test-reservation-placement",
+      },
+      reservation="test-reservation",
+      capacity_type=CapacityType.RESERVATION,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+  with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+    ctk_test = yaml.load(generated_blueprint)
+  assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+  assert ctk_test.terraform_backend_defaults is None
+  assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+  assert (
+      ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+  )
+  assert ctk_yaml.vars == ctk_test.vars
+  assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir, "prefix", blueprint_name, config_map_filename
+      )
+  )
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
+      )
+  )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_mega_spot_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      capacity_type=CapacityType.SPOT,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_spot_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+  with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+    ctk_test = yaml.load(generated_blueprint)
+  assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+  assert ctk_test.terraform_backend_defaults is None
+  assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+  assert (
+      ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+  )
+  assert ctk_yaml.vars == ctk_test.vars
+  assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_ultra_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-ultra"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_ultra_blueprint(
+      project_id="foo",
+      cluster_name="gke-a3-ultra",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a3_ultra_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+  with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+    ctk_test = yaml.load(generated_blueprint)
+  assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+  assert (
+      ctk_yaml.terraform_backend_defaults
+      == ctk_test.terraform_backend_defaults
+  )
+  assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+  assert (
+      ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+  )
+  assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir, "testdir", blueprint_name, "mlgru-disable.yaml"
+      )
+  )
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir, "testdir", blueprint_name, "nccl-installer.yaml"
+      )
+  )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a4_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a4"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a4_blueprint(
+      project_id="foo",
+      cluster_name="gke-a4",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a4_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+  with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+    ctk_test = yaml.load(generated_blueprint)
+  assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+  assert (
+      ctk_yaml.terraform_backend_defaults
+      == ctk_test.terraform_backend_defaults
+  )
+  assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+  assert (
+      ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+  )
+  assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir, "testdir", blueprint_name, "storage_crd.yaml"
+      )
+  )
+  assert os.path.exists(
+      os.path.join(
+          tmp_test_dir,
+          "testdir",
+          blueprint_name,
+          "nccl-rdma-installer-a4.yaml",
+      )
+  )
+
+  shutil.rmtree(tmp_test_dir)
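The new tests compare generated blueprints against golden YAML fixtures under `src/xpk/core/blueprint/testing/data/`. Since those fixture paths are relative to the repository root, a run presumably has to start there; a minimal hypothetical invocation of just this file:

```python
# Hypothetical invocation sketch; assumes the working directory is the xpk
# repository root, since the golden fixture paths above are relative to it.
import pytest

pytest.main(["src/xpk/core/blueprint/blueprint_test.py", "-q"])
```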
xpk/core/capacity.py CHANGED
@@ -195,10 +195,12 @@ def get_capacity_arguments_from_capacity_type(
       capacity_args = '--spot'
     case CapacityType.FLEX_START:
       capacity_args = (
-          ' --flex-start --enable-queued-provisioning --enable-autoscaling'
+          ' --flex-start --enable-autoscaling'
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
+      if args.num_slices <= 1:
+        capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (
           f'--reservation-affinity=specific --reservation={args.reservation}'
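Read together, the FLEX_START branch now only requests queued provisioning for single-slice node pools. A sketch of the post-change logic, reconstructed from the hunk (`args.num_slices` and `max_nodes` come from the surrounding function):

```python
# Sketch of the net FLEX_START behavior after this change.
def flex_start_args(num_slices: int, max_nodes: int) -> str:
  capacity_args = (
      ' --flex-start --enable-autoscaling'
      ' --location-policy=ANY --reservation-affinity=none'
      f' --no-enable-autorepair --max-nodes={max_nodes}'
  )
  # Queued provisioning is appended only for single-slice node pools.
  if num_slices <= 1:
    capacity_args += ' --enable-queued-provisioning'
  return capacity_args


assert '--enable-queued-provisioning' in flex_start_args(1, 10)
assert '--enable-queued-provisioning' not in flex_start_args(4, 10)
```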
xpk/core/cluster.py CHANGED
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
     0 if successful and 1 otherwise.
   """
   command = (
-      'kubectl apply --server-side -f'
-      f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
+      'kubectl apply --server-side --force-conflicts'
+      f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
   )
   task = f'Install Jobset on {args.cluster}'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
   )
   if err_code != 0:
     xpk_exit(err_code)
-  data = yaml.safe_load(val)
-  return data['items']  # pytype: disable=bad-return-type
+  data: dict[str, list[dict]] = yaml.safe_load(val)
+  return data['items']


 def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
@@ -442,10 +442,14 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
   if not getattr(args, 'kind_cluster', False):
     add_zone_and_project(args)
     get_cluster_credentials(args)
-    args.project_number = project_id_to_project_number(args.project)
+    args.project_number = (
+        project_id_to_project_number(args.project)
+        if not args.dry_run
+        else abs(hash(args.project) % (10**12))  # 12 digit hash
+    )

   config.load_kube_config()
-  return k8s_client.ApiClient()  # pytype: disable=bad-return-type
+  return k8s_client.ApiClient()


 def get_gpu_type_from_cluster(args) -> str:
@@ -817,9 +821,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   server_config_return_code, gke_server_config = get_gke_server_config(args)
   if server_config_return_code != 0:
     xpk_exit(server_config_return_code)
+  assert gke_server_config
+
   upgrade_master_return_code = upgrade_gke_control_plane_version(
       args,
-      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
+      gke_server_config.default_rapid_gke_version,
   )
   if upgrade_master_return_code > 0:
     xpk_print("Updating GKE cluster's control plane upgrade failed!")
@@ -828,7 +834,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   # Upgrade nodepools version after the master upgrade.
   node_pool_update_code = upgrade_gke_nodepools_version(
       args,
-      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
+      gke_server_config.default_rapid_gke_version,
   )
   if node_pool_update_code > 0:
     xpk_print('Upgrading nodepools version failed!')
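The `setup_k8s_env` change means dry runs no longer call `project_id_to_project_number`; a pseudo project number is derived locally instead. A sketch of that fallback expression (note that Python salts string hashes per process, so the value is stable within one xpk invocation but not across runs):

```python
# Sketch of the dry-run fallback above: at most 12 digits, no gcloud call.
project = 'my-project'  # hypothetical project id
project_number = abs(hash(project) % (10**12))
assert 0 <= project_number < 10**12
```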
xpk/core/cluster_private.py CHANGED
@@ -19,6 +19,7 @@ from ..utils.network import (
     add_current_machine_to_networks,
     is_current_machine_in_any_network,
 )
+from ..utils.execution_context import is_dry_run
 from ..utils.objects import is_text_true
 from .commands import run_command_for_value, run_command_with_updates
 from .gcloud_context import zone_to_region
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
   if not args.private and args.authorized_networks is None:
     xpk_print('Cluster is public and no need to authorize networks.')
     return 0
-  else:
+  elif not is_dry_run():
     xpk_print(
         'Cannot convert an existing public cluster to private. The arguments'
         ' --private and --authorized-networks are not acceptable for public'
@@ -95,7 +96,11 @@ def add_current_machine_to_networks_if_needed(
         "Adding current machine's IP address to the authorized networks"
         ' failed!'
     )
-    return add_current_machine_to_networks_return_code, authorized_networks
+    return (
+        add_current_machine_to_networks_return_code,
+        False,
+        authorized_networks,
+    )

   return 0, is_current_machine_in_network, authorized_networks

@@ -160,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]:
       command,
       'Fetching the list of authorized network from cluster describe.',
       args,
+      dry_run_return_val='127.0.0.1/32',
   )

   if return_code != 0:
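`is_dry_run` comes from the new `xpk/utils/execution_context.py` (listed above with +28 lines but not shown in this excerpt). A plausible minimal shape for such a module, offered purely as an assumption:

```python
# Hypothetical sketch of xpk/utils/execution_context.py; the real module is
# not shown in this diff. A module-level flag with a getter and setter would
# be enough to support the is_dry_run() calls threaded through the commands.
_dry_run: bool = False


def set_dry_run(dry_run: bool) -> None:
  """Record whether this xpk invocation is a dry run."""
  global _dry_run
  _dry_run = dry_run


def is_dry_run() -> bool:
  """True when xpk should render commands instead of executing them."""
  return _dry_run
```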
xpk/core/commands.py CHANGED
@@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
     The max return code and a list of all the return codes.
   """

+  files = [open(f, 'w', encoding='utf-8') for f in output_logs]
   children = []
   start_time = datetime.datetime.now()
-  for i, command in enumerate(commands):
+  for command, file in zip(commands, files):
     children.append(
         # subprocess managed by list pylint: disable=consider-using-with
-        subprocess.Popen(
-            command, stdout=output_logs[i], stderr=output_logs[i], shell=True
-        )
+        subprocess.Popen(command, stdout=file, stderr=file, shell=True)
     )

   while True:
@@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       slow_worker_text = per_command_name[slow_worker_index]
       slow_str = (
           f', task {slow_worker_text} still working, logfile'
-          f' {output_logs[slow_worker_index].name}'
+          f' {output_logs[slow_worker_index]}'
       )
     else:
       slow_str = ''
@@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       )
       xpk_print(
           f'Failure is {per_command_name[failing_index]}'
-          f' and logfile {output_logs[failing_index].name}'
+          f' and logfile {output_logs[failing_index]}'
       )
       for child in children:
         child.terminate()
@@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       break

     time.sleep(1)
+
+  for file in files:
+    file.close()
+
   return max_returncode, returncodes


@@ -274,9 +277,9 @@ def run_command_for_value(
     else:
       if not quiet:
         xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
-      out, err = child.communicate()
-      out, err = str(out, 'UTF-8'), str(err, 'UTF-8')
-      return return_code, f'{out}\n{err}'
+      out_bytes, err_bytes = child.communicate()
+      out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
+      return return_code, f'{out_str}\n{err_str}'
   else:
     if not quiet:
       xpk_print(
@@ -351,6 +354,6 @@ def run_command_with_full_controls(

 def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   err_code = run_command_with_updates(command, task, args)
   return err_code
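Note the contract change in `run_command_batch`: `output_logs` is now a list of log-file paths rather than already-open file objects; the function opens the files itself and closes them once all children have finished. A minimal hypothetical call under the new contract:

```python
# Hypothetical usage sketch after this change: pass paths, not file objects.
commands = ['echo hello', 'echo world']
per_command_name = ['task-1', 'task-2']
output_logs = ['/tmp/task-1.log', '/tmp/task-2.log']  # opened internally now

max_rc, rcs = run_command_batch(
    commands, 'demo-job', per_command_name, output_logs
)
```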
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.11.0'
+__version__ = 'v0.13.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

@@ -71,13 +71,12 @@ class XpkConfig:
     dir_path = '/'.join(self._config.split('/')[:-1])
     file.ensure_directory_exists(dir_path)

-    config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
     if not os.path.exists(self._config):
       return None

     with open(self._config, encoding='utf-8', mode='r') as stream:
       config_yaml: dict = yaml.load(stream)
-    return config_yaml
+      return config_yaml

   def _save_configs(self, config_yaml: dict) -> None:
     with open(self._config, encoding='utf-8', mode='w') as stream:
@@ -109,7 +108,7 @@ class XpkConfig:

   def get_all(
       self,
-  ) -> dict[str, dict[str, str] | str] | None:
+  ) -> dict[str, str] | None:
     config_yaml = self._open_configs()
     if config_yaml is None:
       return None
xpk/core/config_test.py ADDED
@@ -0,0 +1,71 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
+
+import os
+import pytest
+
+config_tmp_path = '/tmp/config/config.yaml'
+
+
+@pytest.fixture(name='_')
+def _():
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+  yield
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+
+
+def test_config(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('project-id', 'foo')
+  project_id = cfg.get('project-id')
+  assert project_id == 'foo'
+
+
+def test_config_get_all(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set(PROJECT_KEY, 'foo')
+  cfg.set(CLUSTER_NAME_KEY, 'bar')
+  cfg.set(ZONE_KEY, 'europe-west1-a')
+  cfg.set(CFG_BUCKET_KEY, 'cfg-bucket')
+
+  cfg_all = cfg.get_all()
+  assert cfg_all[PROJECT_KEY] == 'foo'
+  assert cfg_all[CLUSTER_NAME_KEY] == 'bar'
+  assert cfg_all[ZONE_KEY] == 'europe-west1-a'
+  assert cfg_all[CFG_BUCKET_KEY] == 'cfg-bucket'
+
+
+def test_config_get_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get(PROJECT_KEY)
+  assert val is None
+
+
+def test_config_get_all_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get_all()
+  assert not val
+
+
+def test_config_set_incorrect(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('foo', 'bar')
+  cfg_all = cfg.get_all()
+  assert not cfg_all
xpk/core/docker_image.py CHANGED
@@ -21,6 +21,7 @@ import string

 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from .commands import run_command_with_updates

 DEFAULT_DOCKER_IMAGE = 'python:3.10'
@@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   """

   # Pick a name for the docker image.
-  docker_image_prefix = os.getenv('USER', 'unknown')
+  docker_image_prefix = (
+      'dry-run' if is_dry_run() else os.getenv('USER', 'unknown')
+  )
   docker_name = f'{docker_image_prefix}-runner'

   script_dir_dockerfile = """FROM {base_docker_image}
@@ -94,7 +97,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   )
   tmp = write_tmp_file(docker_file)
   docker_build_command = (
-      f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t'
+      f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t'
       f' {docker_name} {args.script_dir}'
   )
   xpk_print(f'Building {args.script_dir} into docker image.')
@@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:

   # Pick a randomly generated `tag_length` character docker tag.
   tag_length = 4
-  tag_random_prefix = ''.join(
-      random.choices(string.ascii_lowercase, k=tag_length)
+  tag_random_prefix = (
+      'prefix'
+      if is_dry_run()
+      else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
+  )
+  tag_datetime = (
+      'current'
+      if is_dry_run()
+      else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   )
-  tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   tag_name = f'{tag_random_prefix}-{tag_datetime}'
   cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
   xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')
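With the fixed placeholders, dry runs now produce a deterministic image tag instead of a random, timestamped one. A small sketch mirroring the tag logic above (illustrative, not the module's API):

```python
import datetime
import random
import string


def make_tag_name(dry_run: bool) -> str:
  # Mirrors the diff above: fixed 'prefix-current' in dry-run mode keeps the
  # rendered docker/gcloud commands deterministic and diff-friendly.
  prefix = (
      'prefix' if dry_run
      else ''.join(random.choices(string.ascii_lowercase, k=4))
  )
  stamp = (
      'current' if dry_run
      else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
  )
  return f'{prefix}-{stamp}'


assert make_tag_name(dry_run=True) == 'prefix-current'
```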
xpk/core/docker_manager.py CHANGED
@@ -30,7 +30,7 @@ import time

 DockerRunCommandExitCode = 135
 dockerBuildErrorCode = 134
 ctk_dockerfile_path = "Dockerfile"
-ctk_build_ref = "v1.57.1"
+ctk_build_ref = "v1.62.2"
 ctk_docker_image = "xpk-ctk"
 ctk_container_name = "xpk-ctk-container"
 gcloud_cfg_mount_path = "/root/.config/gcloud"
xpk/core/docker_resources.py CHANGED
@@ -20,6 +20,7 @@ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import setup_k8s_env
 from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from ..utils.execution_context import is_dry_run


 def get_main_container_resources(
@@ -72,7 +73,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   if system.accelerator_type == AcceleratorType['CPU']:
     return get_cpu_env(args, system)

-  return format_env_dict(args.env, system)  # pytype: disable=bad-return-type
+  return format_env_dict(args.env, system)


 def get_gpu_env(args, system) -> str:
@@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
               - name: shared-data
   """

-  storages: list[Storage] = get_storages_to_mount(
-      setup_k8s_env(args), args.storage
+  storages: list[Storage] = (
+      []
+      if is_dry_run()
+      else get_storages_to_mount(setup_k8s_env(args), args.storage)
   )
   for storage in storages:
     if storage.type in {
@@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
   elif system.accelerator_type == AcceleratorType['GPU']:
     volume_mount_yaml = ''

-  storages: list[Storage] = get_storages_to_mount(
-      setup_k8s_env(args), args.storage
+  storages: list[Storage] = (
+      []
+      if is_dry_run()
+      else get_storages_to_mount(setup_k8s_env(args), args.storage)
   )
   for storage in storages:
     if storage.type in {
@@ -93,11 +93,12 @@ class FilestoreClient:
93
93
 
94
94
  for instance in instancesZonal:
95
95
  if instance.name == fullname_zonal:
96
- return instance # pytype: disable=bad-return-type
96
+ return instance
97
97
 
98
98
  for instance in instancesRegional:
99
99
  if instance.name == fullname_regional:
100
- return instance # pytype: disable=bad-return-type
100
+ return instance
101
+ return None
101
102
 
102
103
  def check_instance_exists(self) -> bool:
103
104
  """Check if Filestore instance exists"""
@@ -111,6 +112,7 @@ class FilestoreClient:
111
112
  def get_instance_location(self) -> str:
112
113
  """Get Filestore instance's location"""
113
114
  self.load_instance()
115
+ assert self.instance
114
116
  return str(self.instance.name.split("/")[3])
115
117
 
116
118
  def create_instance(
@@ -192,6 +194,7 @@ class FilestoreClient:
192
194
 
193
195
  def create_sc(self, name: str, network: str) -> dict:
194
196
  """Create a yaml representing filestore StorageClass."""
197
+ assert self.instance
195
198
  data = templates.load(FS_SC_PATH)
196
199
  data["metadata"]["name"] = get_storage_class_name(name)
197
200
  data["parameters"]["tier"] = self.instance.tier.name
@@ -202,6 +205,7 @@ class FilestoreClient:
202
205
 
203
206
  def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
204
207
  """Create a yaml representing filestore PersistentVolume."""
208
+ assert self.instance
205
209
  data = templates.load(FS_PV_PATH)
206
210
  data["metadata"]["name"] = get_pv_name(name)
207
211
  data["spec"]["storageClassName"] = get_storage_class_name(name)
@@ -219,6 +223,7 @@ class FilestoreClient:
219
223
 
220
224
  def create_pvc(self, name: str, access_mode: str) -> dict:
221
225
  """Create a yaml representing filestore PersistentVolumeClaim."""
226
+ assert self.instance
222
227
  data = templates.load(FS_PVC_PATH)
223
228
  data["metadata"]["name"] = get_pvc_name(name)
224
229
  data["spec"]["accessModes"] = [access_mode]
xpk/core/gcloud_context.py CHANGED
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
   xpk_print(f'Working on {args.project} and {args.zone}')


-def zone_to_region(zone) -> str:
+def zone_to_region(zone: str) -> str:
   """Helper function converts zone name to region name.

   Args:
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
     The region name.
   """
   zone_terms = zone.split('-')
-  return zone_terms[0] + '-' + zone_terms[1]  # pytype: disable=bad-return-type
+  return zone_terms[0] + '-' + zone_terms[1]


 @dataclass
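For reference, the now explicitly typed helper simply drops the zone's final suffix:

```python
# Behavior sketch of zone_to_region, copied from the function above.
def zone_to_region(zone: str) -> str:
  zone_terms = zone.split('-')
  return zone_terms[0] + '-' + zone_terms[1]


assert zone_to_region('us-central1-c') == 'us-central1'
assert zone_to_region('europe-west4-a') == 'europe-west4'
```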
xpk/core/jobset.py CHANGED
@@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
       memory_limit_size=new_memory_limit,
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'

   task = 'Updating jobset Controller Manager resources'
   return_code = run_command_with_updates_retry(command, task, args)