xpk 0.12.0__tar.gz → 0.13.0__tar.gz

This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (106)
  1. {xpk-0.12.0/src/xpk.egg-info → xpk-0.13.0}/PKG-INFO +1 -1
  2. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/batch.py +8 -8
  3. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/cluster.py +9 -8
  4. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/common.py +4 -0
  5. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/inspector.py +1 -1
  6. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/job.py +30 -2
  7. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/storage.py +5 -2
  8. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/workload.py +16 -9
  9. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster.py +5 -1
  10. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster_private.py +3 -1
  11. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/commands.py +10 -7
  12. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/config.py +1 -1
  13. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_image.py +14 -5
  14. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_resources.py +9 -4
  15. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/jobset.py +1 -1
  16. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kjob.py +5 -2
  17. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kueue.py +22 -6
  18. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nap.py +1 -1
  19. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/network.py +1 -1
  20. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nodepool.py +8 -3
  21. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/pathways.py +6 -2
  22. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/ray.py +1 -1
  23. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/resources.py +17 -7
  24. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/scheduling.py +4 -0
  25. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/main.py +4 -1
  26. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/cluster.py +43 -0
  27. xpk-0.13.0/src/xpk/utils/execution_context.py +28 -0
  28. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/file.py +25 -10
  29. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/network.py +4 -0
  30. {xpk-0.12.0 → xpk-0.13.0/src/xpk.egg-info}/PKG-INFO +1 -1
  31. {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/SOURCES.txt +1 -0
  32. {xpk-0.12.0 → xpk-0.13.0}/LICENSE +0 -0
  33. {xpk-0.12.0 → xpk-0.13.0}/README.md +0 -0
  34. {xpk-0.12.0 → xpk-0.13.0}/pyproject.toml +0 -0
  35. {xpk-0.12.0 → xpk-0.13.0}/setup.cfg +0 -0
  36. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/__init__.py +0 -0
  37. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/api/__init__.py +0 -0
  38. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/api/storage_crd.yaml +0 -0
  39. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/__init__.py +0 -0
  40. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/cluster_gcluster.py +0 -0
  41. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/config.py +0 -0
  42. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/info.py +0 -0
  43. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/kind.py +0 -0
  44. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/kjob_common.py +0 -0
  45. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/run.py +0 -0
  46. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/shell.py +0 -0
  47. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/version.py +0 -0
  48. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/__init__.py +0 -0
  49. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/__init__.py +0 -0
  50. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  51. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
  52. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  53. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/capacity.py +0 -0
  54. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/config_test.py +0 -0
  55. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_container.py +0 -0
  56. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_manager.py +0 -0
  57. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/filestore.py +0 -0
  58. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcloud_context.py +0 -0
  59. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcluster_manager.py +0 -0
  60. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcsfuse.py +0 -0
  61. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/monitoring.py +0 -0
  62. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/mtc.py +0 -0
  63. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nodepool_test.py +0 -0
  64. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/__init__.py +0 -0
  65. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  66. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  67. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/storage.py +0 -0
  68. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/system_characteristics.py +0 -0
  69. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/vertex.py +0 -0
  70. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload.py +0 -0
  71. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  72. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  73. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  74. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  75. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  76. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  77. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_test.py +0 -0
  78. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/__init__.py +0 -0
  79. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/batch.py +0 -0
  80. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/common.py +0 -0
  81. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/config.py +0 -0
  82. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/core.py +0 -0
  83. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/info.py +0 -0
  84. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/inspector.py +0 -0
  85. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/job.py +0 -0
  86. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/kind.py +0 -0
  87. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/run.py +0 -0
  88. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/shell.py +0 -0
  89. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/storage.py +0 -0
  90. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/validators.py +0 -0
  91. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/version.py +0 -0
  92. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/workload.py +0 -0
  93. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/templates/__init__.py +0 -0
  94. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/templates/storage.yaml +0 -0
  95. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/__init__.py +0 -0
  96. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/console.py +0 -0
  97. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/gcs_utils.py +0 -0
  98. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/kubectl.py +0 -0
  99. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/objects.py +0 -0
  100. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/templates.py +0 -0
  101. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/validation.py +0 -0
  102. {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/yaml.py +0 -0
  103. {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  104. {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/entry_points.txt +0 -0
  105. {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/requires.txt +0 -0
  106. {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/top_level.txt +0 -0
{xpk-0.12.0/src/xpk.egg-info → xpk-0.13.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.12.0
+Version: 0.13.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/batch.py
@@ -31,6 +31,7 @@ from ..core.kjob import (
 )
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.execution_context import is_dry_run
 from .kind import set_local_cluster_command
 from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
 
@@ -51,18 +52,16 @@ def batch(args: Namespace) -> None:
   if set_cluster_command_code != 0:
     xpk_exit(set_cluster_command_code)
 
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-  setup_k8s_service_accounts()
+  if not is_dry_run():
+    err_code = prepare_kjob(args)
+    if err_code > 0:
+      xpk_exit(err_code)
+    setup_k8s_service_accounts()
 
   submit_job(args)
 
 
 def submit_job(args: Namespace) -> None:
-
-  setup_k8s_service_accounts()
-
   cmd = (
       'kubectl kjob create slurm'
       f' --profile {AppProfileDefaults.NAME.value}'
@@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None:
   cmd = add_gpu_networking_annotations_to_command(args, cmd)
   cmd = add_TAS_annotations_to_command(args, cmd)
 
-  for annotation in get_storage_annotations(args):
+  annotations = [] if is_dry_run() else get_storage_annotations(args)
+  for annotation in annotations:
     cmd += f' --pod-template-annotation {annotation}'
 
   if args.ignore_unknown_flags:
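The is_dry_run helper imported above comes from src/xpk/utils/execution_context.py, a file new in 0.13.0 (entry 27 in the file list) whose 28 lines are not shown in this diff view. Judging from how it is used across the hunks, it is a small process-wide flag holder; a minimal sketch of the assumed shape (the setter name is a guess):

# Hypothetical reconstruction of src/xpk/utils/execution_context.py.
_dry_run = False


def set_dry_run(dry_run: bool) -> None:
  """Assumed setter, presumably called once from main.py after parsing --dry-run."""
  global _dry_run
  _dry_run = dry_run


def is_dry_run() -> bool:
  """The accessor imported throughout this release."""
  return _dry_run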
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/cluster.py
@@ -76,6 +76,7 @@ from ..core.vertex import create_vertex_tensorboard
 from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from . import cluster_gcluster
 from .common import set_cluster_command
 import shutil
@@ -128,9 +129,10 @@ def cluster_adapt(args) -> None:
 
   get_cluster_credentials(args)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -251,9 +253,10 @@ def cluster_create(args) -> None:
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -409,10 +412,8 @@ def cluster_cacheimage(args) -> None:
       nodeSelectorKey=node_selector_key,
   )
   tmp = write_tmp_file(yml_string)
-  command_apply = f'kubectl apply -f {str(tmp.file.name)}'
-  command_delete = (
-      f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
-  )
+  command_apply = f'kubectl apply -f {str(tmp)}'
+  command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
       command_delete, 'Deleting Cached Image', args
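This cluster_cacheimage hunk is the first of many below (commands.py, workload.py, jobset.py, kueue.py, docker_image.py, network.py, nap.py, ray.py) that swap tmp.file.name for str(tmp). Combined with the +25/-10 change to src/xpk/utils/file.py in the file list, this suggests write_tmp_file now returns the temporary file's path rather than a NamedTemporaryFile wrapper. A hedged sketch of the assumed new helper (the real one may also special-case dry-run):

import tempfile


def write_tmp_file(content: str) -> str:
  # Assumed shape: persist the content and hand back a plain path.
  with tempfile.NamedTemporaryFile(
      mode='w', encoding='utf-8', delete=False
  ) as f:
    f.write(content)
  return f.name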
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/common.py
@@ -18,6 +18,7 @@ from ..core.commands import run_command_with_updates_retry
 from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
 from ..core.gcloud_context import zone_to_region
 from ..utils.console import xpk_print, xpk_exit
+from ..utils.execution_context import is_dry_run
 from ..core.system_characteristics import (
     SystemCharacteristics,
 )
@@ -63,6 +64,9 @@ def is_TAS_possible(
     True if possible and False otherwise.
   """
 
+  if is_dry_run():
+    return True
+
   if system_characteristics is None:
     xpk_print('system_characteristics data was not found in configmaps.')
     xpk_exit(1)
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/inspector.py
@@ -346,7 +346,7 @@ def inspector(args) -> None:
   )
 
   # Summarize inspector:
-  xpk_print(f'Find xpk inspector output file: {inspector_file.name}')
+  xpk_print(f'Find xpk inspector output file: {inspector_file}')
 
   if final_return_code != 0:
     xpk_print(
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/job.py
@@ -28,6 +28,28 @@ from ..utils.console import xpk_exit, xpk_print
 from .kind import set_local_cluster_command
 
 
+JOBS_DRY_RUN_YAML = """
+items:
+- apiVersion: slurm.k8s.io/v1alpha1
+  kind: SlurmJob
+  metadata:
+    annotations:
+      kjobctl.x-k8s.io/script: echo hello
+    creationTimestamp: '2024-04-29T12:00:00Z'
+    labels:
+      kjobctl.x-k8s.io/app-profile: default
+    name: golden-job
+    namespace: default
+  spec:
+    script: echo hello
+"""
+
+PODS_DRY_RUN_RESULT = """
+foo-pod 2/2 Running 0 2d
+bar-pod 1/1 Evicted 0 1d
+"""
+
+
 def job_info(args):
   """Run commands obtaining information about a job given by name.
 
@@ -52,7 +74,10 @@ def job_info(args):
       f' metadata.name=={job_name}'
   )
   job_code, job_text = run_command_for_value(
-      job_command, 'Getting job info', args
+      job_command,
+      'Getting job info',
+      args,
+      dry_run_return_val=JOBS_DRY_RUN_YAML,
   )
   if job_code != 0:
     xpk_print(f'Job info request returned ERROR {job_code}')
@@ -60,7 +85,10 @@
 
   pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
   pods_code, pods_text = run_command_for_value(
-      pods_command, 'Getting pods list', args
+      pods_command,
+      'Getting pods list',
+      args,
+      dry_run_return_val=PODS_DRY_RUN_RESULT,
   )
   if pods_code != 0:
     xpk_print(f'Pods list request returned ERROR {pods_code}')
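job.py is the first caller to pass the new dry_run_return_val keyword to run_command_for_value; cluster_private.py and nodepool.py below do the same. The helper's implementation is not part of this excerpt; presumably it short-circuits in dry-run mode and returns the canned value, so downstream parsing still receives realistic output (here, the golden SlurmJob YAML and pod listing above). A sketch under that assumption (the real signature likely has more options):

import subprocess

from xpk.utils.execution_context import is_dry_run


def run_command_for_value(command, task, args, dry_run_return_val=''):
  # Assumed behavior: never execute in dry-run mode; report success
  # together with the canned output instead.
  if is_dry_run():
    return 0, dry_run_return_val
  proc = subprocess.run(
      command, shell=True, capture_output=True, text=True, check=False
  )
  return proc.returncode, proc.stdout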
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/storage.py
@@ -58,6 +58,7 @@ from ..core.storage import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.kubectl import apply_kubectl_manifest
+from ..utils.execution_context import is_dry_run
 
 
 def storage_create(args: Namespace) -> None:
@@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
 
 
 def storage_list(args: Namespace) -> None:
-  k8s_api_client = setup_k8s_env(args)
-  storages = list_storages(k8s_api_client)
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = list_storages(k8s_api_client)
   print_storages_for_cluster(storages)
 
 
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/workload.py
@@ -97,6 +97,7 @@ from ..core.workload_decorators import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from . import cluster_gcluster
 from .common import is_TAS_possible
 
@@ -306,8 +307,10 @@ def workload_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  k8s_api_client = setup_k8s_env(args)
-  setup_k8s_service_accounts()
+  k8s_api_client = None
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    setup_k8s_service_accounts()
 
   workload_exists = check_if_workload_exists(args)
 
@@ -383,8 +386,10 @@ def workload_create(args) -> None:
   all_storages = []
   # Currently storage customization is not supported for Pathways workloads. b/408468941
   if not args.use_pathways:
-    storages: list[Storage] = get_storages_to_mount(
-        k8s_api_client, args.storage
+    storages: list[Storage] = (
+        []
+        if k8s_api_client is None
+        else get_storages_to_mount(k8s_api_client, args.storage)
     )
     gcs_fuse_storages = list(
         filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
@@ -569,14 +574,14 @@
       pod_failure_policy=pod_failure_policy,
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   return_code = run_command_with_updates(command, 'Creating Workload', args)
 
   if return_code != 0:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  if not args.use_pathways:
+  if not args.use_pathways and not is_dry_run():
     add_bucket_iam_members(args, storages)
 
   # Get GKE outlier dashboard for TPU
@@ -725,7 +730,11 @@ def workload_delete(args) -> None:
     )
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands,
+        'Delete Workload',
+        task_names,
+        batch=100,
+        dry_run=args.dry_run,
     )
 
   if return_code != 0:
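workload_delete now threads the dry-run flag into run_commands explicitly (upgrade_gke_nodepools_version and try_to_delete_pathwaysjob_first below do the same). How run_commands honors the flag is not shown in this excerpt; presumably it echoes the batched commands rather than spawning them, roughly along these lines (the log-path scheme is hypothetical):

def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
  # Speculative sketch: print instead of executing when dry_run is set,
  # otherwise chunk into batches and delegate to run_command_batch
  # (see the core/commands.py hunks below).
  if dry_run:
    for name, command in zip(per_command_name, commands):
      xpk_print(f'[dry-run] {name}: {command}')
    return 0
  return_codes = []
  for i in range(0, len(commands), batch):
    names = per_command_name[i : i + batch]
    logs = [f'/tmp/{jobname}-{name}.log' for name in names]
    code, _ = run_command_batch(commands[i : i + batch], jobname, names, logs)
    return_codes.append(code)
  return max(return_codes, default=0)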
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/workload.py (continued)
@@ -743,8 +752,6 @@ def workload_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  xpk_print(args)
-
   xpk_print('Starting workload list', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster.py
@@ -442,7 +442,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
   if not getattr(args, 'kind_cluster', False):
     add_zone_and_project(args)
     get_cluster_credentials(args)
-    args.project_number = project_id_to_project_number(args.project)
+    args.project_number = (
+        project_id_to_project_number(args.project)
+        if not args.dry_run
+        else abs(hash(args.project) % (10**12))  # 12 digit hash
+    )
 
   config.load_kube_config()
   return k8s_client.ApiClient()
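Two caveats about the synthetic project number, worth knowing when reading dry-run output: Python randomizes str hashes per interpreter process unless PYTHONHASHSEED is pinned, so the fake number is stable within one xpk invocation but differs between runs; and because 10**12 is positive, the modulo is already non-negative (the abs() is redundant) and yields at most, not exactly, 12 digits. Illustration:

# Same expression as in setup_k8s_env above; the project id is hypothetical.
fake_project_number = abs(hash('my-sample-project') % (10**12))
print(fake_project_number)  # e.g. 734920118453 -- varies across runs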
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster_private.py
@@ -19,6 +19,7 @@ from ..utils.network import (
     add_current_machine_to_networks,
     is_current_machine_in_any_network,
 )
+from ..utils.execution_context import is_dry_run
 from ..utils.objects import is_text_true
 from .commands import run_command_for_value, run_command_with_updates
 from .gcloud_context import zone_to_region
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
   if not args.private and args.authorized_networks is None:
     xpk_print('Cluster is public and no need to authorize networks.')
     return 0
-  else:
+  elif not is_dry_run():
     xpk_print(
         'Cannot convert an existing public cluster to private. The arguments'
         ' --private and --authorized-networks are not acceptable for public'
@@ -164,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]:
       command,
      'Fetching the list of authorized network from cluster describe.',
       args,
+      dry_run_return_val='127.0.0.1/32',
   )
 
   if return_code != 0:
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/commands.py
@@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
     The max return code and a list of all the return codes.
   """
 
+  files = [open(f, 'w', encoding='utf-8') for f in output_logs]
   children = []
   start_time = datetime.datetime.now()
-  for i, command in enumerate(commands):
+  for command, file in zip(commands, files):
     children.append(
         # subprocess managed by list pylint: disable=consider-using-with
-        subprocess.Popen(
-            command, stdout=output_logs[i], stderr=output_logs[i], shell=True
-        )
+        subprocess.Popen(command, stdout=file, stderr=file, shell=True)
     )
 
   while True:
@@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       slow_worker_text = per_command_name[slow_worker_index]
       slow_str = (
           f', task {slow_worker_text} still working, logfile'
-          f' {output_logs[slow_worker_index].name}'
+          f' {output_logs[slow_worker_index]}'
       )
     else:
       slow_str = ''
@@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       )
       xpk_print(
           f'Failure is {per_command_name[failing_index]}'
-          f' and logfile {output_logs[failing_index].name}'
+          f' and logfile {output_logs[failing_index]}'
       )
       for child in children:
         child.terminate()
@@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       break
 
     time.sleep(1)
+
+  for file in files:
+    file.close()
+
   return max_returncode, returncodes
 
 
@@ -351,6 +354,6 @@ def run_command_with_full_controls(
 
 def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   err_code = run_command_with_updates(command, task, args)
   return err_code
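Net effect of the run_command_batch hunks above: output_logs is now a list of log file paths instead of pre-opened file objects. The function opens the files itself, hands them to subprocess.Popen, closes them once every child has finished, and prints the paths directly in its progress and failure messages. A caller after this change would look roughly like this (commands and paths hypothetical):

commands = ['echo one', 'echo two']
names = ['task-one', 'task-two']
logs = [f'/tmp/xpk-{name}.log' for name in names]  # plain paths now
max_code, codes = run_command_batch(commands, 'demo-job', names, logs)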
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/config.py
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print
 
 # This is the version for XPK PyPI package
-__version__ = 'v0.12.0'
+__version__ = 'v0.13.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
 
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_image.py
@@ -21,6 +21,7 @@ import string
 
 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from .commands import run_command_with_updates
 
 DEFAULT_DOCKER_IMAGE = 'python:3.10'
@@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   """
 
   # Pick a name for the docker image.
-  docker_image_prefix = os.getenv('USER', 'unknown')
+  docker_image_prefix = (
+      'dry-run' if is_dry_run() else os.getenv('USER', 'unknown')
+  )
   docker_name = f'{docker_image_prefix}-runner'
 
   script_dir_dockerfile = """FROM {base_docker_image}
@@ -94,7 +97,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   )
   tmp = write_tmp_file(docker_file)
   docker_build_command = (
-      f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t'
+      f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t'
       f' {docker_name} {args.script_dir}'
   )
   xpk_print(f'Building {args.script_dir} into docker image.')
@@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
 
   # Pick a randomly generated `tag_length` character docker tag.
   tag_length = 4
-  tag_random_prefix = ''.join(
-      random.choices(string.ascii_lowercase, k=tag_length)
+  tag_random_prefix = (
+      'prefix'
+      if is_dry_run()
+      else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
+  )
+  tag_datetime = (
+      'current'
+      if is_dry_run()
+      else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   )
-  tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   tag_name = f'{tag_random_prefix}-{tag_datetime}'
   cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
   xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')
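With the three dry-run substitutions above, the image reference becomes fully deterministic instead of depending on $USER, four random letters, and the current timestamp, presumably so the commands a dry run prints are reproducible (and testable against golden output):

# Dry-run naming, following the branches above:
#   docker_image_prefix = 'dry-run'  ->  docker_name = 'dry-run-runner'
#   tag_random_prefix = 'prefix', tag_datetime = 'current'
#   ->  tag_name = 'prefix-current'
project = 'my-project'  # hypothetical
cloud_docker_image = f'gcr.io/{project}/dry-run-runner:prefix-current'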
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_resources.py
@@ -20,6 +20,7 @@ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import setup_k8s_env
 from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from ..utils.execution_context import is_dry_run
 
 
 def get_main_container_resources(
@@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
               - name: shared-data
   """
 
-  storages: list[Storage] = get_storages_to_mount(
-      setup_k8s_env(args), args.storage
+  storages: list[Storage] = (
+      []
+      if is_dry_run()
+      else get_storages_to_mount(setup_k8s_env(args), args.storage)
   )
   for storage in storages:
     if storage.type in {
@@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
   elif system.accelerator_type == AcceleratorType['GPU']:
     volume_mount_yaml = ''
 
-  storages: list[Storage] = get_storages_to_mount(
-      setup_k8s_env(args), args.storage
+  storages: list[Storage] = (
+      []
+      if is_dry_run()
+      else get_storages_to_mount(setup_k8s_env(args), args.storage)
   )
   for storage in storages:
     if storage.type in {
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/jobset.py
@@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
       memory_limit_size=new_memory_limit,
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
 
   task = 'Updating jobset Controller Manager resources'
   return_code = run_command_with_updates_retry(command, task, args)
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kjob.py
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
 from kubernetes.client.rest import ApiException
 
 from ..utils import templates
+from ..utils.execution_context import is_dry_run
 from ..utils.console import xpk_exit, xpk_print
 from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
@@ -368,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
 def prepare_kjob(args: Namespace) -> int:
   system = get_cluster_system_characteristics(args)
 
-  k8s_api_client = setup_k8s_env(args)
-  storages = get_auto_mount_storages(k8s_api_client)
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = get_auto_mount_storages(k8s_api_client)
 
   service_account = ""
   if len(storages) > 0:
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kueue.py
@@ -436,6 +436,8 @@ def install_kueue_crs(
       cluster_hardware_name=cluster_hardware_name,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=args.cpu_limit,
+      memory_limit=args.memory_limit,
   )
   topology_label = ''
   if system.device_type in [
@@ -474,7 +476,7 @@
     yml_string = topology_yaml + yml_string
 
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
 
   task = 'Applying Kueue Custom Resources'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -484,7 +486,7 @@
 
 
 def get_kueue_covered_resources_config(
-    cluster_hardware_name, resource_type, total_chips
+    cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
 ) -> str:
   """Gets Kueue covered resources configuration.
 
@@ -497,17 +499,31 @@
     A string of Kueue covered resources configuration.
   """
   config_format = """
-  - coveredResources: ["{resource_type}"]
+  - coveredResources: {resource_types}
     flavors:
     - name: {cluster_hardware_name}
       resources:
      - name: "{resource_type}"
-        nominalQuota: {total_chips}
-  """
+        nominalQuota: {total_chips}"""
+  resource_types = [resource_type]
+  if cpu_limit:
+    config_format = config_format + """
+      - name: "cpu"
+        nominalQuota: {cpu_limit}"""
+    resource_types.append('cpu')
+  if memory_limit:
+    config_format = config_format + """
+      - name: "memory"
+        nominalQuota: {memory_limit}"""
+    resource_types.append('memory')
+
   config_string = config_format.format(
       cluster_hardware_name=cluster_hardware_name,
+      resource_types=resource_types,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=cpu_limit,
+      memory_limit=memory_limit,
   )
   return config_string
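One subtlety in get_kueue_covered_resources_config: resource_types is a Python list interpolated via str.format, so coveredResources renders as the list's repr, e.g. ['google.com/tpu', 'cpu', 'memory'], which YAML accepts as a flow sequence of single-quoted strings. A worked example (values hypothetical, indentation per the reconstruction above):

config = get_kueue_covered_resources_config(
    cluster_hardware_name='v5p-8-pool',
    resource_type='google.com/tpu',
    total_chips=8,
    cpu_limit=64,
    memory_limit='128Gi',
)
print(config)
#   - coveredResources: ['google.com/tpu', 'cpu', 'memory']
#     flavors:
#     - name: v5p-8-pool
#       resources:
#       - name: "google.com/tpu"
#         nominalQuota: 8
#       - name: "cpu"
#         nominalQuota: 64
#       - name: "memory"
#         nominalQuota: 128Gi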
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kueue.py (continued)
@@ -536,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
       memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
 
   task = 'Updating Kueue Controller Manager resources'
   return_code = run_command_with_updates_retry(command, task, args)
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nap.py
@@ -250,7 +250,7 @@ def create_autoprovisioning_config(
       zones=f'- {args.zone}',
   )
   autoprovisioning_config = AutoprovisioningConfig(
-      config_filename=write_tmp_file(yml_string).name,
+      config_filename=write_tmp_file(yml_string),
       minimum_chips=minimum,
       maximum_chips=maximum,
   )
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/network.py
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
   """
   yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
 
   return_code = run_command_with_updates(
       command, 'GKE Cluster Create Network Config', args
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nodepool.py
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
   )
   configmap_yml = {}
   configmap_yml[resources_configmap_name] = resources_yml
-  return_code = create_or_update_cluster_configmap(configmap_yml)
+  return_code = create_or_update_cluster_configmap(
+      configmap_yml, args.dry_run
+  )
   if return_code != 0:
     return 1
 
@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
       f' --region={zone_to_region(args.zone)} --format="value(locations)"'
   )
   return_code, nodepool_zone = run_command_for_value(
-      command, 'Get Node Pool Zone', args
+      command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
   )
   if return_code != 0:
     xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
   for i, command in enumerate(commands):
     xpk_print(f'To complete {task_names[i]} we are executing {command}')
   max_return_code = run_commands(
-      commands, 'Update GKE node pools to default RAPID GKE version', task_names
+      commands,
+      'Update GKE node pools to default RAPID GKE version',
+      task_names,
+      dry_run=args.dry_run,
   )
   if max_return_code != 0:
     xpk_print(
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/pathways.py
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.execution_context import is_dry_run
 from .system_characteristics import AcceleratorType, SystemCharacteristics
 
 
@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
   # Ensure the cluster and CPU nodepools were created with create-pathways
   all_node_pools = get_all_nodepools_programmatic(args)
   desired_pw_cpu_node_pools = {'cpu-np'}
-  if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
+  if (
+      not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
+      and not is_dry_run()
+  ):
     xpk_print(
         'Cluster needs to be created with `xpk create-pathways` to run'
         ' Pathways workloads.'
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
     return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
     )
 
   if return_code != 0:
{xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/ray.py
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
   )
 
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(