xpk 0.11.0__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {xpk-0.11.0/src/xpk.egg-info → xpk-0.13.0}/PKG-INFO +4 -1
  2. {xpk-0.11.0 → xpk-0.13.0}/pyproject.toml +24 -1
  3. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/batch.py +8 -8
  4. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/cluster.py +19 -19
  5. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/cluster_gcluster.py +2 -1
  6. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/common.py +7 -3
  7. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/info.py +12 -12
  8. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/inspector.py +1 -1
  9. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/job.py +42 -12
  10. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/kjob_common.py +2 -1
  11. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/storage.py +6 -3
  12. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/workload.py +28 -15
  13. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_generator.py +7 -7
  14. xpk-0.13.0/src/xpk/core/blueprint/blueprint_test.py +218 -0
  15. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/capacity.py +3 -1
  16. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/cluster.py +14 -8
  17. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/cluster_private.py +8 -2
  18. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/commands.py +13 -10
  19. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/config.py +3 -4
  20. xpk-0.13.0/src/xpk/core/config_test.py +71 -0
  21. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/docker_image.py +14 -5
  22. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/docker_manager.py +1 -1
  23. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/docker_resources.py +10 -5
  24. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/filestore.py +7 -2
  25. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/gcloud_context.py +2 -2
  26. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/jobset.py +1 -1
  27. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/kjob.py +7 -3
  28. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/kueue.py +28 -8
  29. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/nap.py +5 -5
  30. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/network.py +1 -1
  31. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/nodepool.py +8 -3
  32. xpk-0.13.0/src/xpk/core/nodepool_test.py +82 -0
  33. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/pathways.py +6 -2
  34. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/ray.py +1 -1
  35. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/resources.py +18 -14
  36. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/scheduling.py +4 -0
  37. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/storage.py +14 -14
  38. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/system_characteristics.py +1 -1
  39. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload.py +11 -0
  40. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/rdma_decorator.py +3 -2
  41. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/storage_decorator.py +2 -1
  42. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  43. xpk-0.13.0/src/xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  44. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  45. xpk-0.13.0/src/xpk/core/workload_test.py +28 -0
  46. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/main.py +12 -10
  47. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/cluster.py +110 -49
  48. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/common.py +45 -36
  49. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/storage.py +12 -13
  50. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/workload.py +57 -39
  51. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/console.py +2 -1
  52. xpk-0.13.0/src/xpk/utils/execution_context.py +28 -0
  53. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/file.py +25 -10
  54. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/network.py +4 -0
  55. {xpk-0.11.0 → xpk-0.13.0/src/xpk.egg-info}/PKG-INFO +4 -1
  56. {xpk-0.11.0 → xpk-0.13.0}/src/xpk.egg-info/SOURCES.txt +6 -0
  57. {xpk-0.11.0 → xpk-0.13.0}/src/xpk.egg-info/requires.txt +3 -0
  58. {xpk-0.11.0 → xpk-0.13.0}/LICENSE +0 -0
  59. {xpk-0.11.0 → xpk-0.13.0}/README.md +0 -0
  60. {xpk-0.11.0 → xpk-0.13.0}/setup.cfg +0 -0
  61. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/__init__.py +0 -0
  62. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/api/__init__.py +0 -0
  63. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/api/storage_crd.yaml +0 -0
  64. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/__init__.py +0 -0
  65. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/config.py +0 -0
  66. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/kind.py +0 -0
  67. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/run.py +0 -0
  68. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/shell.py +0 -0
  69. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/version.py +0 -0
  70. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/__init__.py +0 -0
  71. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/blueprint/__init__.py +0 -0
  72. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  73. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/docker_container.py +0 -0
  74. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/gcluster_manager.py +0 -0
  75. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/gcsfuse.py +0 -0
  76. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/monitoring.py +0 -0
  77. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/mtc.py +0 -0
  78. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/remote_state/__init__.py +0 -0
  79. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  80. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  81. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/vertex.py +0 -0
  82. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  83. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/__init__.py +0 -0
  84. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/batch.py +0 -0
  85. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/config.py +0 -0
  86. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/core.py +0 -0
  87. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/info.py +0 -0
  88. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/inspector.py +0 -0
  89. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/job.py +0 -0
  90. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/kind.py +0 -0
  91. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/run.py +0 -0
  92. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/shell.py +0 -0
  93. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/validators.py +0 -0
  94. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/parser/version.py +0 -0
  95. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/templates/__init__.py +0 -0
  96. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/templates/storage.yaml +0 -0
  97. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/__init__.py +0 -0
  98. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/gcs_utils.py +0 -0
  99. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/kubectl.py +0 -0
  100. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/objects.py +0 -0
  101. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/templates.py +0 -0
  102. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/validation.py +0 -0
  103. {xpk-0.11.0 → xpk-0.13.0}/src/xpk/utils/yaml.py +0 -0
  104. {xpk-0.11.0 → xpk-0.13.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  105. {xpk-0.11.0 → xpk-0.13.0}/src/xpk.egg-info/entry_points.txt +0 -0
  106. {xpk-0.11.0 → xpk-0.13.0}/src/xpk.egg-info/top_level.txt +0 -0
{xpk-0.11.0/src/xpk.egg-info → xpk-0.13.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.11.0
+Version: 0.13.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
@@ -28,6 +28,9 @@ Requires-Dist: pylint>=2.6.0; extra == "dev"
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: docker==7.1.0; extra == "dev"
+Requires-Dist: mypy~=1.17; extra == "dev"
+Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
+Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
 Dynamic: license-file
 
 <!--
{xpk-0.11.0 → xpk-0.13.0}/pyproject.toml
@@ -62,7 +62,10 @@ dev = [
     "pylint>=2.6.0",
     "pre-commit",
     "pytest",
-    "docker==7.1.0"
+    "docker==7.1.0",
+    "mypy ~= 1.17",
+    "types-PyYAML == 6.0.2",
+    "types-docker ~= 7.1.0.0",
 ]
 
 [tool.setuptools.dynamic]
@@ -79,3 +82,23 @@ line-length = 80
 unstable = true
 pyink-indentation = 2
 pyink-use-majority-quotes = true
+
+[tool.mypy]
+follow_untyped_imports = true
+warn_unreachable = true
+
+strict = true
+# Current code is not compatible with all of the strict flags:
+disallow_any_generics = false
+disallow_untyped_calls = false
+disallow_untyped_defs = false
+disallow_incomplete_defs = false
+check_untyped_defs = false
+no_implicit_reexport = false
+
+# Remove follow_imports below once the exclude list is empty:
+follow_imports = "silent"
+files = "src"
+exclude = [
+    'src/xpk/core/blueprint/blueprint_generator\.py',
+]
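
Editor's note: the new [tool.mypy] table turns on strict mode, then backs out the flags the codebase cannot satisfy yet. A minimal sketch (not from the xpk sources) of what this profile still permits versus what it now catches:

  def legacy(args):
    # Accepted: disallow_untyped_defs = false, and with
    # check_untyped_defs = false this body is not type-checked at all.
    return args.zone + 1


  def typed(zone: str | None) -> int:
    # Annotated defs keep the remaining strict checks, and
    # warn_unreachable = true flags the provably dead return below.
    if zone is None:
      return 0
    if isinstance(zone, str):
      return len(zone)
    return -1  # unreachable: zone can only be str here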
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/batch.py
@@ -31,6 +31,7 @@ from ..core.kjob import (
 )
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.execution_context import is_dry_run
 from .kind import set_local_cluster_command
 from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
 
@@ -51,18 +52,16 @@ def batch(args: Namespace) -> None:
   if set_cluster_command_code != 0:
     xpk_exit(set_cluster_command_code)
 
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-  setup_k8s_service_accounts()
+  if not is_dry_run():
+    err_code = prepare_kjob(args)
+    if err_code > 0:
+      xpk_exit(err_code)
+    setup_k8s_service_accounts()
 
   submit_job(args)
 
 
 def submit_job(args: Namespace) -> None:
-
-  setup_k8s_service_accounts()
-
   cmd = (
       'kubectl kjob create slurm'
       f' --profile {AppProfileDefaults.NAME.value}'
@@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None:
   cmd = add_gpu_networking_annotations_to_command(args, cmd)
   cmd = add_TAS_annotations_to_command(args, cmd)
 
-  for annotation in get_storage_annotations(args):
+  annotations = [] if is_dry_run() else get_storage_annotations(args)
+  for annotation in annotations:
     cmd += f' --pod-template-annotation {annotation}'
 
   if args.ignore_unknown_flags:
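
Editor's note: many hunks in this release gate cluster-touching work behind is_dry_run() from the new src/xpk/utils/execution_context.py (28 lines, listed above but not shown in this diff). A plausible sketch of such a module — only is_dry_run appears in the hunks; the setter and the module-level flag are assumptions:

  """Process-wide execution context; hypothetical reconstruction."""

  _dry_run: bool = False


  def set_dry_run(dry_run: bool) -> None:
    """Record once, at argument-parsing time, whether --dry-run was given."""
    global _dry_run
    _dry_run = dry_run


  def is_dry_run() -> bool:
    """True when xpk should print commands instead of executing them."""
    return _dry_run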
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/cluster.py
@@ -76,6 +76,7 @@ from ..core.vertex import create_vertex_tensorboard
 from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from . import cluster_gcluster
 from .common import set_cluster_command
 import shutil
@@ -92,7 +93,7 @@ def cluster_adapt(args) -> None:
 
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -128,9 +129,10 @@ def cluster_adapt(args) -> None:
 
   get_cluster_credentials(args)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -141,8 +143,6 @@ def cluster_adapt(args) -> None:
   if not tensorboard_config:
     xpk_exit(1)
 
-  # Provision node pools dynamically based on incoming workloads:
-  # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
   if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
@@ -201,7 +201,7 @@ def cluster_create(args) -> None:
   """
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -217,13 +217,13 @@ def cluster_create(args) -> None:
     xpk_exit(0)
 
   return_code, gke_server_config = get_gke_server_config(args)
-  if return_code != 0:
+  if return_code != 0 or gke_server_config is None:
     xpk_exit(return_code)
 
   return_code, gke_control_plane_version = get_gke_control_plane_version(
       args, gke_server_config
   )
-  if return_code != 0:
+  if return_code != 0 or gke_control_plane_version is None:
     xpk_exit(return_code)
 
   create_cluster_command_code = create_cluster_if_necessary(
@@ -253,9 +253,10 @@ def cluster_create(args) -> None:
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -294,7 +295,7 @@ def cluster_create(args) -> None:
   # Provision node pools dynamically based on incoming workloads:
   # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
-  if not args.enable_pathways and args.enable_autoprovisioning:
+  if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
     autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
         args, system
@@ -398,7 +399,7 @@ def cluster_cacheimage(args) -> None:
   get_cluster_credentials(args)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -411,10 +412,8 @@ def cluster_cacheimage(args) -> None:
       nodeSelectorKey=node_selector_key,
   )
   tmp = write_tmp_file(yml_string)
-  command_apply = f'kubectl apply -f {str(tmp.file.name)}'
-  command_delete = (
-      f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
-  )
+  command_apply = f'kubectl apply -f {str(tmp)}'
+  command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
       command_delete, 'Deleting Cached Image', args
@@ -808,6 +807,7 @@ def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
 
 def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
   """Check for the existence of a specific Deployment in a given namespace."""
+  # TODO: rewrite this to be more obvious, check if it is correct
   command = (
       f'kubectl get deployment {deployment_name} -n'
      f' {namespace} --ignore-not-found'
@@ -815,11 +815,11 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
   result = run_command_with_updates(
       command, 'Waiting for kubeDNS to be checked.', args
   )
-  return result
+  return result != 0
 
 
 def verify_coredns_readiness(
-    args, timeout: int = 120, namespace: str = 'kube-system'
+    args, timeout: int = 240, namespace: str = 'kube-system'
 ):
   """Verifies CoreDNS readiness using kubectl wait commands."""
   xpk_print('Now verifying CoreDNS readiness...')
@@ -874,7 +874,7 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns(args):
+def update_coredns(args) -> int:
  """Updates and deploys CoreDNS within a cluster.
 
   Args:
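
Editor's note: check_deployment_exists now returns result != 0 and gained a TODO questioning its own correctness (kubectl's --ignore-not-found exits 0 whether or not the Deployment exists, so the exit code alone cannot answer the question). One way the TODO could be resolved, sketched here using xpk's run_command_for_value helper seen elsewhere in this diff:

  def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
    """Check for the existence of a specific Deployment in a given namespace."""
    command = (
        f'kubectl get deployment {deployment_name}'
        f' -n {namespace} --ignore-not-found -o name'
    )
    code, out = run_command_for_value(command, 'Checking deployment', args)
    # --ignore-not-found exits 0 either way; absence shows as empty output.
    return code == 0 and out.strip() != ''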
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/cluster_gcluster.py
@@ -310,4 +310,5 @@ def generate_blueprint(
       system_node_pool_machine_type=args.default_pool_cpu_machine_type,
       system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
   )
-  return None
+  xpk_print('Device type is not supported.')
+  xpk_exit(1)
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/common.py
@@ -18,6 +18,7 @@ from ..core.commands import run_command_with_updates_retry
 from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
 from ..core.gcloud_context import zone_to_region
 from ..utils.console import xpk_print, xpk_exit
+from ..utils.execution_context import is_dry_run
 from ..core.system_characteristics import (
     SystemCharacteristics,
 )
@@ -50,8 +51,8 @@ def set_cluster_command(args) -> int:
 
 
 def is_TAS_possible(
-    system_characteristics: SystemCharacteristics,
-    capacity_type: CapacityType,
+    system_characteristics: SystemCharacteristics | None,
+    capacity_type: CapacityType | None,
     flex: bool,
 ) -> bool:
   """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
@@ -63,6 +64,9 @@ def is_TAS_possible(
     True if possible and False otherwise.
   """
 
+  if is_dry_run():
+    return True
+
   if system_characteristics is None:
     xpk_print('system_characteristics data was not found in configmaps.')
     xpk_exit(1)
@@ -71,7 +75,7 @@ def is_TAS_possible(
     xpk_print('capacity_type data was not found in configmaps.')
     xpk_exit(1)
 
-  if flex:
+  if not flex:
     return False
 
   if (
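
Editor's note: 0.13.0 also introduces pytest modules alongside the sources (blueprint_test.py, config_test.py, nodepool_test.py, ...). In that spirit, a sketch of a unit test for the corrected flex gate above — the SimpleNamespace stand-ins for the real dataclasses are an assumption, used only to get past the None checks:

  from types import SimpleNamespace

  from xpk.commands.common import is_TAS_possible


  def test_tas_requires_flex_capacity():
    # After the fix, flex=False short-circuits to False instead of True.
    assert not is_TAS_possible(
        system_characteristics=SimpleNamespace(),
        capacity_type=SimpleNamespace(),
        flex=False,
    )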
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/info.py
@@ -51,19 +51,19 @@ def info(args: Namespace) -> None:
   cqs = run_kueuectl_list_clusterqueue(args)
   quotas = get_nominal_quotas(cqs)
 
-  if lq:
+  if lq and lqs is not None:
     print_formatted_lqs(lqs, quotas)
 
   if cq:
     print_formatted_cqs(cqs, quotas)
 
 
-def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
+def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
   """Get quotas from clusterqueues.
   This function retrieves how much of resource in each flavor is assigned to cluster queue.
   It parses flavors of passed cluster queues.
   Args:
-  - cqs - list of cluster queues.
+  - cqs - string containing a list of cluster queues in JSON format.
   Returns:
   - dictionary of cluster queues resources quotas in format:
     {cq_name:{"flavorName:resourceName":quota}}
@@ -75,7 +75,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
     xpk_print(cqs)
     xpk_exit(1)
 
-  quotas = {}
+  quotas: dict[str, dict] = {}
   for cq in cq_list:
     spec = cq['spec']
     cq_name = cq['metadata']['name']
@@ -89,7 +89,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
   return quotas
 
 
-def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
+def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
   try:
     cq_list = json.loads(cqs)['items']
   except ValueError:
@@ -105,7 +105,7 @@ def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
   )
 
 
-def print_formatted_lqs(lqs: list[dict], nominalQuotas) -> None:
+def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
   try:
     lq_list = json.loads(lqs)['items']
   except ValueError:
@@ -143,18 +143,18 @@ def parse_queue_lists(
 
 
 def get_flavors_resources_reservations(
-    cq_name: str, flavors_res: list[dict]
+    cq_name: str, flavors_res: dict
 ) -> dict[str, dict[str, str]]:
   """Get usage of flavors resources.
   This function parser flavorsReservation section of clusterQueue of LocalQueue.
   Args:
   - cq_name - name of ClusterQueue to which flavors belong.
-  - flavors_res - list of reservations made by flavors
+  - flavors_res - dict of reservations made by flavors
   Returns:
   Dict containing usage of each resource in flavor for each flavor in cluster or local queue.
   Dict format: {cq_name: {{flavor:resource}:reservation}}
   """
-  reservations = {}
+  reservations: dict[str, dict] = {}
   reservations[cq_name] = {}
   for flavor_name, flavor_resources_reservation_list in flavors_res.items():
     for resource in flavor_resources_reservation_list:
@@ -167,15 +167,15 @@ def get_flavors_resources_reservations(
 
 def get_flavors_usage(
     q_entry: dict, res_field: str, flavor_resource_quotas: dict
-) -> list[dict]:
+) -> dict[str, str]:
   """Parse q_entry to retrieve list of each resource usage in flavour.
   Args:
   q_entry - single entry into either LocalQueue or ClusterQueue structured as json
   flavor_resource_quotas - nominalQuota of flavors resource usage for each clusterqueue
   Returns:
-  list of dicts where each list entry is in format (key, entry) where:
+  Dict where for each (key, value):
   - key is flavorName:resourceName
-  - entry is flavorResourceReservation/flavorResourceQuota
+  - value is string formatted as 'flavorResourceReservation/flavorResourceQuota'
   """
   status = q_entry['status']
   flavors_res = status[res_field]
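
Editor's note: get_nominal_quotas now takes the raw JSON string emitted by kueuectl rather than a pre-parsed list. A standalone illustration of the traversal it performs — the sample document is fabricated but follows the Kueue ClusterQueue schema (spec.resourceGroups[].flavors[].resources[].nominalQuota):

  import json

  cqs = json.dumps({'items': [{
      'metadata': {'name': 'cluster-queue'},
      'spec': {'resourceGroups': [{'flavors': [{
          'name': '2x2x2',
          'resources': [{'name': 'google.com/tpu', 'nominalQuota': 8}],
      }]}]},
  }]})

  quotas: dict[str, dict[str, str]] = {}
  for cq in json.loads(cqs)['items']:
    name = cq['metadata']['name']
    quotas[name] = {}
    for group in cq['spec']['resourceGroups']:
      for flavor in group['flavors']:
        for res in flavor['resources']:
          # Key format matches the docstring: "flavorName:resourceName".
          quotas[name][f'{flavor["name"]}:{res["name"]}'] = str(res['nominalQuota'])

  print(quotas)  # {'cluster-queue': {'2x2x2:google.com/tpu': '8'}}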
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/inspector.py
@@ -346,7 +346,7 @@ def inspector(args) -> None:
   )
 
   # Summarize inspector:
-  xpk_print(f'Find xpk inspector output file: {inspector_file.name}')
+  xpk_print(f'Find xpk inspector output file: {inspector_file}')
 
   if final_return_code != 0:
     xpk_print(
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/job.py
@@ -18,6 +18,7 @@ import re
 import sys
 
 from ruamel.yaml import YAML
+from typing import cast
 
 from ..core.commands import run_command_for_value, run_command_with_updates
 from ..core.cluster import get_cluster_credentials
@@ -27,6 +28,28 @@ from ..utils.console import xpk_exit, xpk_print
 from .kind import set_local_cluster_command
 
 
+JOBS_DRY_RUN_YAML = """
+items:
+- apiVersion: slurm.k8s.io/v1alpha1
+  kind: SlurmJob
+  metadata:
+    annotations:
+      kjobctl.x-k8s.io/script: echo hello
+    creationTimestamp: '2024-04-29T12:00:00Z'
+    labels:
+      kjobctl.x-k8s.io/app-profile: default
+    name: golden-job
+    namespace: default
+  spec:
+    script: echo hello
+"""
+
+PODS_DRY_RUN_RESULT = """
+foo-pod 2/2 Running 0 2d
+bar-pod 1/1 Evicted 0 1d
+"""
+
+
 def job_info(args):
   """Run commands obtaining information about a job given by name.
 
@@ -51,7 +74,10 @@ def job_info(args):
       f' metadata.name=={job_name}'
   )
   job_code, job_text = run_command_for_value(
-      job_command, 'Getting job info', args
+      job_command,
+      'Getting job info',
+      args,
+      dry_run_return_val=JOBS_DRY_RUN_YAML,
   )
   if job_code != 0:
     xpk_print(f'Job info request returned ERROR {job_code}')
@@ -59,7 +85,10 @@ def job_info(args):
 
   pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
   pods_code, pods_text = run_command_for_value(
-      pods_command, 'Getting pods list', args
+      pods_command,
+      'Getting pods list',
+      args,
+      dry_run_return_val=PODS_DRY_RUN_RESULT,
   )
   if pods_code != 0:
     xpk_print(f'Pods list request returned ERROR {pods_code}')
@@ -84,7 +113,7 @@ def job_info(args):
 
 
 def get_profile(job_yaml: dict) -> str:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
@@ -96,13 +125,13 @@ def get_profile(job_yaml: dict) -> str:
 
 
 def get_mounts(job_yaml: dict) -> list[dict]:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
       .get('containers', [])
   )
-  mounts = next(iter(containers), {}).get('volumeMounts', [])
+  mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
   return mounts
 
 
@@ -112,23 +141,24 @@ def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
   return search_res
 
 
-def get_pods(pods_text: str) -> list[str]:
+def get_pods(pods_text: str) -> list[dict[str, str]]:
   pods_lines = pods_text.strip().split('\n')
-  pods_lines = [line.split() for line in pods_lines]
+  pods_lines_tokenized = [line.split() for line in pods_lines]
   return [
       {
-          'Name': line[0],
-          'Status': line[2],
+          'Name': tokens[0],
+          'Status': tokens[2],
       }
-      for line in pods_lines
+      for tokens in pods_lines_tokenized
   ]
 
 
 def get_script_name(job_yaml: dict) -> str | None:
-  return (
+  return cast(
+      str | None,
       job_yaml.get('metadata', {})
       .get('annotations', {})
-      .get('kjobctl.x-k8s.io/script', '')
+      .get('kjobctl.x-k8s.io/script', ''),
  )
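
Editor's note: with get_pods now fully typed, its behaviour is easy to pin down against the canned dry-run output defined above — whitespace tokenization takes the pod name from column 1 and the status from column 3:

  pods = get_pods(PODS_DRY_RUN_RESULT)
  assert pods == [
      {'Name': 'foo-pod', 'Status': 'Running'},
      {'Name': 'bar-pod', 'Status': 'Evicted'},
  ]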
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/kjob_common.py
@@ -33,6 +33,7 @@ from ..core.resources import get_cluster_capacity_type, get_cluster_system_chara
 def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   gpu_type = get_gpu_type_from_cluster(args)
 
+  annotations: tuple
   if gpu_type == H100_MEGA_DEVICE_TYPE:
     annotations = get_a3mega_pod_template_annotations(args)
   elif gpu_type == H200_DEVICE_TYPE:
@@ -40,7 +41,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   elif gpu_type == B200_DEVICE_TYPE:
     annotations = get_a4_pod_template_annotations(args)
   else:
-    annotations = []
+    annotations = tuple()
 
   flags = [
       f" --pod-template-annotation {annotation} " for annotation in annotations
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/storage.py
@@ -58,6 +58,7 @@ from ..core.storage import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.kubectl import apply_kubectl_manifest
+from ..utils.execution_context import is_dry_run
 
 
 def storage_create(args: Namespace) -> None:
@@ -141,7 +142,7 @@ def storage_delete(args: Namespace) -> None:
 
 def storage_attach(args: Namespace) -> None:
   add_zone_and_project(args)
-  manifest = [{}]
+  manifest: list[dict] = [{}]
   if args.type == GCP_FILESTORE_TYPE:
     if args.instance is None:
       args.instance = args.name
@@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
 
 
 def storage_list(args: Namespace) -> None:
-  k8s_api_client = setup_k8s_env(args)
-  storages = list_storages(k8s_api_client)
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = list_storages(k8s_api_client)
   print_storages_for_cluster(storages)
 
 
{xpk-0.11.0 → xpk-0.13.0}/src/xpk/commands/workload.py
@@ -84,6 +84,7 @@ from ..core.system_characteristics import (
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
     check_if_workload_exists,
+    get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
     zone_to_region,
@@ -96,6 +97,7 @@ from ..core.workload_decorators import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from . import cluster_gcluster
 from .common import is_TAS_possible
 
@@ -226,7 +228,8 @@ spec:
       metadata:
         labels:
           xpk.google.com/workload: {args.workload}
-        annotations: {annotations}
+        annotations:
+          {annotations}
       spec:
         priorityClassName: {args.priority}
         restartPolicy: Never
@@ -304,8 +307,10 @@ def workload_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  k8s_api_client = setup_k8s_env(args)
-  setup_k8s_service_accounts()
+  k8s_api_client = None
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    setup_k8s_service_accounts()
 
   workload_exists = check_if_workload_exists(args)
 
@@ -319,7 +324,7 @@ def workload_create(args) -> None:
   xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -345,7 +350,7 @@ def workload_create(args) -> None:
   ):
     xpk_print(
         'Warning: Cluster has been created using XPK version:'
-        f' {cluster_config_map["xpk_version"]} but the XPK version you are'
+        f' {cluster_xpk_version} but the XPK version you are'
         f' using to schedule workload is: {XPK_CURRENT_VERSION}. Some features'
         ' might not be available for this cluster. We recommend to'
         ' upgrade/downgrade your XPK version or cluster by running `xpk'
@@ -354,7 +359,7 @@ def workload_create(args) -> None:
 
   debugging_dashboard_id = None
 
-  tensorboard_config = {}
+  tensorboard_config: dict | None = {}
   if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
     tensorboard_config = create_vertex_experiment(args)
     # exit if failed to create Experiment in Vertex AI
@@ -381,8 +386,10 @@ def workload_create(args) -> None:
   all_storages = []
   # Currently storage customization is not supported for Pathways workloads. b/408468941
   if not args.use_pathways:
-    storages: list[Storage] = get_storages_to_mount(
-        k8s_api_client, args.storage
+    storages: list[Storage] = (
+        []
+        if k8s_api_client is None
+        else get_storages_to_mount(k8s_api_client, args.storage)
     )
     gcs_fuse_storages = list(
         filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
@@ -450,8 +457,8 @@ def workload_create(args) -> None:
       - action: FailJobSet
         onJobFailureReasons:
         - PodFailurePolicy"""
-    restart_on_exit_codes = get_restart_exit_codes(args)
-    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+    restart_on_exit_codes_list = get_restart_exit_codes(args)
+    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
     pod_failure_policy = f"""
       podFailurePolicy:
         rules:
@@ -567,14 +574,14 @@ def workload_create(args) -> None:
       pod_failure_policy=pod_failure_policy,
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   return_code = run_command_with_updates(command, 'Creating Workload', args)
 
   if return_code != 0:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  if not args.use_pathways:
+  if not args.use_pathways and not is_dry_run():
     add_bucket_iam_members(args, storages)
 
   # Get GKE outlier dashboard for TPU
@@ -723,7 +730,11 @@ def workload_delete(args) -> None:
     )
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands,
+        'Delete Workload',
+        task_names,
+        batch=100,
+        dry_run=args.dry_run,
     )
 
   if return_code != 0:
@@ -741,8 +752,6 @@ def workload_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  xpk_print(args)
-
   xpk_print('Starting workload list', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
@@ -760,4 +769,8 @@ def workload_list(args) -> None:
     xpk_print(f'List Job request returned ERROR {return_code}')
     xpk_exit(return_code)
   xpk_print(f'Workload List Output:\n{return_value}')
+
+  workload_list_gcp_link = get_jobsets_list_gcp_link(project=args.project)
+  xpk_print(f'See your workloads in Cloud Console: {workload_list_gcp_link}')
+
   xpk_exit(0)
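
Editor's note: the job.py hunks pass dry_run_return_val into run_command_for_value, and workload_delete now forwards dry_run into run_commands. The real implementations live in src/xpk/core/commands.py and are not shown in this diff; a hypothetical sketch of the pattern — echo the command and hand back canned output so downstream parsers still receive well-formed input:

  import subprocess

  from xpk.utils.execution_context import is_dry_run  # added in 0.13.0


  def run_command_for_value(
      command: str,
      task: str,
      args,
      dry_run_return_val: str = '',
  ) -> tuple[int, str]:
    # Sketch only: under --dry-run, print what would run and return the
    # canned value instead of executing anything.
    if is_dry_run():
      print(f'[dry-run] {task}: {command}')
      return 0, dry_run_return_val
    proc = subprocess.run(command, shell=True, capture_output=True, text=True)
    return proc.returncode, proc.stdout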