xpk 0.12.0__tar.gz → 0.14.0__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (121)
  1. {xpk-0.12.0/src/xpk.egg-info → xpk-0.14.0}/PKG-INFO +6 -1
  2. {xpk-0.12.0 → xpk-0.14.0}/README.md +2 -0
  3. {xpk-0.12.0 → xpk-0.14.0}/pyproject.toml +4 -1
  4. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/batch.py +17 -10
  5. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/cluster.py +137 -123
  6. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/cluster_gcluster.py +77 -14
  7. xpk-0.14.0/src/xpk/commands/cluster_gcluster_test.py +177 -0
  8. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/common.py +13 -27
  9. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/info.py +11 -9
  10. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/inspector.py +22 -11
  11. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/job.py +53 -9
  12. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/kind.py +38 -40
  13. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/kjob_common.py +4 -4
  14. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/run.py +9 -2
  15. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/shell.py +13 -10
  16. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/storage.py +26 -2
  17. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/version.py +0 -4
  18. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/workload.py +58 -30
  19. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_generator.py +4 -40
  20. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_test.py +0 -6
  21. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/capacity.py +6 -5
  22. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/cluster.py +96 -195
  23. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/cluster_private.py +9 -12
  24. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/commands.py +21 -25
  25. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/config.py +1 -1
  26. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/docker_image.py +17 -9
  27. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/docker_resources.py +9 -4
  28. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/gcloud_context.py +26 -2
  29. xpk-0.14.0/src/xpk/core/gcloud_context_test.py +96 -0
  30. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/gcluster_manager.py +0 -3
  31. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/jobset.py +5 -8
  32. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/kjob.py +19 -29
  33. xpk-0.14.0/src/xpk/core/kueue_manager.py +383 -0
  34. xpk-0.14.0/src/xpk/core/kueue_manager_test.py +542 -0
  35. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/monitoring.py +1 -1
  36. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/nap.py +11 -16
  37. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/network.py +18 -19
  38. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/nodepool.py +65 -71
  39. xpk-0.14.0/src/xpk/core/nodepool_test.py +279 -0
  40. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/pathways.py +9 -5
  41. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/ray.py +11 -15
  42. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/resources.py +15 -10
  43. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/scheduling.py +23 -1
  44. xpk-0.14.0/src/xpk/core/scheduling_test.py +31 -0
  45. xpk-0.14.0/src/xpk/core/system_characteristics.py +733 -0
  46. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/vertex.py +1 -1
  47. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload.py +7 -8
  48. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/main.py +3 -2
  49. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/cluster.py +50 -0
  50. xpk-0.14.0/src/xpk/parser/cluster_test.py +66 -0
  51. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/common.py +11 -0
  52. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/workload.py +62 -25
  53. xpk-0.14.0/src/xpk/parser/workload_test.py +82 -0
  54. xpk-0.14.0/src/xpk/utils/execution_context.py +28 -0
  55. xpk-0.14.0/src/xpk/utils/feature_flags.py +28 -0
  56. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/file.py +25 -10
  57. xpk-0.14.0/src/xpk/utils/kueue.py +20 -0
  58. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/network.py +4 -0
  59. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/templates.py +2 -0
  60. xpk-0.14.0/src/xpk/utils/topology.py +37 -0
  61. xpk-0.14.0/src/xpk/utils/topology_test.py +43 -0
  62. xpk-0.14.0/src/xpk/utils/validation.py +104 -0
  63. xpk-0.14.0/src/xpk/utils/validation_test.py +37 -0
  64. {xpk-0.12.0 → xpk-0.14.0/src/xpk.egg-info}/PKG-INFO +6 -1
  65. {xpk-0.12.0 → xpk-0.14.0}/src/xpk.egg-info/SOURCES.txt +13 -1
  66. {xpk-0.12.0 → xpk-0.14.0}/src/xpk.egg-info/requires.txt +3 -0
  67. xpk-0.12.0/src/xpk/core/kueue.py +0 -545
  68. xpk-0.12.0/src/xpk/core/nodepool_test.py +0 -82
  69. xpk-0.12.0/src/xpk/core/system_characteristics.py +0 -627
  70. xpk-0.12.0/src/xpk/utils/validation.py +0 -80
  71. {xpk-0.12.0 → xpk-0.14.0}/LICENSE +0 -0
  72. {xpk-0.12.0 → xpk-0.14.0}/setup.cfg +0 -0
  73. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/__init__.py +0 -0
  74. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/api/__init__.py +0 -0
  75. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/api/storage_crd.yaml +0 -0
  76. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/__init__.py +0 -0
  77. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/commands/config.py +0 -0
  78. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/__init__.py +0 -0
  79. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/blueprint/__init__.py +0 -0
  80. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  81. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/config_test.py +0 -0
  82. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/docker_container.py +0 -0
  83. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/docker_manager.py +0 -0
  84. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/filestore.py +0 -0
  85. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/gcsfuse.py +0 -0
  86. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/mtc.py +0 -0
  87. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/remote_state/__init__.py +0 -0
  88. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  89. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  90. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/storage.py +0 -0
  91. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  92. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  93. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  94. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  95. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  96. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  97. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/core/workload_test.py +0 -0
  98. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/__init__.py +0 -0
  99. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/batch.py +0 -0
  100. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/config.py +0 -0
  101. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/core.py +0 -0
  102. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/info.py +0 -0
  103. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/inspector.py +0 -0
  104. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/job.py +0 -0
  105. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/kind.py +0 -0
  106. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/run.py +0 -0
  107. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/shell.py +0 -0
  108. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/storage.py +0 -0
  109. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/validators.py +0 -0
  110. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/parser/version.py +0 -0
  111. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/templates/__init__.py +0 -0
  112. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/templates/storage.yaml +0 -0
  113. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/__init__.py +0 -0
  114. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/console.py +0 -0
  115. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/gcs_utils.py +0 -0
  116. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/kubectl.py +0 -0
  117. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/objects.py +0 -0
  118. {xpk-0.12.0 → xpk-0.14.0}/src/xpk/utils/yaml.py +0 -0
  119. {xpk-0.12.0 → xpk-0.14.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  120. {xpk-0.12.0 → xpk-0.14.0}/src/xpk.egg-info/entry_points.txt +0 -0
  121. {xpk-0.12.0 → xpk-0.14.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.12.0
3
+ Version: 0.14.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -22,15 +22,18 @@ Requires-Dist: google-api-core==2.24.1
22
22
  Requires-Dist: packaging==24.2
23
23
  Requires-Dist: google-cloud-filestore==1.12.0
24
24
  Requires-Dist: google-cloud-storage
25
+ Requires-Dist: Jinja2==3.1.6
25
26
  Provides-Extra: dev
26
27
  Requires-Dist: pyink==24.3.0; extra == "dev"
27
28
  Requires-Dist: pylint>=2.6.0; extra == "dev"
28
29
  Requires-Dist: pre-commit; extra == "dev"
29
30
  Requires-Dist: pytest; extra == "dev"
31
+ Requires-Dist: pytest-mock==3.15.1; extra == "dev"
30
32
  Requires-Dist: docker==7.1.0; extra == "dev"
31
33
  Requires-Dist: mypy~=1.17; extra == "dev"
32
34
  Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
33
35
  Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
36
+ Requires-Dist: pylint-per-file-ignores==1.4.0; extra == "dev"
34
37
  Dynamic: license-file
35
38
 
36
39
  <!--
@@ -76,6 +79,7 @@ XPK supports the following TPU types:
76
79
  * v5e
77
80
  * v5p
78
81
  * Trillium (v6e)
82
+ * Ironwood (tpu7x)
79
83
 
80
84
  and the following GPU types:
81
85
  * A100
@@ -83,6 +87,7 @@ and the following GPU types:
83
87
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
84
88
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
85
89
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
90
+ * A4X (gb200)
86
91
 
87
92
  and the following CPU types:
88
93
  * n2-standard-32
@@ -41,6 +41,7 @@ XPK supports the following TPU types:
41
41
  * v5e
42
42
  * v5p
43
43
  * Trillium (v6e)
44
+ * Ironwood (tpu7x)
44
45
 
45
46
  and the following GPU types:
46
47
  * A100
@@ -48,6 +49,7 @@ and the following GPU types:
48
49
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
49
50
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
50
51
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
52
+ * A4X (gb200)
51
53
 
52
54
  and the following CPU types:
53
55
  * n2-standard-32
@@ -40,7 +40,8 @@ dependencies = [
40
40
  "google-api-core==2.24.1",
41
41
  "packaging==24.2",
42
42
  "google-cloud-filestore==1.12.0",
43
- "google-cloud-storage"
43
+ "google-cloud-storage",
44
+ "Jinja2==3.1.6"
44
45
  ]
45
46
 
46
47
  [project.urls]
@@ -62,10 +63,12 @@ dev = [
62
63
  "pylint>=2.6.0",
63
64
  "pre-commit",
64
65
  "pytest",
66
+ "pytest-mock==3.15.1",
65
67
  "docker==7.1.0",
66
68
  "mypy ~= 1.17",
67
69
  "types-PyYAML == 6.0.2",
68
70
  "types-docker ~= 7.1.0.0",
71
+ "pylint-per-file-ignores == 1.4.0",
69
72
  ]
70
73
 
71
74
  [tool.setuptools.dynamic]
@@ -29,8 +29,10 @@ from ..core.kjob import (
29
29
  get_storage_annotations,
30
30
  prepare_kjob,
31
31
  )
32
- from ..core.kueue import LOCAL_QUEUE_NAME
32
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
33
33
  from ..utils.console import xpk_exit, xpk_print
34
+ from ..utils.execution_context import is_dry_run
35
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
34
36
  from .kind import set_local_cluster_command
35
37
  from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
36
38
 
@@ -43,6 +45,12 @@ def batch(args: Namespace) -> None:
43
45
  Returns:
44
46
  None
45
47
  """
48
+ if should_validate_dependencies(args):
49
+ validate_dependencies_list([
50
+ SystemDependency.KUBECTL,
51
+ SystemDependency.KJOB,
52
+ SystemDependency.GCLOUD,
53
+ ])
46
54
  if not args.kind_cluster:
47
55
  add_zone_and_project(args)
48
56
  get_cluster_credentials(args)
@@ -51,18 +59,16 @@ def batch(args: Namespace) -> None:
51
59
  if set_cluster_command_code != 0:
52
60
  xpk_exit(set_cluster_command_code)
53
61
 
54
- err_code = prepare_kjob(args)
55
- if err_code > 0:
56
- xpk_exit(err_code)
57
- setup_k8s_service_accounts()
62
+ if not is_dry_run():
63
+ err_code = prepare_kjob(args)
64
+ if err_code > 0:
65
+ xpk_exit(err_code)
66
+ setup_k8s_service_accounts()
58
67
 
59
68
  submit_job(args)
60
69
 
61
70
 
62
71
  def submit_job(args: Namespace) -> None:
63
-
64
- setup_k8s_service_accounts()
65
-
66
72
  cmd = (
67
73
  'kubectl kjob create slurm'
68
74
  f' --profile {AppProfileDefaults.NAME.value}'
@@ -73,7 +79,8 @@ def submit_job(args: Namespace) -> None:
73
79
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
74
80
  cmd = add_TAS_annotations_to_command(args, cmd)
75
81
 
76
- for annotation in get_storage_annotations(args):
82
+ annotations = [] if is_dry_run() else get_storage_annotations(args)
83
+ for annotation in annotations:
77
84
  cmd += f' --pod-template-annotation {annotation}'
78
85
 
79
86
  if args.ignore_unknown_flags:
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
126
133
  if args.time is not None:
127
134
  cmd += f' --time {args.time}'
128
135
 
129
- return_code, return_value = run_command_for_value(cmd, 'submit job', args)
136
+ return_code, return_value = run_command_for_value(cmd, 'submit job')
130
137
 
131
138
  if return_code != 0:
132
139
  xpk_print(f'Running batch job returned ERROR {return_code}')