xpk 0.13.0__tar.gz → 0.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121):
  1. {xpk-0.13.0/src/xpk.egg-info → xpk-0.14.0}/PKG-INFO +6 -1
  2. {xpk-0.13.0 → xpk-0.14.0}/README.md +2 -0
  3. {xpk-0.13.0 → xpk-0.14.0}/pyproject.toml +4 -1
  4. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/batch.py +9 -2
  5. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/cluster.py +128 -115
  6. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/cluster_gcluster.py +77 -14
  7. xpk-0.14.0/src/xpk/commands/cluster_gcluster_test.py +177 -0
  8. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/common.py +10 -28
  9. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/info.py +11 -9
  10. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/inspector.py +21 -10
  11. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/job.py +25 -9
  12. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/kind.py +38 -40
  13. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/kjob_common.py +4 -4
  14. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/run.py +9 -2
  15. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/shell.py +13 -10
  16. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/storage.py +21 -0
  17. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/version.py +0 -4
  18. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/workload.py +43 -22
  19. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_generator.py +4 -40
  20. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_test.py +0 -6
  21. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/capacity.py +6 -5
  22. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/cluster.py +91 -194
  23. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/cluster_private.py +6 -11
  24. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/commands.py +11 -18
  25. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/config.py +1 -1
  26. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_image.py +3 -4
  27. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcloud_context.py +26 -2
  28. xpk-0.14.0/src/xpk/core/gcloud_context_test.py +96 -0
  29. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcluster_manager.py +0 -3
  30. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/jobset.py +4 -7
  31. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/kjob.py +14 -27
  32. xpk-0.14.0/src/xpk/core/kueue_manager.py +383 -0
  33. xpk-0.14.0/src/xpk/core/kueue_manager_test.py +542 -0
  34. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/monitoring.py +1 -1
  35. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/nap.py +10 -15
  36. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/network.py +17 -18
  37. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/nodepool.py +66 -77
  38. xpk-0.14.0/src/xpk/core/nodepool_test.py +279 -0
  39. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/pathways.py +5 -5
  40. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/ray.py +10 -14
  41. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/resources.py +6 -11
  42. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/scheduling.py +19 -1
  43. xpk-0.14.0/src/xpk/core/scheduling_test.py +31 -0
  44. xpk-0.14.0/src/xpk/core/system_characteristics.py +733 -0
  45. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/vertex.py +1 -1
  46. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload.py +7 -8
  47. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/main.py +2 -4
  48. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/cluster.py +7 -0
  49. xpk-0.14.0/src/xpk/parser/cluster_test.py +66 -0
  50. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/common.py +11 -0
  51. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/workload.py +62 -25
  52. xpk-0.14.0/src/xpk/parser/workload_test.py +82 -0
  53. xpk-0.14.0/src/xpk/utils/feature_flags.py +28 -0
  54. xpk-0.14.0/src/xpk/utils/kueue.py +20 -0
  55. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/templates.py +2 -0
  56. xpk-0.14.0/src/xpk/utils/topology.py +37 -0
  57. xpk-0.14.0/src/xpk/utils/topology_test.py +43 -0
  58. xpk-0.14.0/src/xpk/utils/validation.py +104 -0
  59. xpk-0.14.0/src/xpk/utils/validation_test.py +37 -0
  60. {xpk-0.13.0 → xpk-0.14.0/src/xpk.egg-info}/PKG-INFO +6 -1
  61. {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/SOURCES.txt +12 -1
  62. {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/requires.txt +3 -0
  63. xpk-0.13.0/src/xpk/core/kueue.py +0 -561
  64. xpk-0.13.0/src/xpk/core/nodepool_test.py +0 -82
  65. xpk-0.13.0/src/xpk/core/system_characteristics.py +0 -627
  66. xpk-0.13.0/src/xpk/utils/validation.py +0 -80
  67. {xpk-0.13.0 → xpk-0.14.0}/LICENSE +0 -0
  68. {xpk-0.13.0 → xpk-0.14.0}/setup.cfg +0 -0
  69. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/__init__.py +0 -0
  70. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/api/__init__.py +0 -0
  71. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/api/storage_crd.yaml +0 -0
  72. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/__init__.py +0 -0
  73. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/config.py +0 -0
  74. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/__init__.py +0 -0
  75. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/__init__.py +0 -0
  76. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  77. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/config_test.py +0 -0
  78. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_container.py +0 -0
  79. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_manager.py +0 -0
  80. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_resources.py +0 -0
  81. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/filestore.py +0 -0
  82. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcsfuse.py +0 -0
  83. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/mtc.py +0 -0
  84. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/__init__.py +0 -0
  85. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  86. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  87. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/storage.py +0 -0
  88. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  89. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  90. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  91. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  92. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  93. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  94. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_test.py +0 -0
  95. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/__init__.py +0 -0
  96. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/batch.py +0 -0
  97. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/config.py +0 -0
  98. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/core.py +0 -0
  99. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/info.py +0 -0
  100. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/inspector.py +0 -0
  101. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/job.py +0 -0
  102. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/kind.py +0 -0
  103. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/run.py +0 -0
  104. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/shell.py +0 -0
  105. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/storage.py +0 -0
  106. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/validators.py +0 -0
  107. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/version.py +0 -0
  108. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/templates/__init__.py +0 -0
  109. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/templates/storage.yaml +0 -0
  110. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/__init__.py +0 -0
  111. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/console.py +0 -0
  112. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/execution_context.py +0 -0
  113. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/file.py +0 -0
  114. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/gcs_utils.py +0 -0
  115. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/kubectl.py +0 -0
  116. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/network.py +0 -0
  117. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/objects.py +0 -0
  118. {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/yaml.py +0 -0
  119. {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  120. {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/entry_points.txt +0 -0
  121. {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.13.0
3
+ Version: 0.14.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -22,15 +22,18 @@ Requires-Dist: google-api-core==2.24.1
22
22
  Requires-Dist: packaging==24.2
23
23
  Requires-Dist: google-cloud-filestore==1.12.0
24
24
  Requires-Dist: google-cloud-storage
25
+ Requires-Dist: Jinja2==3.1.6
25
26
  Provides-Extra: dev
26
27
  Requires-Dist: pyink==24.3.0; extra == "dev"
27
28
  Requires-Dist: pylint>=2.6.0; extra == "dev"
28
29
  Requires-Dist: pre-commit; extra == "dev"
29
30
  Requires-Dist: pytest; extra == "dev"
31
+ Requires-Dist: pytest-mock==3.15.1; extra == "dev"
30
32
  Requires-Dist: docker==7.1.0; extra == "dev"
31
33
  Requires-Dist: mypy~=1.17; extra == "dev"
32
34
  Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
33
35
  Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
36
+ Requires-Dist: pylint-per-file-ignores==1.4.0; extra == "dev"
34
37
  Dynamic: license-file
35
38
 
36
39
  <!--
@@ -76,6 +79,7 @@ XPK supports the following TPU types:
76
79
  * v5e
77
80
  * v5p
78
81
  * Trillium (v6e)
82
+ * Ironwood (tpu7x)
79
83
 
80
84
  and the following GPU types:
81
85
  * A100
@@ -83,6 +87,7 @@ and the following GPU types:
83
87
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
84
88
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
85
89
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
90
+ * A4X (gb200)
86
91
 
87
92
  and the following CPU types:
88
93
  * n2-standard-32
@@ -41,6 +41,7 @@ XPK supports the following TPU types:
41
41
  * v5e
42
42
  * v5p
43
43
  * Trillium (v6e)
44
+ * Ironwood (tpu7x)
44
45
 
45
46
  and the following GPU types:
46
47
  * A100
@@ -48,6 +49,7 @@ and the following GPU types:
48
49
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
49
50
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
50
51
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
52
+ * A4X (gb200)
51
53
 
52
54
  and the following CPU types:
53
55
  * n2-standard-32
@@ -40,7 +40,8 @@ dependencies = [
40
40
  "google-api-core==2.24.1",
41
41
  "packaging==24.2",
42
42
  "google-cloud-filestore==1.12.0",
43
- "google-cloud-storage"
43
+ "google-cloud-storage",
44
+ "Jinja2==3.1.6"
44
45
  ]
45
46
 
46
47
  [project.urls]
@@ -62,10 +63,12 @@ dev = [
62
63
  "pylint>=2.6.0",
63
64
  "pre-commit",
64
65
  "pytest",
66
+ "pytest-mock==3.15.1",
65
67
  "docker==7.1.0",
66
68
  "mypy ~= 1.17",
67
69
  "types-PyYAML == 6.0.2",
68
70
  "types-docker ~= 7.1.0.0",
71
+ "pylint-per-file-ignores == 1.4.0",
69
72
  ]
70
73
 
71
74
  [tool.setuptools.dynamic]
@@ -29,9 +29,10 @@ from ..core.kjob import (
29
29
  get_storage_annotations,
30
30
  prepare_kjob,
31
31
  )
32
- from ..core.kueue import LOCAL_QUEUE_NAME
32
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
33
33
  from ..utils.console import xpk_exit, xpk_print
34
34
  from ..utils.execution_context import is_dry_run
35
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
35
36
  from .kind import set_local_cluster_command
36
37
  from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
37
38
 
@@ -44,6 +45,12 @@ def batch(args: Namespace) -> None:
44
45
  Returns:
45
46
  None
46
47
  """
48
+ if should_validate_dependencies(args):
49
+ validate_dependencies_list([
50
+ SystemDependency.KUBECTL,
51
+ SystemDependency.KJOB,
52
+ SystemDependency.GCLOUD,
53
+ ])
47
54
  if not args.kind_cluster:
48
55
  add_zone_and_project(args)
49
56
  get_cluster_credentials(args)
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
126
133
  if args.time is not None:
127
134
  cmd += f' --time {args.time}'
128
135
 
129
- return_code, return_value = run_command_for_value(cmd, 'submit job', args)
136
+ return_code, return_value = run_command_for_value(cmd, 'submit job')
130
137
 
131
138
  if return_code != 0:
132
139
  xpk_print(f'Running batch job returned ERROR {return_code}')