xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,107 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from xpk.core.docker_manager import DockerManager
18
+ from xpk.core.gcluster_manager import GclusterManager
19
+ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
20
+ import os
21
+ import pytest
22
+ import shutil
23
+
24
+ ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
25
+ project_id = os.getenv("PROJECT_ID")
26
+ region = os.getenv("REGION")
27
+ zone = os.getenv("ZONE")
28
+ auth_cidr = os.getenv("AUTH_CIDR")
29
+ cluster_name = os.getenv("GKE_ML_TEST_CLUSTER_NAME")
30
+
31
+ uploads_dir = "uploads"
32
+
33
+
34
+ def prepare_test(docker_path: str, bp_path: str) -> None:
35
+ if not os.path.exists(docker_path):
36
+ os.makedirs(docker_path)
37
+ if not os.path.exists(bp_path):
38
+ os.makedirs(bp_path)
39
+
40
+
41
+ @pytest.mark.skip(reason="Credentails not working. Skipping for now")
42
+ def test_create_deployment():
43
+ assert project_id is not None
44
+ assert region is not None
45
+ assert zone is not None
46
+ assert auth_cidr is not None
47
+ assert ctk_gcloud_cfg is not None
48
+ assert cluster_name is not None
49
+
50
+ pwd = os.getcwd()
51
+ test_docker_working_dir = os.path.join(
52
+ pwd, "xpkclusters/tests/xpk_test_docker_dir"
53
+ )
54
+ test_bp_dir = os.path.join(pwd, "xpkclusters/tests/xpk_test_bp_dir")
55
+ prepare_test(test_docker_working_dir, test_bp_dir)
56
+ blueprint_name = "my-test-blueprint"
57
+ prefix = "prefix"
58
+
59
+ docker_manager = DockerManager(
60
+ gcloud_cfg_path=ctk_gcloud_cfg, working_dir=test_docker_working_dir
61
+ )
62
+ docker_manager.initialize()
63
+
64
+ bpm = BlueprintGenerator(storage_path=test_bp_dir)
65
+ ml_gke_blueprint = bpm.generate_gke_ml_blueprint(
66
+ cluster_name=cluster_name,
67
+ blueprint_name=blueprint_name,
68
+ prefix=prefix,
69
+ region=region,
70
+ project_id=project_id,
71
+ auth_cidr=auth_cidr,
72
+ )
73
+ blueprint_test_path = os.path.join(
74
+ test_bp_dir, prefix, f"{blueprint_name}.yaml"
75
+ )
76
+ # there are no files in ghcp stage for this blueprint
77
+ blueprint_deps_test_path = ""
78
+
79
+ assert ml_gke_blueprint.blueprint_file == blueprint_test_path
80
+ assert ml_gke_blueprint.blueprint_dependencies == blueprint_deps_test_path
81
+
82
+ assert os.path.exists(blueprint_test_path)
83
+
84
+ gcluster_manager = GclusterManager(
85
+ gcluster_command_runner=docker_manager, remote_state_client=None
86
+ )
87
+
88
+ staged_bp_path = gcluster_manager.stage_files(
89
+ blueprint_file=ml_gke_blueprint.blueprint_file,
90
+ blueprint_dependencies=ml_gke_blueprint.blueprint_dependencies,
91
+ prefix=prefix,
92
+ )
93
+
94
+ assert staged_bp_path == os.path.join(
95
+ "/out", uploads_dir, prefix, f"{blueprint_name}.yaml"
96
+ )
97
+
98
+ gcluster_manager.deploy(
99
+ blueprint_path=staged_bp_path,
100
+ deployment_name=blueprint_name,
101
+ prefix=prefix,
102
+ )
103
+ gcluster_manager.destroy_deployment(
104
+ deployment_name=blueprint_name, prefix=prefix
105
+ )
106
+ shutil.rmtree(test_docker_working_dir)
107
+ shutil.rmtree(test_bp_dir)
xpk/commands/batch.py CHANGED
@@ -29,9 +29,10 @@ from ..core.kjob import (
29
29
  get_storage_annotations,
30
30
  prepare_kjob,
31
31
  )
32
- from ..core.kueue import LOCAL_QUEUE_NAME
32
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
33
33
  from ..utils.console import xpk_exit, xpk_print
34
34
  from ..utils.execution_context import is_dry_run
35
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
35
36
  from .kind import set_local_cluster_command
36
37
  from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
37
38
 
@@ -44,6 +45,12 @@ def batch(args: Namespace) -> None:
44
45
  Returns:
45
46
  None
46
47
  """
48
+ if should_validate_dependencies(args):
49
+ validate_dependencies_list([
50
+ SystemDependency.KUBECTL,
51
+ SystemDependency.KJOB,
52
+ SystemDependency.GCLOUD,
53
+ ])
47
54
  if not args.kind_cluster:
48
55
  add_zone_and_project(args)
49
56
  get_cluster_credentials(args)
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
126
133
  if args.time is not None:
127
134
  cmd += f' --time {args.time}'
128
135
 
129
- return_code, return_value = run_command_for_value(cmd, 'submit job', args)
136
+ return_code, return_value = run_command_for_value(cmd, 'submit job')
130
137
 
131
138
  if return_code != 0:
132
139
  xpk_print(f'Running batch job returned ERROR {return_code}')