xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py CHANGED
@@ -29,8 +29,10 @@ from ..core.kjob import (
29
29
  get_storage_annotations,
30
30
  prepare_kjob,
31
31
  )
32
- from ..core.kueue import LOCAL_QUEUE_NAME
32
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
33
33
  from ..utils.console import xpk_exit, xpk_print
34
+ from ..utils.execution_context import is_dry_run
35
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
34
36
  from .kind import set_local_cluster_command
35
37
  from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
36
38
 
@@ -43,6 +45,12 @@ def batch(args: Namespace) -> None:
43
45
  Returns:
44
46
  None
45
47
  """
48
+ if should_validate_dependencies(args):
49
+ validate_dependencies_list([
50
+ SystemDependency.KUBECTL,
51
+ SystemDependency.KJOB,
52
+ SystemDependency.GCLOUD,
53
+ ])
46
54
  if not args.kind_cluster:
47
55
  add_zone_and_project(args)
48
56
  get_cluster_credentials(args)
@@ -51,18 +59,16 @@ def batch(args: Namespace) -> None:
51
59
  if set_cluster_command_code != 0:
52
60
  xpk_exit(set_cluster_command_code)
53
61
 
54
- err_code = prepare_kjob(args)
55
- if err_code > 0:
56
- xpk_exit(err_code)
57
- setup_k8s_service_accounts()
62
+ if not is_dry_run():
63
+ err_code = prepare_kjob(args)
64
+ if err_code > 0:
65
+ xpk_exit(err_code)
66
+ setup_k8s_service_accounts()
58
67
 
59
68
  submit_job(args)
60
69
 
61
70
 
62
71
  def submit_job(args: Namespace) -> None:
63
-
64
- setup_k8s_service_accounts()
65
-
66
72
  cmd = (
67
73
  'kubectl kjob create slurm'
68
74
  f' --profile {AppProfileDefaults.NAME.value}'
@@ -73,7 +79,8 @@ def submit_job(args: Namespace) -> None:
73
79
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
74
80
  cmd = add_TAS_annotations_to_command(args, cmd)
75
81
 
76
- for annotation in get_storage_annotations(args):
82
+ annotations = [] if is_dry_run() else get_storage_annotations(args)
83
+ for annotation in annotations:
77
84
  cmd += f' --pod-template-annotation {annotation}'
78
85
 
79
86
  if args.ignore_unknown_flags:
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
126
133
  if args.time is not None:
127
134
  cmd += f' --time {args.time}'
128
135
 
129
- return_code, return_value = run_command_for_value(cmd, 'submit job', args)
136
+ return_code, return_value = run_command_for_value(cmd, 'submit job')
130
137
 
131
138
  if return_code != 0:
132
139
  xpk_print(f'Running batch job returned ERROR {return_code}')