xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/capacity.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import enum
18
+
19
+ from ..utils.console import xpk_print
20
+ from .commands import run_command_with_updates
21
+
22
# Autoprovisioning-related config value and keys.
# NOTE(review): the readers/writers of these keys live outside this module;
# presumably they index a cluster-level config map — confirm against callers.
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY = 'maximum_chips'
# Key under which the selected capacity type is stored (assumed; verify usage).
CAPACITY_TYPE_CONFIG_KEY = 'capacity_type'

# Device-type identifier strings for the H100, H100 Mega and H200 GPU shapes.
H100_DEVICE_TYPE = 'h100-80gb-8'
H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
H200_DEVICE_TYPE = 'h200-141gb-8'
# Key under which the reservation id is stored (assumed; verify usage).
RESERVATION_CONFIG_KEY = 'reservation_id'
31
+
32
+
33
@enum.unique
class CapacityType(enum.Enum):
  """Closed set of capacity kinds a request can be served from.

  Each member's value is the lower-case spelling of its name. `UNKNOWN` is the
  placeholder for "no capacity kind has been determined".
  """

  ON_DEMAND = 'on_demand'
  RESERVATION = 'reservation'
  SPOT = 'spot'
  UNKNOWN = 'unknown'
38
+
39
+
40
def print_reservations(args) -> int:
  """List the reservations that exist in the user's project.

  Builds a `gcloud beta compute reservations list` command for `args.project`
  and hands it to `run_command_with_updates`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the command reported success and 1 otherwise.
  """
  list_cmd = (
      f'gcloud beta compute reservations list --project={args.project}'
  )
  status = run_command_with_updates(
      list_cmd, 'Get all reservations in the project', args
  )
  if status == 0:
    return 0

  # Any non-zero status is reported, then normalised to a plain 1.
  xpk_print(f'Get all reservations returned ERROR {status}')
  return 1
57
+
58
+
59
def get_capacity_type(args) -> tuple[CapacityType, int]:
  """Work out which capacity type the user asked for.

  The `--on-demand`, `--reservation` and `--spot` arguments are inspected in
  that order. A reservation is checked with `verify_reservation_exists` before
  it is accepted; if that check fails the function returns immediately.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple with the CapacityType that was selected (UNKNOWN when no capacity
    argument was given) and
    int of 0 if successful and 1 otherwise.
  """
  # Every capacity type the user asked for, in the order it was examined.
  chosen: list[CapacityType] = []
  status = 0

  if args.on_demand:
    chosen.append(CapacityType.ON_DEMAND)
  if args.reservation:
    status = verify_reservation_exists(args)
    if status > 0:
      # The reservation could not be verified: report whatever was selected
      # before it, together with the failure code.
      return (chosen[-1] if chosen else CapacityType.UNKNOWN), status
    chosen.append(CapacityType.RESERVATION)
  if args.spot:
    chosen.append(CapacityType.SPOT)

  # The capacity arguments are mutually exclusive.
  if len(chosen) > 1:
    xpk_print(
        'ERROR: User specified more than one of the following arguments. Please'
        ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
        ' or `--spot`.'
    )
    status = 1

  # The most recently examined type wins; no argument at all means UNKNOWN.
  return (chosen[-1] if chosen else CapacityType.UNKNOWN), status
99
+
100
+
101
def verify_reservation_exists(args) -> int:
  """Check that the reservation named by the user can be described.

  Builds a `gcloud beta compute reservations describe` command from
  `args.reservation`, `args.project` and `args.zone` and hands it to
  `run_command_with_updates`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the command reported success and 1 otherwise.
  """
  describe_cmd = (
      'gcloud beta compute reservations describe'
      f' {args.reservation} --project={args.project} --zone={args.zone}'
  )
  status = run_command_with_updates(describe_cmd, 'Describe reservation', args)
  if status == 0:
    return 0

  xpk_print(f'Describe reservation returned ERROR {status}')
  xpk_print('Please confirm that your reservation name is correct.')
  return 1
120
+
121
+
122
def get_capacity_arguments_from_capacity_type(
    args, capacity_type: CapacityType
) -> tuple[str, int]:
  """Translate a capacity type into nodepool-creation command-line arguments.

  Args:
    args: user provided arguments for running the command. `args.reservation`
      is only read for the RESERVATION capacity type.
    capacity_type: The type of capacity the user configured.

  Returns:
    Tuple with string with the capacity argument to use (empty for on-demand
    and for unrecognised types) and
    int of 0 if successful and 1 otherwise.
  """
  # On-demand capacity needs no extra argument.
  if capacity_type == CapacityType.ON_DEMAND:
    return '', 0
  if capacity_type == CapacityType.SPOT:
    return '--spot', 0
  if capacity_type == CapacityType.RESERVATION:
    return (
        f'--reservation-affinity=specific --reservation={args.reservation}',
        0,
    )

  # Anything else (including UNKNOWN) cannot be mapped to arguments.
  xpk_print(
      f'Unknown capacity type: {capacity_type}. Unable to determine'
      ' capacity args.'
  )
  return '', 1
154
+
155
+
156
+ def get_capacity_node_selectors_from_capacity_type(
157
+ args, capacity_type: str
158
+ ) -> tuple[str, int]:
159
+ """Determine the node selectors for a workload to run on a specific capacity type.
160
+
161
+ Args:
162
+ args: user provided arguments for running the command.
163
+ capacity_type: The type of capacity the user configured.
164
+
165
+ Returns:
166
+ Tuple with string with the node selectors to use and
167
+ int of 0 if successful and 1 otherwise.
168
+ """
169
+ node_selector = ''
170
+ return_code = 0
171
+
172
+ match capacity_type:
173
+ case CapacityType.ON_DEMAND.name:
174
+ node_selector = ''
175
+ case CapacityType.SPOT.name:
176
+ node_selector = 'cloud.google.com/gke-spot="true"'
177
+ case CapacityType.RESERVATION.name:
178
+ node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
179
+ case _:
180
+ xpk_print(
181
+ f'Unknown capacity type: {capacity_type}. Unable to determine the'
182
+ ' node selectors.'
183
+ )
184
+ return_code = 1
185
+ return node_selector, return_code