xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,185 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..core.blueprint.blueprint_generator import BlueprintGenerator, BlueprintGeneratorOutput, supported_device_types, a3mega_device_type, a3ultra_device_type
18
+ from ..core.docker_manager import DockerManager
19
+ from ..core.gcluster_manager import GclusterManager
20
+ from ..core.core import zone_to_region, get_capacity_type
21
+ from ..utils.console import xpk_exit, xpk_print
22
+ from ..utils.network import all_IPs_cidr
23
+ from ..utils.file import ensure_directory_exists
24
+ from ..utils.objects import hash_string
25
+ import os
26
+
27
+ blueprints_path = os.path.abspath('xpkclusters/blueprints')  # blueprint storage dir, resolved against the current working directory
28
+ gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')  # working dir handed to DockerManager, also resolved against the cwd
29
+ gcloud_cfg_path = os.path.expanduser('~/.config/gcloud')  # gcloud config dir in the user's home; its existence is the authentication check
30
+
31
+
32
def cluster_create(args) -> None:
  """Create a cluster using Cluster Toolkit, then exit the process.

  Generates a blueprint for the requested cluster, stages it in gcluster's
  working directory and deploys it.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The outcome is reported through xpk_exit: 0 after the deployment
    call returns, 1 when no blueprint could be generated.
  """
  check_gcloud_authenticated()
  prepare_directories()
  gcm = prepare_gcluster_manager()
  region = zone_to_region(args.zone)

  # unique_name uses a shortened hash string, so a name collision is still
  # possible.
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix prevents name collisions for blueprints and deployments by storing
  # them in a prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
  prefix = get_prefix_path(args.project, region)

  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
  if bp is None:
    # generate_blueprint returns None when args.device_type has no blueprint;
    # fail with a message instead of an AttributeError on bp.blueprint_file.
    xpk_print(
        'Failed to generate a blueprint: unsupported device type'
        f' {args.device_type}.'
    )
    xpk_exit(1)

  # staging: sending the blueprint file(s) to gcluster's working directory
  bp_staged_path = gcm.stage_files(
      blueprint_file=bp.blueprint_file,
      blueprint_dependencies=bp.blueprint_dependencies,
      prefix=prefix,
  )
  gcm.deploy(
      blueprint_path=bp_staged_path,
      deployment_name=unique_name,
      prefix=prefix,
  )

  xpk_exit(0)
66
+
67
+
68
def cluster_delete(args) -> None:
  """Destroy a cluster deployment that was created by Cluster Toolkit.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The function finishes by calling xpk_exit(0).
  """
  check_gcloud_authenticated()
  prepare_directories()
  manager = prepare_gcluster_manager()
  cluster_region = zone_to_region(args.zone)

  # The deployment name carries only a shortened hash, so collisions remain
  # possible; the prefix directory ({project}-{region}) narrows that down.
  deployment = get_unique_name(args.project, cluster_region, args.cluster)
  deployment_prefix = get_prefix_path(args.project, cluster_region)

  manager.destroy_deployment(
      deployment_name=deployment, prefix=deployment_prefix
  )

  xpk_exit(0)
90
+
91
+
92
def created_by_gcluster(args) -> bool:
  """Tell whether a blueprint is stored locally for the requested cluster.

  Args:
    args: user provided arguments; zone, project and cluster are read.

  Returns:
    The result of BlueprintGenerator.blueprint_exists for the cluster's
    unique name and prefix.
  """
  prepare_directories()
  cluster_region = zone_to_region(args.zone)
  blueprint_name = get_unique_name(args.project, cluster_region, args.cluster)
  blueprint_prefix = get_prefix_path(args.project, cluster_region)
  generator = prepare_blueprint_generator()
  return generator.blueprint_exists(blueprint_name, blueprint_prefix)
99
+
100
+
101
def get_unique_name(project_id, region, cluster_name):
  """Build the blueprint/deployment name for a cluster.

  Args:
    project_id: project the cluster belongs to.
    region: region the cluster belongs to.
    cluster_name: name of the cluster.

  Returns:
    cluster_name followed by '-' and a 5-character hash of the lowercased
    '{project_id}-{region}-{cluster_name}' string.
  """
  hash_source = '{}-{}-{}'.format(project_id, region, cluster_name).lower()
  suffix = hash_string(input_string=hash_source, length=5)
  return '{}-{}'.format(cluster_name, suffix)
106
+
107
+
108
def get_prefix_path(project_id, region):
  """Return the lowercased '{project_id}-{region}' prefix directory name."""
  prefix = '{}-{}'.format(project_id, region)
  return prefix.lower()
110
+
111
+
112
def prepare_directories() -> None:
  """Ensure the blueprint and gcluster working directories exist."""
  for directory in (blueprints_path, gcluster_working_dir):
    ensure_directory_exists(directory)
115
+
116
+
117
def check_gcloud_authenticated():
  """Exit with code 1 when the gcloud config directory is missing.

  The check only tests that gcloud_cfg_path exists on disk; when it does not,
  a hint on how to authenticate is printed before exiting.
  """
  if os.path.exists(gcloud_cfg_path):
    return

  missing_dir_message = (
      'Failed to find gcloud credential directory.'
      f' {gcloud_cfg_path} {blueprints_path} {gcluster_working_dir}'
  )
  login_hint = (
      'Please authenticate to gcloud ("gcloud auth application-default'
      ' login") and then run your command.'
  )
  xpk_print(missing_dir_message)
  xpk_print(login_hint)
  xpk_exit(1)
128
+
129
+
130
def prepare_gcluster_manager() -> GclusterManager:
  """Build a GclusterManager that runs commands through DockerManager.

  Returns:
    A GclusterManager whose command runner is an initialized DockerManager
    configured with the gcluster working directory and gcloud config path.
  """
  docker_manager = DockerManager(
      gcloud_cfg_path=gcloud_cfg_path,
      working_dir=gcluster_working_dir,
  )
  docker_manager.initialize()
  manager = GclusterManager(gcluster_command_runner=docker_manager)
  return manager
136
+
137
+
138
def prepare_blueprint_generator() -> BlueprintGenerator:
  """Return a BlueprintGenerator that stores blueprints in blueprints_path."""
  generator = BlueprintGenerator(storage_path=blueprints_path)
  return generator
140
+
141
+
142
def generate_blueprint(
    blueprint_name, args, prefix=None
) -> BlueprintGeneratorOutput:
  """Generate the Cluster Toolkit blueprint matching args.device_type.

  Args:
    blueprint_name: name given to the generated blueprint.
    args: user provided arguments for running the command.
    prefix: optional prefix directory the blueprint is stored under.

  Returns:
    The BlueprintGeneratorOutput produced by the generator, or None when
    args.device_type is not one of the device types handled here. Exits via
    xpk_exit when the capacity type cannot be determined.
  """
  capacity_type, return_code = get_capacity_type(args)
  if return_code != 0:
    xpk_print('Capacity type is invalid.')
    xpk_exit(return_code)

  bpg = prepare_blueprint_generator()

  if args.device_type not in supported_device_types:
    return None

  # Both device types take exactly the same arguments; only the generator
  # method differs.
  if args.device_type == a3mega_device_type:
    generate = bpg.generate_a3_mega_blueprint
  elif args.device_type == a3ultra_device_type:
    generate = bpg.generate_a3_ultra_blueprint
  else:
    return None

  # Default to 2 nodes only when the user did not pass a value at all.
  num_nodes = args.num_nodes if args.num_nodes is not None else 2
  return generate(
      blueprint_name=blueprint_name,
      prefix=prefix,
      cluster_name=args.cluster,
      region=zone_to_region(args.zone),
      project_id=args.project,
      zone=args.zone,
      auth_cidr=all_IPs_cidr,
      num_nodes=num_nodes,
      reservation=args.reservation if args.reservation else None,
      capacity_type=capacity_type,
      system_node_pool_machine_type=args.default_pool_cpu_machine_type,
      system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
  )
xpk/commands/info.py ADDED
@@ -0,0 +1,245 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_exit, xpk_print
18
+ from ..core.kueue import verify_kueuectl
19
+ from .cluster import set_cluster_command
20
+ from ..core.commands import (
21
+ run_command_for_value,
22
+ )
23
+ from ..core.core import (
24
+ add_zone_and_project,
25
+ )
26
+ import json
27
+ from tabulate import tabulate
28
+ from argparse import Namespace
29
+
30
+ table_fmt = 'plain'  # tablefmt value passed to tabulate() when printing the queue usage tables
31
+
32
+
33
def info(args: Namespace) -> None:
  """Print usage of localqueues and clusterqueues together with their quotas.

  When neither args.localqueue nor args.clusterqueue is set, both kinds of
  queues are shown.

  Args:
    args: user provided arguments for running the command.
  Returns:
    None
  """
  add_zone_and_project(args)
  cluster_command_code = set_cluster_command(args)
  if cluster_command_code != 0:
    xpk_exit(cluster_command_code)

  verify_kueuectl(args)

  show_local = bool(args.localqueue)
  show_cluster = bool(args.clusterqueue)
  if not (show_local or show_cluster):
    show_local = show_cluster = True

  local_queues = run_kueuectl_list_localqueue(args) if show_local else None

  # Cluster queues are always fetched: their nominal quotas are needed for
  # the local queue table as well.
  cluster_queues = run_kueuectl_list_clusterqueue(args)
  nominal_quotas = get_nominal_quotas(cluster_queues)

  if show_local:
    print_formatted_lqs(local_queues, nominal_quotas)

  if show_cluster:
    print_formatted_cqs(cluster_queues, nominal_quotas)
63
+
64
+
65
def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
  """Get nominal quotas from clusterqueues.

  Walks every resource group and flavor of each cluster queue and records how
  much of each resource is assigned to the queue.

  Args:
    cqs: JSON string with the cluster queue list; its 'items' entry holds the
      cluster queues.
  Returns:
    Dictionary of cluster queue resource quotas in the format:
    {cq_name: {"flavorName:resourceName": quota}}. A cluster queue without
    resource groups maps to an empty dictionary.
  """
  try:
    cq_list = json.loads(cqs)['items']
  except (ValueError, KeyError, TypeError):
    # ValueError covers malformed JSON; KeyError/TypeError cover valid JSON
    # that is not an object with an 'items' entry.
    xpk_print('Incorrect response from list clusterqueue')
    xpk_print(cqs)
    xpk_exit(1)

  quotas = {}
  for cq in cq_list:
    cq_quotas = {}
    # resourceGroups may be absent (or null) on a cluster queue spec.
    for rg in cq['spec'].get('resourceGroups') or []:
      for flavor in rg['flavors']:
        flavor_name = flavor['name']
        for resource in flavor['resources']:
          key = f'{flavor_name}:{resource["name"]}'
          cq_quotas[key] = resource['nominalQuota']
    quotas[cq['metadata']['name']] = cq_quotas
  return quotas
94
+
95
+
96
def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
  """Print a usage table for cluster queues.

  Args:
    cqs: JSON string with the cluster queue list; its 'items' entry holds the
      cluster queues.
    nominalQuotas: nominal quotas per cluster queue, in the format
      {cq_name: {"flavorName:resourceName": quota}}.
  Returns:
    None. Exits with code 1 when cqs cannot be parsed.
  """
  try:
    cq_list = json.loads(cqs)['items']
  except (ValueError, KeyError, TypeError):
    # ValueError covers malformed JSON; KeyError/TypeError cover valid JSON
    # that is not an object with an 'items' entry.
    xpk_print('Incorrect response from list clusterqueue')
    xpk_print(cqs)
    xpk_exit(1)

  cq_usages = parse_queue_lists(cq_list, nominalQuotas)

  xpk_print(
      'Cluster Queues usage \n',
      tabulate(cq_usages, headers='keys', tablefmt=table_fmt),
  )
110
+
111
+
112
def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
  """Print a usage table for local queues.

  Args:
    lqs: JSON string with the local queue list; its 'items' entry holds the
      local queues.
    nominalQuotas: nominal quotas per cluster queue, in the format
      {cq_name: {"flavorName:resourceName": quota}}.
  Returns:
    None. Exits with code 1 when lqs cannot be parsed.
  """
  try:
    lq_list = json.loads(lqs)['items']
  except (ValueError, KeyError, TypeError):
    # ValueError covers malformed JSON; KeyError/TypeError cover valid JSON
    # that is not an object with an 'items' entry.
    xpk_print('Incorrect response from list localqueue')
    xpk_print(lqs)
    xpk_exit(1)

  lq_usages = parse_queue_lists(lq_list, nominalQuotas)
  xpk_print(
      'Local Queues usage \n',
      tabulate(lq_usages, headers='keys', tablefmt=table_fmt),
  )
125
+
126
+
127
def parse_queue_lists(
    qs: list[dict],
    flavor_resource_quotas: dict,
    reservation_key: str = 'flavorsReservation',
) -> list[dict]:
  """Turn queue entries into table rows describing their usage.

  Args:
    qs: queue entries (local or cluster queues) as parsed json dicts.
    flavor_resource_quotas: nominal quotas per cluster queue.
    reservation_key: name of the status field holding flavor reservations.
  Returns:
    One dict per queue with QUEUE, ADMITTED_WORKLOADS and PENDING_WORKLOADS
    entries followed by the per-flavor usage entries of that queue.
  """

  def build_row(queue: dict) -> dict:
    name = queue['metadata']['name']
    pending = queue['status']['pendingWorkloads']
    admitted = queue['status']['admittedWorkloads']
    row = {
        'QUEUE': name,
        'ADMITTED_WORKLOADS': admitted,
        'PENDING_WORKLOADS': pending,
    }
    flavor_usage = get_flavors_usage(
        queue, reservation_key, flavor_resource_quotas
    )
    row.update(flavor_usage)
    return row

  return [build_row(queue) for queue in qs]
147
+
148
+
149
def get_flavors_resources_reservations(
    cq_name: str, flavors_res: dict[str, list[dict]]
) -> dict[str, dict[str, str]]:
  """Get reservations of flavors resources.

  This function flattens per-flavor resource reservations into
  "flavorName:resourceName" keys.

  Args:
    cq_name: name of the ClusterQueue to which the flavors belong.
    flavors_res: mapping of flavor name to the list of resource entries of
      that flavor; each entry provides 'name' and 'total'.
  Returns:
    Dict with a single cq_name key, in the format:
    {cq_name: {"flavorName:resourceName": reservation}}
  """
  return {
      cq_name: {
          f'{flavor_name}:{resource["name"]}': resource['total']
          for flavor_name, resources in flavors_res.items()
          for resource in resources
      }
  }
170
+
171
+
172
def get_flavors_usage(
    q_entry: dict, res_field: str, flavor_resource_quotas: dict
) -> dict[str, str]:
  """Parse q_entry to retrieve the usage of each resource in each flavor.

  Args:
    q_entry: single LocalQueue or ClusterQueue entry structured as json.
    res_field: name of the status field holding the flavor reservations.
    flavor_resource_quotas: nominal quotas of flavor resources for each
      clusterqueue, in the format {cq_name: {"flavorName:resourceName": quota}}.
  Returns:
    Dict where:
      - key is flavorName:resourceName
      - value is the string "flavorResourceReservation/flavorResourceQuota"
  """
  status = q_entry['status']
  flavors_res = status[res_field]
  queue_type = q_entry['kind']

  flavors_res = {flavor['name']: flavor['resources'] for flavor in flavors_res}
  # A ClusterQueue is looked up by its own name; any other kind of entry is
  # looked up by the cluster queue named in its spec.
  cq_name = (
      q_entry['metadata']['name']
      if queue_type == 'ClusterQueue'
      else q_entry['spec']['clusterQueue']
  )

  reservations = get_flavors_resources_reservations(cq_name, flavors_res)

  usage_fraction = {}
  for reserved_cq_name, cq_reservations in reservations.items():
    cq_nominal_quotas = flavor_resource_quotas[reserved_cq_name]

    # NOTE(review): a quota key with no matching reservation raises KeyError.
    for flavor_resource, flavor_resource_quota in cq_nominal_quotas.items():
      flavor_resource_reservation = cq_reservations[flavor_resource]
      usage_fraction[flavor_resource] = (
          f'{flavor_resource_reservation}/{flavor_resource_quota}'
      )
  return usage_fraction
207
+
208
+
209
def run_kueuectl_list_localqueue(args: Namespace) -> str:
  """List local queues with kueuectl and return the raw output.

  The listing is limited to args.namespace unless it is an empty string.

  Args:
    args: user provided arguments for running the command.

  Returns:
    kueuectl list localqueue output formatted as json string.
  """
  command_parts = ['kubectl kueue list localqueue -o json']
  if args.namespace != '':
    command_parts.append(f'--namespace {args.namespace}')
  command = ' '.join(command_parts)

  exit_code, output = run_command_for_value(command, 'list localqueue', args)
  if exit_code != 0:
    xpk_print(f'Cluster info request returned ERROR {exit_code}')
    xpk_exit(exit_code)
  return output
227
+
228
+
229
def run_kueuectl_list_clusterqueue(args: Namespace) -> str:
  """List cluster queues with kueuectl and return the raw output.

  Args:
    args: user provided arguments for running the command.

  Returns:
    kueuectl list clusterqueue output formatted as json string.
  """
  exit_code, output = run_command_for_value(
      'kubectl kueue list clusterqueue -o json', 'list clusterqueue', args
  )
  failed = exit_code != 0
  if failed:
    xpk_print(f'Cluster info request returned ERROR {exit_code}')
    xpk_exit(exit_code)
  return output