xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/core/kjob.py
CHANGED
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
 from kubernetes.client.rest import ApiException

 from ..utils import templates
+from ..utils.execution_context import is_dry_run
 from ..utils.console import xpk_exit, xpk_print
 from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
@@ -277,7 +278,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
   job_spec = rdma_decorator.decorate_kjob_template(job_spec)
   job_template_dict = yaml.safe_load(yml_string)
   job_template_dict["template"] = job_spec
-
+  yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
+  return yaml_result


 def create_job_template_instance(
@@ -367,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
 def prepare_kjob(args: Namespace) -> int:
   system = get_cluster_system_characteristics(args)

-
-
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = get_auto_mount_storages(k8s_api_client)

   service_account = ""
   if len(storages) > 0:
xpk/core/kueue.py
CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
 KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
-WAIT_FOR_KUEUE_TIMEOUT = '
+WAIT_FOR_KUEUE_TIMEOUT = '10m'
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096

@@ -89,6 +89,10 @@ metadata:
   name: dws-config
 spec:
   provisioningClassName: queued-provisioning.gke.io
+  podSetUpdates:
+    nodeSelector:
+      - key: autoscaling.gke.io/provisioning-request
+        valueFromProvisioningClassDetail: ResizeRequestName
   managedResources:
   - {managed_resource}
 ---
@@ -320,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
   return return_code


-def get_kueue_version(args) ->
+def get_kueue_version(args) -> tuple[int, str]:
   command = 'kubectl kueue version'
   task = 'Get kueue version on server'
   return_code, val = run_command_for_value(command, task, args)
@@ -432,6 +436,8 @@ def install_kueue_crs(
       cluster_hardware_name=cluster_hardware_name,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=args.cpu_limit,
+      memory_limit=args.memory_limit,
   )
   topology_label = ''
   if system.device_type in [
@@ -470,7 +476,7 @@ def install_kueue_crs(
     yml_string = topology_yaml + yml_string

   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   task = 'Applying Kueue Custom Resources'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -480,7 +486,7 @@ def install_kueue_crs(


 def get_kueue_covered_resources_config(
-    cluster_hardware_name, resource_type, total_chips
+    cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
 ) -> str:
   """Gets Kueue covered resources configuration.

@@ -493,17 +499,31 @@ def get_kueue_covered_resources_config(
     A string of Kueue covered resources configuration.
   """
   config_format = """
-  - coveredResources:
+  - coveredResources: {resource_types}
     flavors:
     - name: {cluster_hardware_name}
       resources:
       - name: "{resource_type}"
-        nominalQuota: {total_chips}
-
+        nominalQuota: {total_chips}"""
+  resource_types = [resource_type]
+  if cpu_limit:
+    config_format = config_format + """
+      - name: "cpu"
+        nominalQuota: {cpu_limit}"""
+    resource_types.append('cpu')
+  if memory_limit:
+    config_format = config_format + """
+      - name: "memory"
+        nominalQuota: {memory_limit}"""
+    resource_types.append('memory')
+
   config_string = config_format.format(
       cluster_hardware_name=cluster_hardware_name,
+      resource_types=resource_types,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=cpu_limit,
+      memory_limit=memory_limit,
   )
   return config_string

@@ -532,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
       memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   task = 'Updating Kueue Controller Manager resources'
   return_code = run_command_with_updates_retry(command, task, args)
xpk/core/nap.py
CHANGED
@@ -37,6 +37,7 @@ from .resources import (
 )
 from .scheduling import get_total_chips_requested_from_args
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from typing import cast

 AUTOPROVISIONING_CONFIG_FILE = """
 management:
@@ -249,7 +250,7 @@ def create_autoprovisioning_config(
       zones=f'- {args.zone}',
   )
   autoprovisioning_config = AutoprovisioningConfig(
-      config_filename=write_tmp_file(yml_string)
+      config_filename=write_tmp_file(yml_string),
       minimum_chips=minimum,
       maximum_chips=maximum,
   )
@@ -269,9 +270,6 @@ def is_autoprovisioning_enabled(
     bool is true if autoprovisioning is enabled, false otherwise.
     int of 0 if successful and 1 otherwise.
   """
-  # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
-  if args.use_pathways:
-    return False, 0

   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
   cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -339,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
     )
     return node_selector_args, 1

-  return_code,
+  return_code, optional_capacity_type_str = get_value_from_map(
      CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
  )
  if return_code != 0:
    return node_selector_args, return_code
+  # return_code==0 implies capacity_type is defined
+  capacity_type_str = cast(str, optional_capacity_type_str)

  if capacity_type_str == CapacityType.RESERVATION.name:
    return_code, args.reservation = get_value_from_map(
xpk/core/network.py
CHANGED
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
   """
   yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   return_code = run_command_with_updates(
       command, 'GKE Cluster Create Network Config', args
xpk/core/nodepool.py
CHANGED
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
   )
   configmap_yml = {}
   configmap_yml[resources_configmap_name] = resources_yml
-  return_code = create_or_update_cluster_configmap(
+  return_code = create_or_update_cluster_configmap(
+      configmap_yml, args.dry_run
+  )
   if return_code != 0:
     return 1

@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
       f' --region={zone_to_region(args.zone)} --format="value(locations)"'
   )
   return_code, nodepool_zone = run_command_for_value(
-      command, 'Get Node Pool Zone', args
+      command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
   )
   if return_code != 0:
     xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
   for i, command in enumerate(commands):
     xpk_print(f'To complete {task_names[i]} we are executing {command}')
   max_return_code = run_commands(
-      commands,
+      commands,
+      'Update GKE node pools to default RAPID GKE version',
+      task_names,
+      dry_run=args.dry_run,
   )
   if max_return_code != 0:
     xpk_print(
xpk/core/nodepool_test.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/pathways.py
CHANGED
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.execution_context import is_dry_run
 from .system_characteristics import AcceleratorType, SystemCharacteristics


@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
   # Ensure the cluster and CPU nodepools were created with create-pathways
   all_node_pools = get_all_nodepools_programmatic(args)
   desired_pw_cpu_node_pools = {'cpu-np'}
-  if
+  if (
+      not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
+      and not is_dry_run()
+  ):
     xpk_print(
         'Cluster needs to be created with `xpk create-pathways` to run'
         ' Pathways workloads.'
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
     return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
     )

   if return_code != 0:
xpk/core/ray.py
CHANGED
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
   )

   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
xpk/core/resources.py
CHANGED
@@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   )

   return_code, return_value = run_command_for_value(
-      command,
+      command,
+      'GKE Cluster Get ConfigMap',
+      args,
+      dry_run_return_val='map[]',
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   configs = return_value[4:-1].split(' ')

   for config in configs:
-
-
+    parts = config.strip().split(':')
+    if len(parts) != 2:
+      continue
+    config_map[parts[0]] = parts[1]
   return config_map


@@ -108,13 +113,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
@@ -156,10 +155,12 @@ def create_cluster_configmaps(
       args=args, name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml)
+  return create_or_update_cluster_configmap(configmap_yml, args.dry_run)


-def create_or_update_cluster_configmap(
+def create_or_update_cluster_configmap(
+    configmap_yml: dict, dry_run: bool
+) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -171,13 +172,16 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   task_names = []
   for configmap_name, yml_string in configmap_yml.items():
     tmp = write_tmp_file(yml_string)
-    command = f'kubectl apply -f {str(tmp
+    command = f'kubectl apply -f {str(tmp)}'
     commands.append(command)
     task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
     task_names.append(task_name)

   return_code = run_commands(
-      commands,
+      commands,
+      'GKE Cluster CreateOrUpdate ConfigMap(s)',
+      task_names,
+      dry_run=dry_run,
   )
   if return_code != 0:
     xpk_print(
xpk/core/scheduling.py
CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 """

 from ..utils.console import xpk_print
+from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     )
     return True

+  if is_dry_run():
+    return True
+
   # Check for gke accelerator type:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
xpk/core/storage.py
CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast

 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type
-    self.auto_mount
-    self.mount_point
-    self.readonly
-    self.manifest
-    self.pvc
-    self.pv
-    self.bucket
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()

   def fields_as_list(self) -> list[str]:
     """
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]

@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
xpk/core/workload.py
CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import re
 from ..utils.console import xpk_exit, xpk_print
 from .commands import run_command_for_value
 from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
     xpk_print('Your workload did not complete successfully')
     return 125
   return 0
+
+
+GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
+"""Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
+
+
+def get_jobsets_list_gcp_link(project: str) -> str:
+  """Returns a link to Cloud Console JobSets list"""
+
+  return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
xpk/core/workload_decorators/rdma_decorator.py
CHANGED
@@ -18,7 +18,7 @@ import yaml
 from ...utils.yaml import literal_string


-def decorate_kjob_template(job_manifest) ->
+def decorate_kjob_template(job_manifest: dict) -> dict:
   spec = (
       job_manifest.setdefault('spec', {})
       .setdefault('template', {})
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
     add_tolerations(job_manifest)
     update_gpu_containers(job_manifest)

-
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_decorators/storage_decorator.py
CHANGED
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
     job_manifest = job['template']
     add_annotations(job_manifest, storages)
     add_volumes(job_manifest, storage_volumes)
-
+  yaml_result: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_result


 def add_annotations(job_manifest, storages):
xpk/core/workload_decorators/tcpx_decorator.py
CHANGED
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
   for job in manifest['spec']['replicatedJobs']:
     job_manifest = job['template']
     job_manifest = decorate_job(job_manifest)
-
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_annotation() -> dict:
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
   })
   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+  volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
   volumes.append(
       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
   )
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
   spec['initContainers'].append(tcpxo_daemon_container)


-def update_gpu_containers(job_manifest):
+def update_gpu_containers(job_manifest) -> None:
   for container in job_manifest['spec']['template']['spec']['containers']:
     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
       env: list = container.setdefault('env', [])