xpk 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +29 -30
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +35 -17
- xpk/commands/workload_test.py +70 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +10 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/docker_container.py +55 -30
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +4 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +46 -13
- xpk/core/nodepool_test.py +143 -8
- xpk/core/pathways.py +4 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +16 -13
- xpk/core/scheduling_test.py +15 -7
- xpk/core/system_characteristics.py +6 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -0
- xpk/parser/storage.py +3 -3
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/METADATA +37 -21
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/RECORD +48 -55
- xpk-1.1.1.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0.dist-info/top_level.txt +0 -2
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/WHEEL +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/entry_points.txt +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/nodepool_test.py
CHANGED

@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
     ensure_resource_policy_exists,
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
+    _validate_reservation_count,
 )
 from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 from xpk.core.commands import FailedCommand
@@ -103,6 +104,7 @@ def commands_tester(mocker):
   return CommandsTester(
       mocker,
       run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+      run_command_batch_path="xpk.core.commands.run_command_batch",
   )


@@ -119,7 +121,7 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p

   assert len(commands_tester.commands_history) == 1
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe resource-policy",
+      "gcloud beta compute resource-policies describe resource-policy",
       "--project=test-project",
       "--region=us-central1",
   )
@@ -129,7 +131,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )

   ensure_resource_policy_exists(
@@ -142,16 +144,17 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(

   assert len(commands_tester.commands_history) == 2
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe"
+      "gcloud beta compute resource-policies describe"
   )
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy resource-policy",
+      "gcloud beta compute resource-policies create workload-policy"
+      " resource-policy",
       "--project=test-project",
       "--region=us-central1",
       "--accelerator-topology=2x2x1",
   )
   commands_tester.assert_command_not_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )

@@ -160,7 +163,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )

   ensure_resource_policy_exists(
@@ -172,7 +175,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
   )

   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )

@@ -182,7 +185,7 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
 ):
   with pytest.raises(RuntimeError):
     commands_tester.set_result_for_command(
-        (1, ""), "gcloud compute resource-policies"
+        (1, ""), "gcloud beta compute resource-policies"
     )

     ensure_resource_policy_exists(
@@ -433,3 +436,135 @@ def test_display_nodepool_creation_ignores_logs_without_errors(
       mock_xpk_print.call_args_list[0].args[0]
       == "Create Nodepools returned ERROR 1"
   )
+
+
+def test_validate_reservation_count_mismatch(mock_xpk_print):
+  result = _validate_reservation_count(
+      ["res1", "res2"], num_node_pools_to_create=3
+  )
+
+  assert result == 1
+  assert mock_xpk_print.call_count == 1
+  assert (
+      "reservations (2) must match the number of NEW nodepools (3)"
+      in mock_xpk_print.call_args_list[0].args[0]
+  )
+
+
+def test_run_gke_node_pool_create_command_multiple_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=2,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, ""), "gcloud beta container node-pools list"
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-0", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res2"
+  )
+
+
+def test_run_gke_node_pool_create_command_partial_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.nodepool.get_node_pools_to_delete", return_value=[])
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=3,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, "test-cluster-np-0"), "gcloud beta container node-pools list"
+  )
+  commands_tester.set_result_for_command(
+      (0, "us-central1-a"),
+      "gcloud",
+      "node-pools describe",
+      '--format="value(locations)"',
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
+  )
xpk/core/pathways.py
CHANGED

@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:


 def get_user_workload_for_pathways(
-    args,
-    system: SystemCharacteristics,
+    args, system: SystemCharacteristics, parallel_containers: int
 ) -> str:
   """
   Create a user workload container for Pathways.
   Don't create one for Pathways headless mode.

-  Args:
-    args: user provided args.
-    system: system characteristics.
-
-
   Returns:
     str:
       Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
   if args.headless:
     return ''
   else:
-    container, _ = get_user_workload_container(args, system)
+    container, _ = get_user_workload_container(
+        args, system, parallel_containers
+    )
     return user_workload_yaml.format(
         args=args,
         container=container,

xpk/core/remote_state/fuse_remote_state.py
CHANGED

@@ -56,7 +56,7 @@ class FuseStateClient(RemoteStateClient):

   def upload_state(self) -> None:
     xpk_print(
-        f'Uploading
+        f'Uploading dependencies from directory {self.state_dir} to bucket:'
         f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
     )
     upload_directory_to_gcs(
xpk/core/scheduling.py
CHANGED

@@ -33,8 +33,11 @@ from .system_characteristics import (
 from packaging.version import Version

 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
-_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.
+_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
 _SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
+ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
+    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
+)


 class WorkloadScheduling(Enum):
@@ -85,6 +88,18 @@ def check_if_workload_can_schedule(
       return WorkloadScheduling.UNAVAILABLE
     return WorkloadScheduling.AVAILABLE

+  if cluster_system and _check_super_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ) and _check_super_slicing_topology(workload_system):
+      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
   if workload_system.device_type in resources_config_map:
     if _check_workload_size_fits(
         args,
@@ -109,18 +124,6 @@ def check_if_workload_can_schedule(
     else:
       return WorkloadScheduling.UNAVAILABLE

-  if cluster_system and _check_super_slicing_availability(
-      workload_system=workload_system, cluster_system=cluster_system
-  ):
-    if _check_workload_size_fits(
-        args,
-        workload_system,
-        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ) and _check_super_slicing_topology(workload_system):
-      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    else:
-      return WorkloadScheduling.UNAVAILABLE
-
   xpk_print(
       'Workload scheduling validation failed. XPK will not create the workload'
       f' {args.workload}.'
xpk/core/scheduling_test.py
CHANGED

@@ -208,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
     cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
     # 5 4x4x4 cubes:
     resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
-    kueue_version='0.
+    kueue_version='0.15.2',
     super_slicing_feature_enabled=True,
     super_slicing_topology_set=True,
     num_slices=1,
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         WorkloadScheduling.UNAVAILABLE,
     ),
     (
-
-
-
+        'Super-slicing, but one cube',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
        ),
+        WorkloadScheduling.SUPER_SLICING_AVAILABLE,
+    ),
+    (
+        'Super-slicing, but one cube and no super-slicing-topology',
        dataclasses.replace(
            SUPER_SLICING_CASE,
-            workload_system=_get_system_characteristics_or_die('tpu7x-
-            cluster_system=_get_system_characteristics_or_die('tpu7x-
-            resources_config_map={'tpu7x-
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
+            super_slicing_topology_set=False,
        ),
        WorkloadScheduling.AVAILABLE,
    ),

xpk/core/system_characteristics.py
CHANGED

@@ -131,6 +131,8 @@ class SystemCharacteristics:
     supports_super_slicing: Whether the Super-slicing feature is supported.
     requires_workload_policy: A boolean indicating if a GCE resource
       workload policy is required. This is automatically set to True for GPUs.
+    parallel_containers: The number of containers running on a single VM.
+
   """

   topology: str
@@ -146,6 +148,7 @@ class SystemCharacteristics:
   docker_platform: DockerPlatform
   requires_workload_policy: bool = False
   gpu_config: Optional[GpuConfig] = None
+  parallel_containers: int = 1

   def __post_init__(self):
     if self.accelerator_type == AcceleratorType.GPU:
@@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map(
     default_topologies: set[str] | None = None,
     sub_slicing_topologies: set[str] | None = None,
     super_slicing_topologies: set[str] | None = None,
+    parallel_containers: int = 1,
 ) -> dict[str, SystemCharacteristics]:
   system_characteristics_map = {}
   default_topologies = default_topologies or set()
@@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map(
         supports_super_slicing=topology in super_slicing_topologies,
         supports_accelerator_network_profile=supports_accelerator_network_profile,
         docker_platform=docker_platform,
+        parallel_containers=parallel_containers,
     )
     system_characteristics_map[f'{prefix}-{topology}'] = system
     if (
@@ -544,6 +549,7 @@ UserFacingNameToSystemCharacteristics = {
     tpu_type_requires_workload_policy=True,
     supports_accelerator_network_profile=False,
     docker_platform=AMD_PLATFORM,
+    parallel_containers=2,
     supported_topologies=generate_tpu_topologies(max_cubes=144),
     super_slicing_topologies=set(['4x4x4']),
     default_topologies=set([
xpk/core/telemetry.py
CHANGED

@@ -30,7 +30,7 @@ from dataclasses import dataclass
 from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
 from ..utils.execution_context import is_dry_run
 from ..utils.user_agent import get_user_agent
-from ..utils.feature_flags import FeatureFlags
+from ..utils.feature_flags import FeatureFlags, is_tester


 def should_send_telemetry():
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:


 class MetricsEventMetadataKey(Enum):
+  """Represents available metadata keys."""
+
   SESSION_ID = "XPK_SESSION_ID"
   DRY_RUN = "XPK_DRY_RUN"
   PYTHON_VERSION = "XPK_PYTHON_VERSION"
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
   RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
   RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
   LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
+  TESTER = "XPK_TESTER"


 @dataclass
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
       MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
          _is_running_from_source()
       ).lower(),
+      MetricsEventMetadataKey.TESTER: str(
+          is_tester() or _is_trash_execution()
+      ).lower(),
   }


@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
   }


+def _is_trash_execution() -> bool:
+  return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
+
+
 def _is_running_as_pip() -> bool:
   return os.path.basename(sys.argv[0]) == "xpk"
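The new `XPK_TESTER` metadata value is true when either `is_tester()` (from `xpk.utils.feature_flags`) reports a tester or the `TELEMETRY_TRASH_EXECUTION` environment variable holds the literal string "true". A self-contained sketch of that gate (the `is_tester` stub here is a stand-in for the real feature flag):

```python
import os


def is_tester() -> bool:
  """Stand-in for xpk.utils.feature_flags.is_tester."""
  return False


def _is_trash_execution() -> bool:
  # Only the exact string "true" counts; unset, "" or "1" all stay false.
  return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"


# Mirrors the value stored under MetricsEventMetadataKey.TESTER.
tester_value = str(is_tester() or _is_trash_execution()).lower()
assert tester_value in ("true", "false")
```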
xpk/core/telemetry_test.py
CHANGED

@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
   mocker.patch('time.time', side_effect=itertools.count())
   mocker.patch('platform.python_version', return_value='99.99.99')
   mocker.patch('os.path.basename', return_value='xpk.py')
+  mocker.patch('os.getenv', return_value='false')
   mocker.patch('os.path.abspath', return_value='/home/xpk_user')
+  mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
   set_dry_run(False)
   get_config().set(CLIENT_ID_KEY, 'client_id')
   yield
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_COMMAND', 'value': 'test'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_EXIT_CODE', 'value': '2'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
   assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected


+@pytest.mark.parametrize(
+    argnames='tester,expected',
+    argvalues=[
+        (True, 'true'),
+        (False, 'false'),
+    ],
+)
+def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
+    tester: bool, expected: str, mocker: MockerFixture
+):
+  mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_TESTER') == expected
+
+
+@pytest.mark.parametrize(
+    argnames='trash_execution,expected',
+    argvalues=[
+        ('true', 'true'),
+        ('false', 'false'),
+        ('', 'false'),
+        (None, 'false'),
+    ],
+)
+def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
+    trash_execution: str, expected: str, mocker: MockerFixture
+):
+  mocker.patch('os.getenv', return_value=trash_execution)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_TESTER') == expected
+
+
 def _get_metadata_value(payload_str: str, key: str) -> str | None:
   payload = json.loads(payload_str)
   metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
xpk/core/testing/commands_tester.py
CHANGED

@@ -17,6 +17,8 @@ limitations under the License.
 import re
 from pytest_mock import MockerFixture

+from ..commands import FailedCommand
+

 class CommandsTester:
   """Tester class useful for mocking and asserting command runs."""
@@ -27,6 +29,7 @@ class CommandsTester:
       run_command_for_value_path: str | None = None,
       run_command_with_updates_path: str | None = None,
       run_command_with_updates_retry_path: str | None = None,
+      run_command_batch_path: str | None = None,
   ):
     self.__results: dict[re.Pattern, tuple[int, str]] = {}
     self.commands_history: list[str] = []
@@ -45,6 +48,11 @@ class CommandsTester:
           run_command_with_updates_retry_path,
           wraps=self.__fake_run_command_with_updates_retry,
       )
+    if run_command_batch_path:
+      mocker.patch(
+          run_command_batch_path,
+          wraps=self.__fake_run_command_batch,
+      )

   def set_result_for_command(
       self, result: tuple[int, str], *command_parts: str
@@ -111,6 +119,24 @@ class CommandsTester:
   ) -> tuple[int, str]:
     return self.__common_fake_run_command(command, (0, dry_run_return_val))

+  def __fake_run_command_batch(
+      self,
+      commands: list[str],
+      jobname: str,
+      per_command_name: list[str],
+      output_logs: list[str],
+  ) -> FailedCommand | None:
+    for i, command in enumerate(commands):
+      result = self.__common_fake_run_command(command, (0, ""))[0]
+      if result != 0:
+        return FailedCommand(
+            return_code=result,
+            name=per_command_name[i],
+            command=command,
+            logfile=output_logs[i],
+        )
+    return None
+
   # pylint: enable=unused-argument

   def __common_fake_run_command(
xpk/core/testing/commands_tester_test.py
CHANGED

@@ -17,7 +17,7 @@ limitations under the License.
 import pytest
 from pytest_mock import MockerFixture

-from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
+from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
 from xpk.core.testing.commands_tester import CommandsTester

@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
       run_command_with_updates_retry_path=(
           "xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
       ),
+      run_command_batch_path=(
+          "xpk.core.testing.commands_tester_test.run_command_batch"
+      ),
   )

@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
   mock_commands.assert_command_run("cmd", "bar")


+def test_run_command_batch_default_result(
+    mock_commands: CommandsTester,
+):
+  result = run_command_batch(
+      commands=["cmd1 foo bar", "cmd2 foo bar"],
+      jobname="Test command",
+      per_command_name=["cmd1", "cmd2"],
+      output_logs=["log1", "log2"],
+  )
+
+  assert result is None
+  mock_commands.assert_command_run("foo bar", times=2)
+  mock_commands.assert_command_run("cmd1")
+  mock_commands.assert_command_run("cmd2")
+
+
 def test_set_result_for_command(mock_commands: CommandsTester):
   mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")

xpk/core/workload_decorators/rdma_decorator.py
CHANGED

@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
   volumes.append(
       {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
   )
+  volumes.append({
+      'name': 'dshm',
+      'emptyDir': {
+          'medium': 'Memory',
+      },
+  })


 def add_tolerations(job_manifest):
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
     container['volumeMounts'].append(
         {'name': 'gib', 'mountPath': '/usr/local/gib'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
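Design note: a container's default `/dev/shm` is only 64 MiB, which is too small for NCCL's shared-memory transport on RDMA/GPU nodes; mounting a Memory-medium `emptyDir` at `/dev/shm` is the usual Kubernetes workaround. A toy manifest showing what the two new appends produce (not xpk's full manifest schema):

```python
# Toy job manifest; only the keys the decorator touches are shown.
job_manifest = {
    'spec': {
        'template': {
            'spec': {
                'volumes': [],
                'containers': [{'name': 'gpu', 'volumeMounts': []}],
            }
        }
    }
}

pod_spec = job_manifest['spec']['template']['spec']
# Memory-backed emptyDir replaces the 64 MiB default /dev/shm.
pod_spec['volumes'].append({'name': 'dshm', 'emptyDir': {'medium': 'Memory'}})
for container in pod_spec['containers']:
  container['volumeMounts'].append({'name': 'dshm', 'mountPath': '/dev/shm'})

assert pod_spec['containers'][0]['volumeMounts'][0]['mountPath'] == '/dev/shm'
```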
xpk/parser/cluster.py
CHANGED

@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
   add_resource_limits(cluster_create_resource_limits)

   cluster_create_ray_parser.set_defaults(
-      func=cluster_create_ray_cluster,
+      func=cluster_create_ray_cluster,
+      sub_slicing=False,
+      super_slicing=False,
+      num_cubes=None,
   )


@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
   )
   add_driver_arguments(cluster_adapt_optional_arguments)
   add_shared_arguments(cluster_adapt_optional_arguments)
+  add_resource_limits(cluster_adapt_optional_arguments)
+
+  if FeatureFlags.SUB_SLICING_ENABLED:
+    add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
+
+  if FeatureFlags.SUPER_SLICING_ENABLED:
+    add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)

   cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
       'Capacity Arguments', 'Arguments related to capacity for cluster create.'