xpk 0.17.3__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +33 -43
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +32 -11
- xpk/commands/workload_test.py +71 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +11 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/config.py +5 -2
- xpk/core/docker_container.py +31 -24
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +6 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +52 -13
- xpk/core/nodepool_test.py +147 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +32 -4
- xpk/core/scheduling_test.py +39 -2
- xpk/core/system_characteristics.py +44 -0
- xpk/core/system_characteristics_test.py +11 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -17
- xpk/parser/core.py +0 -8
- xpk/parser/storage.py +3 -14
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +8 -4
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/METADATA +50 -23
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/RECORD +51 -60
- xpk-1.1.0.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/commands/kind.py +0 -265
- xpk/parser/kind.py +0 -95
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-0.17.3.dist-info/top_level.txt +0 -2
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -131,6 +131,8 @@ class SystemCharacteristics:
|
|
|
131
131
|
supports_super_slicing: Whether the Super-slicing feature is supported.
|
|
132
132
|
requires_workload_policy: A boolean indicating if a GCE resource
|
|
133
133
|
workload policy is required. This is automatically set to True for GPUs.
|
|
134
|
+
parallel_containers: The number of containers running on a single VM.
|
|
135
|
+
|
|
134
136
|
"""
|
|
135
137
|
|
|
136
138
|
topology: str
|
|
@@ -142,9 +144,11 @@ class SystemCharacteristics:
|
|
|
142
144
|
device_type: str
|
|
143
145
|
supports_sub_slicing: bool
|
|
144
146
|
supports_super_slicing: bool
|
|
147
|
+
supports_accelerator_network_profile: bool
|
|
145
148
|
docker_platform: DockerPlatform
|
|
146
149
|
requires_workload_policy: bool = False
|
|
147
150
|
gpu_config: Optional[GpuConfig] = None
|
|
151
|
+
parallel_containers: int = 1
|
|
148
152
|
|
|
149
153
|
def __post_init__(self):
|
|
150
154
|
if self.accelerator_type == AcceleratorType.GPU:
|
|
@@ -233,10 +237,12 @@ def get_tpu_system_characteristics_map(
|
|
|
233
237
|
machine_type: str,
|
|
234
238
|
supported_topologies: list[str],
|
|
235
239
|
docker_platform: DockerPlatform,
|
|
240
|
+
supports_accelerator_network_profile: bool,
|
|
236
241
|
tpu_type_requires_workload_policy: bool = False,
|
|
237
242
|
default_topologies: set[str] | None = None,
|
|
238
243
|
sub_slicing_topologies: set[str] | None = None,
|
|
239
244
|
super_slicing_topologies: set[str] | None = None,
|
|
245
|
+
parallel_containers: int = 1,
|
|
240
246
|
) -> dict[str, SystemCharacteristics]:
|
|
241
247
|
system_characteristics_map = {}
|
|
242
248
|
default_topologies = default_topologies or set()
|
|
@@ -259,7 +265,9 @@ def get_tpu_system_characteristics_map(
|
|
|
259
265
|
and vms_per_slice > 1,
|
|
260
266
|
supports_sub_slicing=topology in sub_slicing_topologies,
|
|
261
267
|
supports_super_slicing=topology in super_slicing_topologies,
|
|
268
|
+
supports_accelerator_network_profile=supports_accelerator_network_profile,
|
|
262
269
|
docker_platform=docker_platform,
|
|
270
|
+
parallel_containers=parallel_containers,
|
|
263
271
|
)
|
|
264
272
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
265
273
|
if (
|
|
@@ -303,6 +311,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
303
311
|
device_type='l4-1',
|
|
304
312
|
supports_sub_slicing=False,
|
|
305
313
|
supports_super_slicing=False,
|
|
314
|
+
supports_accelerator_network_profile=False,
|
|
306
315
|
gpu_config=GpuConfig(requires_topology=False),
|
|
307
316
|
docker_platform=AMD_PLATFORM,
|
|
308
317
|
),
|
|
@@ -316,6 +325,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
316
325
|
device_type='l4-2',
|
|
317
326
|
supports_sub_slicing=False,
|
|
318
327
|
supports_super_slicing=False,
|
|
328
|
+
supports_accelerator_network_profile=False,
|
|
319
329
|
gpu_config=GpuConfig(requires_topology=False),
|
|
320
330
|
docker_platform=AMD_PLATFORM,
|
|
321
331
|
),
|
|
@@ -329,6 +339,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
329
339
|
device_type='l4-4',
|
|
330
340
|
supports_sub_slicing=False,
|
|
331
341
|
supports_super_slicing=False,
|
|
342
|
+
supports_accelerator_network_profile=False,
|
|
332
343
|
gpu_config=GpuConfig(requires_topology=False),
|
|
333
344
|
docker_platform=AMD_PLATFORM,
|
|
334
345
|
),
|
|
@@ -342,6 +353,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
342
353
|
device_type='l4-8',
|
|
343
354
|
supports_sub_slicing=False,
|
|
344
355
|
supports_super_slicing=False,
|
|
356
|
+
supports_accelerator_network_profile=False,
|
|
345
357
|
gpu_config=GpuConfig(requires_topology=False),
|
|
346
358
|
docker_platform=AMD_PLATFORM,
|
|
347
359
|
),
|
|
@@ -356,6 +368,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
356
368
|
device_type='a100-40gb-1',
|
|
357
369
|
supports_sub_slicing=False,
|
|
358
370
|
supports_super_slicing=False,
|
|
371
|
+
supports_accelerator_network_profile=False,
|
|
359
372
|
gpu_config=GpuConfig(requires_topology=False),
|
|
360
373
|
docker_platform=AMD_PLATFORM,
|
|
361
374
|
),
|
|
@@ -369,6 +382,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
369
382
|
device_type='a100-40gb-2',
|
|
370
383
|
supports_sub_slicing=False,
|
|
371
384
|
supports_super_slicing=False,
|
|
385
|
+
supports_accelerator_network_profile=False,
|
|
372
386
|
gpu_config=GpuConfig(requires_topology=False),
|
|
373
387
|
docker_platform=AMD_PLATFORM,
|
|
374
388
|
),
|
|
@@ -382,6 +396,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
382
396
|
device_type='a100-40gb-4',
|
|
383
397
|
supports_sub_slicing=False,
|
|
384
398
|
supports_super_slicing=False,
|
|
399
|
+
supports_accelerator_network_profile=False,
|
|
385
400
|
gpu_config=GpuConfig(requires_topology=False),
|
|
386
401
|
docker_platform=AMD_PLATFORM,
|
|
387
402
|
),
|
|
@@ -395,6 +410,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
395
410
|
device_type='a100-40gb-8',
|
|
396
411
|
supports_sub_slicing=False,
|
|
397
412
|
supports_super_slicing=False,
|
|
413
|
+
supports_accelerator_network_profile=False,
|
|
398
414
|
gpu_config=GpuConfig(requires_topology=False),
|
|
399
415
|
docker_platform=AMD_PLATFORM,
|
|
400
416
|
),
|
|
@@ -408,6 +424,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
408
424
|
device_type='gb200-4',
|
|
409
425
|
supports_sub_slicing=False,
|
|
410
426
|
supports_super_slicing=False,
|
|
427
|
+
supports_accelerator_network_profile=True,
|
|
411
428
|
gpu_config=GpuConfig(
|
|
412
429
|
requires_topology=True,
|
|
413
430
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
@@ -426,6 +443,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
426
443
|
device_type='gb200-4',
|
|
427
444
|
supports_sub_slicing=False,
|
|
428
445
|
supports_super_slicing=False,
|
|
446
|
+
supports_accelerator_network_profile=True,
|
|
429
447
|
gpu_config=GpuConfig(
|
|
430
448
|
requires_topology=True,
|
|
431
449
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
@@ -444,6 +462,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
444
462
|
device_type='b200-8',
|
|
445
463
|
supports_sub_slicing=False,
|
|
446
464
|
supports_super_slicing=False,
|
|
465
|
+
supports_accelerator_network_profile=True,
|
|
447
466
|
gpu_config=GpuConfig(
|
|
448
467
|
requires_topology=True,
|
|
449
468
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
@@ -462,6 +481,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
462
481
|
device_type='h200-141gb-8',
|
|
463
482
|
supports_sub_slicing=False,
|
|
464
483
|
supports_super_slicing=False,
|
|
484
|
+
supports_accelerator_network_profile=True,
|
|
465
485
|
gpu_config=GpuConfig(
|
|
466
486
|
requires_topology=True,
|
|
467
487
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
@@ -481,6 +501,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
481
501
|
device_type='h100-80gb-8',
|
|
482
502
|
supports_sub_slicing=False,
|
|
483
503
|
supports_super_slicing=False,
|
|
504
|
+
supports_accelerator_network_profile=True,
|
|
484
505
|
gpu_config=GpuConfig(
|
|
485
506
|
requires_topology=True,
|
|
486
507
|
nccl_installer=INSTALLER_NCCL_TCPX,
|
|
@@ -500,6 +521,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
500
521
|
device_type='h100-mega-80gb-8',
|
|
501
522
|
supports_sub_slicing=False,
|
|
502
523
|
supports_super_slicing=False,
|
|
524
|
+
supports_accelerator_network_profile=True,
|
|
503
525
|
gpu_config=GpuConfig(
|
|
504
526
|
requires_topology=True,
|
|
505
527
|
nccl_installer=INSTALLER_NCCL_TCPXO,
|
|
@@ -516,6 +538,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
516
538
|
machine_type='tpu7x-standard-1t',
|
|
517
539
|
supported_topologies=['1x1x1'],
|
|
518
540
|
tpu_type_requires_workload_policy=True,
|
|
541
|
+
supports_accelerator_network_profile=False,
|
|
519
542
|
docker_platform=AMD_PLATFORM,
|
|
520
543
|
),
|
|
521
544
|
**get_tpu_system_characteristics_map(
|
|
@@ -524,7 +547,9 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
524
547
|
gke_accelerator='tpu7x',
|
|
525
548
|
machine_type='tpu7x-standard-4t',
|
|
526
549
|
tpu_type_requires_workload_policy=True,
|
|
550
|
+
supports_accelerator_network_profile=False,
|
|
527
551
|
docker_platform=AMD_PLATFORM,
|
|
552
|
+
parallel_containers=2,
|
|
528
553
|
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
529
554
|
super_slicing_topologies=set(['4x4x4']),
|
|
530
555
|
default_topologies=set([
|
|
@@ -635,6 +660,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
635
660
|
machine_type='ct6e-standard-1t',
|
|
636
661
|
supported_topologies=['1x1'],
|
|
637
662
|
docker_platform=AMD_PLATFORM,
|
|
663
|
+
supports_accelerator_network_profile=True,
|
|
638
664
|
),
|
|
639
665
|
**get_tpu_system_characteristics_map(
|
|
640
666
|
prefix='v6e',
|
|
@@ -644,6 +670,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
644
670
|
supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
|
|
645
671
|
sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
|
|
646
672
|
docker_platform=AMD_PLATFORM,
|
|
673
|
+
supports_accelerator_network_profile=True,
|
|
647
674
|
),
|
|
648
675
|
**get_tpu_system_characteristics_map(
|
|
649
676
|
prefix='v5p',
|
|
@@ -652,6 +679,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
652
679
|
machine_type='ct5p-hightpu-4t',
|
|
653
680
|
docker_platform=AMD_PLATFORM,
|
|
654
681
|
supported_topologies=generate_tpu_topologies(max_cubes=140),
|
|
682
|
+
supports_accelerator_network_profile=False,
|
|
655
683
|
default_topologies=set([
|
|
656
684
|
'2x2x1',
|
|
657
685
|
'2x2x2',
|
|
@@ -758,6 +786,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
758
786
|
machine_type='ct5lp-hightpu-4t',
|
|
759
787
|
docker_platform=AMD_PLATFORM,
|
|
760
788
|
supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
|
|
789
|
+
supports_accelerator_network_profile=False,
|
|
761
790
|
),
|
|
762
791
|
**get_tpu_system_characteristics_map(
|
|
763
792
|
prefix='v4',
|
|
@@ -768,6 +797,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
768
797
|
supported_topologies=generate_tpu_topologies(
|
|
769
798
|
max_cubes=64, enforce_nondecreasing=False
|
|
770
799
|
),
|
|
800
|
+
supports_accelerator_network_profile=False,
|
|
771
801
|
default_topologies=set([
|
|
772
802
|
'2x2x1',
|
|
773
803
|
'2x2x2',
|
|
@@ -796,6 +826,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
796
826
|
device_type='m1-megamem-96-1',
|
|
797
827
|
supports_sub_slicing=False,
|
|
798
828
|
supports_super_slicing=False,
|
|
829
|
+
supports_accelerator_network_profile=False,
|
|
799
830
|
docker_platform=AMD_PLATFORM,
|
|
800
831
|
),
|
|
801
832
|
# n2-standard-#vCPUs-#VMs
|
|
@@ -809,6 +840,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
809
840
|
device_type='n2-standard-64-1',
|
|
810
841
|
supports_sub_slicing=False,
|
|
811
842
|
supports_super_slicing=False,
|
|
843
|
+
supports_accelerator_network_profile=False,
|
|
812
844
|
docker_platform=AMD_PLATFORM,
|
|
813
845
|
),
|
|
814
846
|
'n2-standard-32-1': SystemCharacteristics(
|
|
@@ -821,6 +853,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
821
853
|
device_type='n2-standard-32-1',
|
|
822
854
|
supports_sub_slicing=False,
|
|
823
855
|
supports_super_slicing=False,
|
|
856
|
+
supports_accelerator_network_profile=False,
|
|
824
857
|
docker_platform=AMD_PLATFORM,
|
|
825
858
|
),
|
|
826
859
|
'n2-standard-32-2': SystemCharacteristics(
|
|
@@ -833,6 +866,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
833
866
|
device_type='n2-standard-32-2',
|
|
834
867
|
supports_sub_slicing=False,
|
|
835
868
|
supports_super_slicing=False,
|
|
869
|
+
supports_accelerator_network_profile=False,
|
|
836
870
|
docker_platform=AMD_PLATFORM,
|
|
837
871
|
),
|
|
838
872
|
'n2-standard-32-4': SystemCharacteristics(
|
|
@@ -845,6 +879,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
845
879
|
device_type='n2-standard-32-4',
|
|
846
880
|
supports_sub_slicing=False,
|
|
847
881
|
supports_super_slicing=False,
|
|
882
|
+
supports_accelerator_network_profile=False,
|
|
848
883
|
docker_platform=AMD_PLATFORM,
|
|
849
884
|
),
|
|
850
885
|
'n2-standard-32-8': SystemCharacteristics(
|
|
@@ -857,6 +892,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
857
892
|
device_type='n2-standard-32-8',
|
|
858
893
|
supports_sub_slicing=False,
|
|
859
894
|
supports_super_slicing=False,
|
|
895
|
+
supports_accelerator_network_profile=False,
|
|
860
896
|
docker_platform=AMD_PLATFORM,
|
|
861
897
|
),
|
|
862
898
|
'n2-standard-32-16': SystemCharacteristics(
|
|
@@ -869,6 +905,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
869
905
|
device_type='n2-standard-32-16',
|
|
870
906
|
supports_sub_slicing=False,
|
|
871
907
|
supports_super_slicing=False,
|
|
908
|
+
supports_accelerator_network_profile=False,
|
|
872
909
|
docker_platform=AMD_PLATFORM,
|
|
873
910
|
),
|
|
874
911
|
'n2-standard-32-32': SystemCharacteristics(
|
|
@@ -881,6 +918,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
881
918
|
device_type='n2-standard-32-32',
|
|
882
919
|
supports_sub_slicing=False,
|
|
883
920
|
supports_super_slicing=False,
|
|
921
|
+
supports_accelerator_network_profile=False,
|
|
884
922
|
docker_platform=AMD_PLATFORM,
|
|
885
923
|
),
|
|
886
924
|
'n2-standard-32-64': SystemCharacteristics(
|
|
@@ -893,6 +931,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
893
931
|
device_type='n2-standard-32-64',
|
|
894
932
|
supports_sub_slicing=False,
|
|
895
933
|
supports_super_slicing=False,
|
|
934
|
+
supports_accelerator_network_profile=False,
|
|
896
935
|
docker_platform=AMD_PLATFORM,
|
|
897
936
|
),
|
|
898
937
|
'n2-standard-32-128': SystemCharacteristics(
|
|
@@ -905,6 +944,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
905
944
|
device_type='n2-standard-32-128',
|
|
906
945
|
supports_sub_slicing=False,
|
|
907
946
|
supports_super_slicing=False,
|
|
947
|
+
supports_accelerator_network_profile=False,
|
|
908
948
|
docker_platform=AMD_PLATFORM,
|
|
909
949
|
),
|
|
910
950
|
'n2-standard-32-256': SystemCharacteristics(
|
|
@@ -917,6 +957,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
917
957
|
device_type='n2-standard-32-256',
|
|
918
958
|
supports_sub_slicing=False,
|
|
919
959
|
supports_super_slicing=False,
|
|
960
|
+
supports_accelerator_network_profile=False,
|
|
920
961
|
docker_platform=AMD_PLATFORM,
|
|
921
962
|
),
|
|
922
963
|
'n2-standard-32-512': SystemCharacteristics(
|
|
@@ -929,6 +970,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
929
970
|
device_type='n2-standard-32-512',
|
|
930
971
|
supports_sub_slicing=False,
|
|
931
972
|
supports_super_slicing=False,
|
|
973
|
+
supports_accelerator_network_profile=False,
|
|
932
974
|
docker_platform=AMD_PLATFORM,
|
|
933
975
|
),
|
|
934
976
|
'n2-standard-32-1024': SystemCharacteristics(
|
|
@@ -941,6 +983,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
941
983
|
device_type='n2-standard-32-1024',
|
|
942
984
|
supports_sub_slicing=False,
|
|
943
985
|
supports_super_slicing=False,
|
|
986
|
+
supports_accelerator_network_profile=False,
|
|
944
987
|
docker_platform=AMD_PLATFORM,
|
|
945
988
|
),
|
|
946
989
|
'n2-standard-32-2048': SystemCharacteristics(
|
|
@@ -953,6 +996,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
953
996
|
device_type='n2-standard-32-2048',
|
|
954
997
|
supports_sub_slicing=False,
|
|
955
998
|
supports_super_slicing=False,
|
|
999
|
+
supports_accelerator_network_profile=False,
|
|
956
1000
|
docker_platform=AMD_PLATFORM,
|
|
957
1001
|
),
|
|
958
1002
|
}
|
|
@@ -34,6 +34,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
34
34
|
supported_topologies=["1x1"],
|
|
35
35
|
docker_platform=DockerPlatform.AMD,
|
|
36
36
|
tpu_type_requires_workload_policy=False,
|
|
37
|
+
supports_accelerator_network_profile=False,
|
|
37
38
|
)
|
|
38
39
|
|
|
39
40
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -48,6 +49,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
48
49
|
supports_super_slicing=False,
|
|
49
50
|
docker_platform=DockerPlatform.AMD,
|
|
50
51
|
requires_workload_policy=False,
|
|
52
|
+
supports_accelerator_network_profile=False,
|
|
51
53
|
)
|
|
52
54
|
assert result == {
|
|
53
55
|
"test-1": expected_system_characteristics,
|
|
@@ -62,6 +64,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
62
64
|
gke_accelerator="test",
|
|
63
65
|
machine_type="test",
|
|
64
66
|
supported_topologies=["2x2"],
|
|
67
|
+
supports_accelerator_network_profile=False,
|
|
65
68
|
docker_platform=DockerPlatform.AMD,
|
|
66
69
|
tpu_type_requires_workload_policy=True,
|
|
67
70
|
)
|
|
@@ -76,6 +79,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
76
79
|
device_type="test-8",
|
|
77
80
|
supports_sub_slicing=False,
|
|
78
81
|
supports_super_slicing=False,
|
|
82
|
+
supports_accelerator_network_profile=False,
|
|
79
83
|
docker_platform=DockerPlatform.AMD,
|
|
80
84
|
requires_workload_policy=False,
|
|
81
85
|
)
|
|
@@ -94,6 +98,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
94
98
|
supported_topologies=["2x2x2"],
|
|
95
99
|
docker_platform=DockerPlatform.AMD,
|
|
96
100
|
tpu_type_requires_workload_policy=True,
|
|
101
|
+
supports_accelerator_network_profile=False,
|
|
97
102
|
)
|
|
98
103
|
|
|
99
104
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -106,6 +111,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
106
111
|
device_type="test-16",
|
|
107
112
|
supports_sub_slicing=False,
|
|
108
113
|
supports_super_slicing=False,
|
|
114
|
+
supports_accelerator_network_profile=False,
|
|
109
115
|
docker_platform=DockerPlatform.AMD,
|
|
110
116
|
requires_workload_policy=True,
|
|
111
117
|
)
|
|
@@ -122,6 +128,7 @@ def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
|
|
|
122
128
|
gke_accelerator="test",
|
|
123
129
|
machine_type="test",
|
|
124
130
|
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
131
|
+
supports_accelerator_network_profile=False,
|
|
125
132
|
docker_platform=DockerPlatform.AMD,
|
|
126
133
|
sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
127
134
|
)
|
|
@@ -138,6 +145,7 @@ def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
|
|
|
138
145
|
gke_accelerator="test",
|
|
139
146
|
machine_type="test",
|
|
140
147
|
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
148
|
+
supports_accelerator_network_profile=False,
|
|
141
149
|
docker_platform=DockerPlatform.AMD,
|
|
142
150
|
super_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
143
151
|
)
|
|
@@ -154,6 +162,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
|
154
162
|
gke_accelerator="test",
|
|
155
163
|
machine_type="test",
|
|
156
164
|
supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
|
|
165
|
+
supports_accelerator_network_profile=False,
|
|
157
166
|
docker_platform=DockerPlatform.AMD,
|
|
158
167
|
default_topologies=set(["4x8x16"]),
|
|
159
168
|
)
|
|
@@ -206,6 +215,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
|
|
|
206
215
|
device_type="l4-1",
|
|
207
216
|
supports_sub_slicing=False,
|
|
208
217
|
supports_super_slicing=False,
|
|
218
|
+
supports_accelerator_network_profile=False,
|
|
209
219
|
docker_platform=DockerPlatform.AMD,
|
|
210
220
|
gpu_config=GpuConfig(requires_topology=False),
|
|
211
221
|
)
|
|
@@ -225,5 +235,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
|
|
|
225
235
|
device_type="l4-1",
|
|
226
236
|
supports_sub_slicing=False,
|
|
227
237
|
supports_super_slicing=False,
|
|
238
|
+
supports_accelerator_network_profile=False,
|
|
228
239
|
docker_platform=DockerPlatform.AMD,
|
|
229
240
|
)
|
xpk/core/telemetry.py
CHANGED
|
@@ -30,7 +30,7 @@ from dataclasses import dataclass
|
|
|
30
30
|
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
|
|
31
31
|
from ..utils.execution_context import is_dry_run
|
|
32
32
|
from ..utils.user_agent import get_user_agent
|
|
33
|
-
from ..utils.feature_flags import FeatureFlags
|
|
33
|
+
from ..utils.feature_flags import FeatureFlags, is_tester
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def should_send_telemetry():
|
|
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:
|
|
|
114
114
|
|
|
115
115
|
|
|
116
116
|
class MetricsEventMetadataKey(Enum):
|
|
117
|
+
"""Represents available metadata keys."""
|
|
118
|
+
|
|
117
119
|
SESSION_ID = "XPK_SESSION_ID"
|
|
118
120
|
DRY_RUN = "XPK_DRY_RUN"
|
|
119
121
|
PYTHON_VERSION = "XPK_PYTHON_VERSION"
|
|
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
|
|
|
125
127
|
RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
|
|
126
128
|
RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
|
|
127
129
|
LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
|
|
130
|
+
TESTER = "XPK_TESTER"
|
|
128
131
|
|
|
129
132
|
|
|
130
133
|
@dataclass
|
|
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
|
|
|
230
233
|
MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
|
|
231
234
|
_is_running_from_source()
|
|
232
235
|
).lower(),
|
|
236
|
+
MetricsEventMetadataKey.TESTER: str(
|
|
237
|
+
is_tester() or _is_trash_execution()
|
|
238
|
+
).lower(),
|
|
233
239
|
}
|
|
234
240
|
|
|
235
241
|
|
|
@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
|
|
|
241
247
|
}
|
|
242
248
|
|
|
243
249
|
|
|
250
|
+
def _is_trash_execution() -> bool:
|
|
251
|
+
return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
|
|
252
|
+
|
|
253
|
+
|
|
244
254
|
def _is_running_as_pip() -> bool:
|
|
245
255
|
return os.path.basename(sys.argv[0]) == "xpk"
|
|
246
256
|
|
xpk/core/telemetry_test.py
CHANGED
|
@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
|
|
|
30
30
|
mocker.patch('time.time', side_effect=itertools.count())
|
|
31
31
|
mocker.patch('platform.python_version', return_value='99.99.99')
|
|
32
32
|
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
33
|
+
mocker.patch('os.getenv', return_value='false')
|
|
33
34
|
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
35
|
+
mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
|
|
34
36
|
set_dry_run(False)
|
|
35
37
|
get_config().set(CLIENT_ID_KEY, 'client_id')
|
|
36
38
|
yield
|
|
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
|
|
|
76
78
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
77
79
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
78
80
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
81
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
79
82
|
{'key': 'XPK_COMMAND', 'value': 'test'},
|
|
80
83
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
81
84
|
],
|
|
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
|
|
|
107
110
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
108
111
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
109
112
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
113
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
110
114
|
{'key': 'XPK_EXIT_CODE', 'value': '2'},
|
|
111
115
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
112
116
|
],
|
|
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
|
|
|
131
135
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
132
136
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
133
137
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
138
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
134
139
|
{'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
|
|
135
140
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
136
141
|
],
|
|
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
|
|
|
219
224
|
assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
|
|
220
225
|
|
|
221
226
|
|
|
227
|
+
@pytest.mark.parametrize(
|
|
228
|
+
argnames='tester,expected',
|
|
229
|
+
argvalues=[
|
|
230
|
+
(True, 'true'),
|
|
231
|
+
(False, 'false'),
|
|
232
|
+
],
|
|
233
|
+
)
|
|
234
|
+
def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
|
|
235
|
+
tester: bool, expected: str, mocker: MockerFixture
|
|
236
|
+
):
|
|
237
|
+
mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
|
|
238
|
+
MetricsCollector.log_start(command='test')
|
|
239
|
+
payload = MetricsCollector.flush()
|
|
240
|
+
assert _get_metadata_value(payload, 'XPK_TESTER') == expected
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@pytest.mark.parametrize(
|
|
244
|
+
argnames='trash_execution,expected',
|
|
245
|
+
argvalues=[
|
|
246
|
+
('true', 'true'),
|
|
247
|
+
('false', 'false'),
|
|
248
|
+
('', 'false'),
|
|
249
|
+
(None, 'false'),
|
|
250
|
+
],
|
|
251
|
+
)
|
|
252
|
+
def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
|
|
253
|
+
trash_execution: str, expected: str, mocker: MockerFixture
|
|
254
|
+
):
|
|
255
|
+
mocker.patch('os.getenv', return_value=trash_execution)
|
|
256
|
+
MetricsCollector.log_start(command='test')
|
|
257
|
+
payload = MetricsCollector.flush()
|
|
258
|
+
assert _get_metadata_value(payload, 'XPK_TESTER') == expected
|
|
259
|
+
|
|
260
|
+
|
|
222
261
|
def _get_metadata_value(payload_str: str, key: str) -> str | None:
|
|
223
262
|
payload = json.loads(payload_str)
|
|
224
263
|
metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
|
|
@@ -17,6 +17,8 @@ limitations under the License.
|
|
|
17
17
|
import re
|
|
18
18
|
from pytest_mock import MockerFixture
|
|
19
19
|
|
|
20
|
+
from ..commands import FailedCommand
|
|
21
|
+
|
|
20
22
|
|
|
21
23
|
class CommandsTester:
|
|
22
24
|
"""Tester class useful for mocking and asserting command runs."""
|
|
@@ -27,6 +29,7 @@ class CommandsTester:
|
|
|
27
29
|
run_command_for_value_path: str | None = None,
|
|
28
30
|
run_command_with_updates_path: str | None = None,
|
|
29
31
|
run_command_with_updates_retry_path: str | None = None,
|
|
32
|
+
run_command_batch_path: str | None = None,
|
|
30
33
|
):
|
|
31
34
|
self.__results: dict[re.Pattern, tuple[int, str]] = {}
|
|
32
35
|
self.commands_history: list[str] = []
|
|
@@ -45,6 +48,11 @@ class CommandsTester:
|
|
|
45
48
|
run_command_with_updates_retry_path,
|
|
46
49
|
wraps=self.__fake_run_command_with_updates_retry,
|
|
47
50
|
)
|
|
51
|
+
if run_command_batch_path:
|
|
52
|
+
mocker.patch(
|
|
53
|
+
run_command_batch_path,
|
|
54
|
+
wraps=self.__fake_run_command_batch,
|
|
55
|
+
)
|
|
48
56
|
|
|
49
57
|
def set_result_for_command(
|
|
50
58
|
self, result: tuple[int, str], *command_parts: str
|
|
@@ -111,6 +119,24 @@ class CommandsTester:
|
|
|
111
119
|
) -> tuple[int, str]:
|
|
112
120
|
return self.__common_fake_run_command(command, (0, dry_run_return_val))
|
|
113
121
|
|
|
122
|
+
def __fake_run_command_batch(
|
|
123
|
+
self,
|
|
124
|
+
commands: list[str],
|
|
125
|
+
jobname: str,
|
|
126
|
+
per_command_name: list[str],
|
|
127
|
+
output_logs: list[str],
|
|
128
|
+
) -> FailedCommand | None:
|
|
129
|
+
for i, command in enumerate(commands):
|
|
130
|
+
result = self.__common_fake_run_command(command, (0, ""))[0]
|
|
131
|
+
if result != 0:
|
|
132
|
+
return FailedCommand(
|
|
133
|
+
return_code=result,
|
|
134
|
+
name=per_command_name[i],
|
|
135
|
+
command=command,
|
|
136
|
+
logfile=output_logs[i],
|
|
137
|
+
)
|
|
138
|
+
return None
|
|
139
|
+
|
|
114
140
|
# pylint: enable=unused-argument
|
|
115
141
|
|
|
116
142
|
def __common_fake_run_command(
|
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import pytest
|
|
18
18
|
from pytest_mock import MockerFixture
|
|
19
19
|
|
|
20
|
-
from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
|
|
20
|
+
from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
|
|
21
21
|
from xpk.core.testing.commands_tester import CommandsTester
|
|
22
22
|
|
|
23
23
|
|
|
@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
|
|
|
31
31
|
run_command_with_updates_retry_path=(
|
|
32
32
|
"xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
|
|
33
33
|
),
|
|
34
|
+
run_command_batch_path=(
|
|
35
|
+
"xpk.core.testing.commands_tester_test.run_command_batch"
|
|
36
|
+
),
|
|
34
37
|
)
|
|
35
38
|
|
|
36
39
|
|
|
@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
|
|
|
54
57
|
mock_commands.assert_command_run("cmd", "bar")
|
|
55
58
|
|
|
56
59
|
|
|
60
|
+
def test_run_command_batch_default_result(
|
|
61
|
+
mock_commands: CommandsTester,
|
|
62
|
+
):
|
|
63
|
+
result = run_command_batch(
|
|
64
|
+
commands=["cmd1 foo bar", "cmd2 foo bar"],
|
|
65
|
+
jobname="Test command",
|
|
66
|
+
per_command_name=["cmd1", "cmd2"],
|
|
67
|
+
output_logs=["log1", "log2"],
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
assert result is None
|
|
71
|
+
mock_commands.assert_command_run("foo bar", times=2)
|
|
72
|
+
mock_commands.assert_command_run("cmd1")
|
|
73
|
+
mock_commands.assert_command_run("cmd2")
|
|
74
|
+
|
|
75
|
+
|
|
57
76
|
def test_set_result_for_command(mock_commands: CommandsTester):
|
|
58
77
|
mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")
|
|
59
78
|
|
|
@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
|
|
|
84
84
|
volumes.append(
|
|
85
85
|
{'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
|
|
86
86
|
)
|
|
87
|
+
volumes.append({
|
|
88
|
+
'name': 'dshm',
|
|
89
|
+
'emptyDir': {
|
|
90
|
+
'medium': 'Memory',
|
|
91
|
+
},
|
|
92
|
+
})
|
|
87
93
|
|
|
88
94
|
|
|
89
95
|
def add_tolerations(job_manifest):
|
|
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
|
|
|
111
117
|
container['volumeMounts'].append(
|
|
112
118
|
{'name': 'gib', 'mountPath': '/usr/local/gib'}
|
|
113
119
|
)
|
|
120
|
+
container['volumeMounts'].append(
|
|
121
|
+
{'name': 'dshm', 'mountPath': '/dev/shm'}
|
|
122
|
+
)
|
xpk/parser/cluster.py
CHANGED
|
@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
338
338
|
add_resource_limits(cluster_create_resource_limits)
|
|
339
339
|
|
|
340
340
|
cluster_create_ray_parser.set_defaults(
|
|
341
|
-
func=cluster_create_ray_cluster,
|
|
341
|
+
func=cluster_create_ray_cluster,
|
|
342
|
+
sub_slicing=False,
|
|
343
|
+
super_slicing=False,
|
|
344
|
+
num_cubes=None,
|
|
342
345
|
)
|
|
343
346
|
|
|
344
347
|
|
|
@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
503
506
|
)
|
|
504
507
|
add_driver_arguments(cluster_adapt_optional_arguments)
|
|
505
508
|
add_shared_arguments(cluster_adapt_optional_arguments)
|
|
509
|
+
add_resource_limits(cluster_adapt_optional_arguments)
|
|
510
|
+
|
|
511
|
+
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
512
|
+
add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
|
|
513
|
+
|
|
514
|
+
if FeatureFlags.SUPER_SLICING_ENABLED:
|
|
515
|
+
add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)
|
|
506
516
|
|
|
507
517
|
cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
|
|
508
518
|
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|