xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +124 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.0.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
|
xpk/core/system_characteristics.py
CHANGED
|
@@ -15,11 +15,29 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from dataclasses import dataclass
|
|
18
|
+
import dataclasses
|
|
19
|
+
from typing import Callable, Literal, Optional
|
|
20
|
+
|
|
21
|
+
from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, tcpx_decorator
|
|
18
22
|
from ..utils.topology import get_topology_product
|
|
19
23
|
from enum import Enum
|
|
20
24
|
|
|
21
25
|
SUB_SLICING_TOPOLOGIES = ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
|
|
22
26
|
|
|
27
|
+
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
28
|
+
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
29
|
+
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
30
|
+
INSTALLER_NCCL_RDMA_A4X = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml'
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DockerPlatform(str, Enum):
|
|
34
|
+
AMD = 'linux/amd64'
|
|
35
|
+
ARM = 'linux/arm64'
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
AMD_PLATFORM = DockerPlatform.AMD
|
|
39
|
+
ARM_PLATFORM = DockerPlatform.ARM
|
|
40
|
+
|
|
23
41
|
|
|
24
42
|
class AcceleratorType(Enum):
|
|
25
43
|
TPU = 1
|
|
@@ -56,6 +74,45 @@ AcceleratorTypeToAcceleratorCharacteristics = {
|
|
|
56
74
|
}
|
|
57
75
|
|
|
58
76
|
|
|
77
|
+
@dataclass
|
|
78
|
+
class GpuConfig:
|
|
79
|
+
"""Contains GPU-specific configuration and requirements."""
|
|
80
|
+
|
|
81
|
+
requires_topology: bool
|
|
82
|
+
gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
|
|
83
|
+
kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
|
|
84
|
+
"""A function to decorate the kjob template for GPU-specific configurations.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
job_manifest (dict): The kjob manifest as a dictionary.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
dict: The modified kjob manifest as a dictionary.
|
|
91
|
+
"""
|
|
92
|
+
nccl_installer: Optional[str] = None
|
|
93
|
+
jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None
|
|
94
|
+
"""A function to decorate the jobset for GPU-specific configurations.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
jobset_manifest_str (str): The JobSet manifest as a YAML string.
|
|
98
|
+
sub_networks (list[str], optional): A list of sub-network names, used by some decorators.
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
str: The modified JobSet manifest as a YAML string.
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
def __repr__(self) -> str:
|
|
105
|
+
"""Returns a string representation of the GpuConfig, omitting memory addresses for functions."""
|
|
106
|
+
parts = []
|
|
107
|
+
for f in dataclasses.fields(self):
|
|
108
|
+
value = getattr(self, f.name)
|
|
109
|
+
if f.name in ('kjob_decorator_fn', 'jobset_decorator_fn') and value:
|
|
110
|
+
parts.append(f'{f.name}=<function {value.__name__}>')
|
|
111
|
+
else:
|
|
112
|
+
parts.append(f'{f.name}={repr(value)}')
|
|
113
|
+
return f"GpuConfig({', '.join(parts)})"
|
|
114
|
+
|
|
115
|
+
|
|
59
116
|
@dataclass
|
|
60
117
|
class SystemCharacteristics:
|
|
61
118
|
"""Contains the defining characteristics of a specific accelerator system.
|
|
@@ -92,12 +149,28 @@ class SystemCharacteristics:
|
|
|
92
149
|
accelerator_type: AcceleratorType
|
|
93
150
|
device_type: str
|
|
94
151
|
supports_sub_slicing: bool
|
|
152
|
+
docker_platform: DockerPlatform
|
|
95
153
|
requires_workload_policy: bool = False
|
|
154
|
+
gpu_config: Optional[GpuConfig] = None
|
|
96
155
|
|
|
97
156
|
def __post_init__(self):
|
|
98
157
|
if self.accelerator_type == AcceleratorType.GPU:
|
|
99
158
|
self.requires_workload_policy = True
|
|
100
159
|
|
|
160
|
+
if self.gpu_config is None:
|
|
161
|
+
raise ValueError(
|
|
162
|
+
f"Validation Error: System '{self.device_type}' is a GPU, "
|
|
163
|
+
"but 'gpu_config' was not provided."
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
@property
|
|
167
|
+
def gpu_requires_topology(self) -> bool:
|
|
168
|
+
"""
|
|
169
|
+
Safely returns whether the GPU config requires topology,
|
|
170
|
+
defaulting to False if no GPU config exists.
|
|
171
|
+
"""
|
|
172
|
+
return self.gpu_config.requires_topology if self.gpu_config else False
|
|
173
|
+
|
|
101
174
|
|
|
102
175
|
def get_system_characteristics(
|
|
103
176
|
args,
|
|
@@ -167,6 +240,7 @@ def get_tpu_system_characteristics_map(
|
|
|
167
240
|
machine_type: str,
|
|
168
241
|
supported_topologies: list[str],
|
|
169
242
|
supports_sub_slicing: bool,
|
|
243
|
+
docker_platform: DockerPlatform,
|
|
170
244
|
tpu_type_requires_workload_policy: bool = False,
|
|
171
245
|
default_topologies: set[str] | None = None,
|
|
172
246
|
) -> dict[str, SystemCharacteristics]:
|
|
@@ -189,6 +263,7 @@ def get_tpu_system_characteristics_map(
|
|
|
189
263
|
requires_workload_policy=tpu_type_requires_workload_policy
|
|
190
264
|
and vms_per_slice > 1,
|
|
191
265
|
supports_sub_slicing=supports_sub_slicing,
|
|
266
|
+
docker_platform=docker_platform,
|
|
192
267
|
)
|
|
193
268
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
194
269
|
if (
|
|
@@ -231,6 +306,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
231
306
|
accelerator_type=AcceleratorType.GPU,
|
|
232
307
|
device_type='l4-1',
|
|
233
308
|
supports_sub_slicing=False,
|
|
309
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
310
|
+
docker_platform=AMD_PLATFORM,
|
|
234
311
|
),
|
|
235
312
|
'l4-2': SystemCharacteristics(
|
|
236
313
|
topology='N/A',
|
|
@@ -241,6 +318,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
241
318
|
accelerator_type=AcceleratorType.GPU,
|
|
242
319
|
device_type='l4-2',
|
|
243
320
|
supports_sub_slicing=False,
|
|
321
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
322
|
+
docker_platform=AMD_PLATFORM,
|
|
244
323
|
),
|
|
245
324
|
'l4-4': SystemCharacteristics(
|
|
246
325
|
topology='N/A',
|
|
@@ -251,6 +330,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
251
330
|
accelerator_type=AcceleratorType.GPU,
|
|
252
331
|
device_type='l4-4',
|
|
253
332
|
supports_sub_slicing=False,
|
|
333
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
334
|
+
docker_platform=AMD_PLATFORM,
|
|
254
335
|
),
|
|
255
336
|
'l4-8': SystemCharacteristics(
|
|
256
337
|
topology='N/A',
|
|
@@ -261,6 +342,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
261
342
|
accelerator_type=AcceleratorType.GPU,
|
|
262
343
|
device_type='l4-8',
|
|
263
344
|
supports_sub_slicing=False,
|
|
345
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
346
|
+
docker_platform=AMD_PLATFORM,
|
|
264
347
|
),
|
|
265
348
|
# A100-40gb-$CHIPSc
|
|
266
349
|
'a100-40gb-1': SystemCharacteristics(
|
|
@@ -272,6 +355,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
272
355
|
accelerator_type=AcceleratorType.GPU,
|
|
273
356
|
device_type='a100-40gb-1',
|
|
274
357
|
supports_sub_slicing=False,
|
|
358
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
359
|
+
docker_platform=AMD_PLATFORM,
|
|
275
360
|
),
|
|
276
361
|
'a100-40gb-2': SystemCharacteristics(
|
|
277
362
|
topology='N/A',
|
|
@@ -282,6 +367,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
282
367
|
accelerator_type=AcceleratorType.GPU,
|
|
283
368
|
device_type='a100-40gb-2',
|
|
284
369
|
supports_sub_slicing=False,
|
|
370
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
371
|
+
docker_platform=AMD_PLATFORM,
|
|
285
372
|
),
|
|
286
373
|
'a100-40gb-4': SystemCharacteristics(
|
|
287
374
|
topology='N/A',
|
|
@@ -292,6 +379,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
292
379
|
accelerator_type=AcceleratorType.GPU,
|
|
293
380
|
device_type='a100-40gb-4',
|
|
294
381
|
supports_sub_slicing=False,
|
|
382
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
383
|
+
docker_platform=AMD_PLATFORM,
|
|
295
384
|
),
|
|
296
385
|
'a100-40gb-8': SystemCharacteristics(
|
|
297
386
|
topology='N/A',
|
|
@@ -302,6 +391,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
302
391
|
accelerator_type=AcceleratorType.GPU,
|
|
303
392
|
device_type='a100-40gb-8',
|
|
304
393
|
supports_sub_slicing=False,
|
|
394
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
395
|
+
docker_platform=AMD_PLATFORM,
|
|
305
396
|
),
|
|
306
397
|
'gb200-4': SystemCharacteristics(
|
|
307
398
|
topology='1x72',
|
|
@@ -312,6 +403,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
312
403
|
accelerator_type=AcceleratorType.GPU,
|
|
313
404
|
device_type='gb200-4',
|
|
314
405
|
supports_sub_slicing=False,
|
|
406
|
+
gpu_config=GpuConfig(
|
|
407
|
+
requires_topology=True,
|
|
408
|
+
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
409
|
+
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
410
|
+
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
411
|
+
gpu_direct_name='rdma',
|
|
412
|
+
),
|
|
413
|
+
docker_platform=ARM_PLATFORM,
|
|
315
414
|
),
|
|
316
415
|
'gb200-4-nolssd': SystemCharacteristics(
|
|
317
416
|
topology='1x72',
|
|
@@ -322,6 +421,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
322
421
|
accelerator_type=AcceleratorType.GPU,
|
|
323
422
|
device_type='gb200-4',
|
|
324
423
|
supports_sub_slicing=False,
|
|
424
|
+
gpu_config=GpuConfig(
|
|
425
|
+
requires_topology=True,
|
|
426
|
+
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
427
|
+
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
428
|
+
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
429
|
+
gpu_direct_name='rdma',
|
|
430
|
+
),
|
|
431
|
+
docker_platform=ARM_PLATFORM,
|
|
325
432
|
),
|
|
326
433
|
'b200-8': SystemCharacteristics(
|
|
327
434
|
topology='N/A',
|
|
@@ -332,6 +439,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
332
439
|
accelerator_type=AcceleratorType.GPU,
|
|
333
440
|
device_type='b200-8',
|
|
334
441
|
supports_sub_slicing=False,
|
|
442
|
+
gpu_config=GpuConfig(
|
|
443
|
+
requires_topology=True,
|
|
444
|
+
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
445
|
+
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
446
|
+
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
447
|
+
gpu_direct_name='rdma',
|
|
448
|
+
),
|
|
449
|
+
docker_platform=AMD_PLATFORM,
|
|
335
450
|
),
|
|
336
451
|
'h200-141gb-8': SystemCharacteristics(
|
|
337
452
|
topology='N/A',
|
|
@@ -342,6 +457,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
342
457
|
accelerator_type=AcceleratorType.GPU,
|
|
343
458
|
device_type='h200-141gb-8',
|
|
344
459
|
supports_sub_slicing=False,
|
|
460
|
+
gpu_config=GpuConfig(
|
|
461
|
+
requires_topology=True,
|
|
462
|
+
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
463
|
+
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
464
|
+
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
465
|
+
gpu_direct_name='rdma',
|
|
466
|
+
),
|
|
467
|
+
docker_platform=AMD_PLATFORM,
|
|
345
468
|
),
|
|
346
469
|
# H100-80gb-$CHIPS
|
|
347
470
|
'h100-80gb-8': SystemCharacteristics(
|
|
@@ -353,6 +476,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
353
476
|
accelerator_type=AcceleratorType.GPU,
|
|
354
477
|
device_type='h100-80gb-8',
|
|
355
478
|
supports_sub_slicing=False,
|
|
479
|
+
gpu_config=GpuConfig(
|
|
480
|
+
requires_topology=True,
|
|
481
|
+
nccl_installer=INSTALLER_NCCL_TCPX,
|
|
482
|
+
kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
|
|
483
|
+
jobset_decorator_fn=tcpx_decorator.decorate_jobset,
|
|
484
|
+
gpu_direct_name='tcpx',
|
|
485
|
+
),
|
|
486
|
+
docker_platform=AMD_PLATFORM,
|
|
356
487
|
),
|
|
357
488
|
# H100-mega-80gb-$CHIPS
|
|
358
489
|
'h100-mega-80gb-8': SystemCharacteristics(
|
|
@@ -364,6 +495,14 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
364
495
|
accelerator_type=AcceleratorType.GPU,
|
|
365
496
|
device_type='h100-mega-80gb-8',
|
|
366
497
|
supports_sub_slicing=False,
|
|
498
|
+
gpu_config=GpuConfig(
|
|
499
|
+
requires_topology=True,
|
|
500
|
+
nccl_installer=INSTALLER_NCCL_TCPXO,
|
|
501
|
+
kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
|
|
502
|
+
jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
|
|
503
|
+
gpu_direct_name='tcpxo',
|
|
504
|
+
),
|
|
505
|
+
docker_platform=AMD_PLATFORM,
|
|
367
506
|
),
|
|
368
507
|
# TPU system characteristics
|
|
369
508
|
**get_tpu_system_characteristics_map(
|
|
@@ -374,6 +513,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
374
513
|
supported_topologies=['1x1x1'],
|
|
375
514
|
tpu_type_requires_workload_policy=True,
|
|
376
515
|
supports_sub_slicing=False,
|
|
516
|
+
docker_platform=AMD_PLATFORM,
|
|
377
517
|
),
|
|
378
518
|
**get_tpu_system_characteristics_map(
|
|
379
519
|
prefix='tpu7x',
|
|
@@ -382,6 +522,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
382
522
|
machine_type='tpu7x-standard-4t',
|
|
383
523
|
tpu_type_requires_workload_policy=True,
|
|
384
524
|
supports_sub_slicing=False,
|
|
525
|
+
docker_platform=AMD_PLATFORM,
|
|
385
526
|
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
386
527
|
default_topologies=set([
|
|
387
528
|
'12x12x12',
|
|
@@ -491,6 +632,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
491
632
|
machine_type='ct6e-standard-1t',
|
|
492
633
|
supports_sub_slicing=False,
|
|
493
634
|
supported_topologies=['1x1'],
|
|
635
|
+
docker_platform=AMD_PLATFORM,
|
|
494
636
|
),
|
|
495
637
|
**get_tpu_system_characteristics_map(
|
|
496
638
|
prefix='v6e',
|
|
@@ -501,6 +643,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
501
643
|
supported_topologies=[
|
|
502
644
|
'2x2',
|
|
503
645
|
],
|
|
646
|
+
docker_platform=AMD_PLATFORM,
|
|
504
647
|
),
|
|
505
648
|
**get_tpu_system_characteristics_map(
|
|
506
649
|
prefix='v6e',
|
|
@@ -509,6 +652,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
509
652
|
machine_type='ct6e-standard-4t',
|
|
510
653
|
supports_sub_slicing=True,
|
|
511
654
|
supported_topologies=SUB_SLICING_TOPOLOGIES,
|
|
655
|
+
docker_platform=AMD_PLATFORM,
|
|
512
656
|
),
|
|
513
657
|
**get_tpu_system_characteristics_map(
|
|
514
658
|
prefix='v5p',
|
|
@@ -516,6 +660,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
516
660
|
gke_accelerator='tpu-v5p-slice',
|
|
517
661
|
machine_type='ct5p-hightpu-4t',
|
|
518
662
|
supports_sub_slicing=False,
|
|
663
|
+
docker_platform=AMD_PLATFORM,
|
|
519
664
|
supported_topologies=generate_tpu_topologies(max_cubes=140),
|
|
520
665
|
default_topologies=set([
|
|
521
666
|
'2x2x1',
|
|
@@ -621,6 +766,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
621
766
|
tensorcores_per_chip=1,
|
|
622
767
|
gke_accelerator='tpu-v5-lite-podslice',
|
|
623
768
|
machine_type='ct5lp-hightpu-4t',
|
|
769
|
+
docker_platform=AMD_PLATFORM,
|
|
624
770
|
supports_sub_slicing=False,
|
|
625
771
|
supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
|
|
626
772
|
),
|
|
@@ -629,6 +775,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
629
775
|
tensorcores_per_chip=2,
|
|
630
776
|
gke_accelerator='tpu-v4-podslice',
|
|
631
777
|
machine_type='ct4p-hightpu-4t',
|
|
778
|
+
docker_platform=AMD_PLATFORM,
|
|
632
779
|
supports_sub_slicing=False,
|
|
633
780
|
supported_topologies=generate_tpu_topologies(
|
|
634
781
|
max_cubes=64, enforce_nondecreasing=False
|
|
@@ -660,6 +807,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
660
807
|
accelerator_type=AcceleratorType.CPU,
|
|
661
808
|
device_type='m1-megamem-96-1',
|
|
662
809
|
supports_sub_slicing=False,
|
|
810
|
+
docker_platform=AMD_PLATFORM,
|
|
663
811
|
),
|
|
664
812
|
# n2-standard-#vCPUs-#VMs
|
|
665
813
|
'n2-standard-64-1': SystemCharacteristics(
|
|
@@ -671,6 +819,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
671
819
|
accelerator_type=AcceleratorType.CPU,
|
|
672
820
|
device_type='n2-standard-64-1',
|
|
673
821
|
supports_sub_slicing=False,
|
|
822
|
+
docker_platform=AMD_PLATFORM,
|
|
674
823
|
),
|
|
675
824
|
'n2-standard-32-1': SystemCharacteristics(
|
|
676
825
|
topology='N/A',
|
|
@@ -681,6 +830,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
681
830
|
accelerator_type=AcceleratorType.CPU,
|
|
682
831
|
device_type='n2-standard-32-1',
|
|
683
832
|
supports_sub_slicing=False,
|
|
833
|
+
docker_platform=AMD_PLATFORM,
|
|
684
834
|
),
|
|
685
835
|
'n2-standard-32-2': SystemCharacteristics(
|
|
686
836
|
topology='N/A',
|
|
@@ -691,6 +841,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
691
841
|
accelerator_type=AcceleratorType.CPU,
|
|
692
842
|
device_type='n2-standard-32-2',
|
|
693
843
|
supports_sub_slicing=False,
|
|
844
|
+
docker_platform=AMD_PLATFORM,
|
|
694
845
|
),
|
|
695
846
|
'n2-standard-32-4': SystemCharacteristics(
|
|
696
847
|
topology='N/A',
|
|
@@ -701,6 +852,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
701
852
|
accelerator_type=AcceleratorType.CPU,
|
|
702
853
|
device_type='n2-standard-32-4',
|
|
703
854
|
supports_sub_slicing=False,
|
|
855
|
+
docker_platform=AMD_PLATFORM,
|
|
704
856
|
),
|
|
705
857
|
'n2-standard-32-8': SystemCharacteristics(
|
|
706
858
|
topology='N/A',
|
|
@@ -711,6 +863,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
711
863
|
accelerator_type=AcceleratorType.CPU,
|
|
712
864
|
device_type='n2-standard-32-8',
|
|
713
865
|
supports_sub_slicing=False,
|
|
866
|
+
docker_platform=AMD_PLATFORM,
|
|
714
867
|
),
|
|
715
868
|
'n2-standard-32-16': SystemCharacteristics(
|
|
716
869
|
topology='N/A',
|
|
@@ -721,6 +874,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
721
874
|
accelerator_type=AcceleratorType.CPU,
|
|
722
875
|
device_type='n2-standard-32-16',
|
|
723
876
|
supports_sub_slicing=False,
|
|
877
|
+
docker_platform=AMD_PLATFORM,
|
|
724
878
|
),
|
|
725
879
|
'n2-standard-32-32': SystemCharacteristics(
|
|
726
880
|
topology='N/A',
|
|
@@ -731,6 +885,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
731
885
|
accelerator_type=AcceleratorType.CPU,
|
|
732
886
|
device_type='n2-standard-32-32',
|
|
733
887
|
supports_sub_slicing=False,
|
|
888
|
+
docker_platform=AMD_PLATFORM,
|
|
734
889
|
),
|
|
735
890
|
'n2-standard-32-64': SystemCharacteristics(
|
|
736
891
|
topology='N/A',
|
|
@@ -741,6 +896,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
741
896
|
accelerator_type=AcceleratorType.CPU,
|
|
742
897
|
device_type='n2-standard-32-64',
|
|
743
898
|
supports_sub_slicing=False,
|
|
899
|
+
docker_platform=AMD_PLATFORM,
|
|
744
900
|
),
|
|
745
901
|
'n2-standard-32-128': SystemCharacteristics(
|
|
746
902
|
topology='N/A',
|
|
@@ -751,6 +907,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
751
907
|
accelerator_type=AcceleratorType.CPU,
|
|
752
908
|
device_type='n2-standard-32-128',
|
|
753
909
|
supports_sub_slicing=False,
|
|
910
|
+
docker_platform=AMD_PLATFORM,
|
|
754
911
|
),
|
|
755
912
|
'n2-standard-32-256': SystemCharacteristics(
|
|
756
913
|
topology='N/A',
|
|
@@ -761,6 +918,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
761
918
|
accelerator_type=AcceleratorType.CPU,
|
|
762
919
|
device_type='n2-standard-32-256',
|
|
763
920
|
supports_sub_slicing=False,
|
|
921
|
+
docker_platform=AMD_PLATFORM,
|
|
764
922
|
),
|
|
765
923
|
'n2-standard-32-512': SystemCharacteristics(
|
|
766
924
|
topology='N/A',
|
|
@@ -771,6 +929,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
771
929
|
accelerator_type=AcceleratorType.CPU,
|
|
772
930
|
device_type='n2-standard-32-512',
|
|
773
931
|
supports_sub_slicing=False,
|
|
932
|
+
docker_platform=AMD_PLATFORM,
|
|
774
933
|
),
|
|
775
934
|
'n2-standard-32-1024': SystemCharacteristics(
|
|
776
935
|
topology='N/A',
|
|
@@ -781,6 +940,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
781
940
|
accelerator_type=AcceleratorType.CPU,
|
|
782
941
|
device_type='n2-standard-32-1024',
|
|
783
942
|
supports_sub_slicing=False,
|
|
943
|
+
docker_platform=AMD_PLATFORM,
|
|
784
944
|
),
|
|
785
945
|
'n2-standard-32-2048': SystemCharacteristics(
|
|
786
946
|
topology='N/A',
|
|
@@ -791,6 +951,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
791
951
|
accelerator_type=AcceleratorType.CPU,
|
|
792
952
|
device_type='n2-standard-32-2048',
|
|
793
953
|
supports_sub_slicing=False,
|
|
954
|
+
docker_platform=AMD_PLATFORM,
|
|
794
955
|
),
|
|
795
956
|
}
|
|
796
957
|
""" If you modify UserFacingNameToSystemCharacteristics you should also modify
|
|
@@ -808,3 +969,21 @@ def get_system_characteristics_keys_by_accelerator_type(
|
|
|
808
969
|
for key, value in UserFacingNameToSystemCharacteristics.items()
|
|
809
970
|
if value.accelerator_type in accelerators
|
|
810
971
|
]
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def create_accelerator_label(system: SystemCharacteristics) -> str:
|
|
975
|
+
if system.accelerator_type == AcceleratorType.CPU:
|
|
976
|
+
return ''
|
|
977
|
+
return (
|
|
978
|
+
f'{AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].accelerator_label}:'
|
|
979
|
+
f' {system.gke_accelerator}'
|
|
980
|
+
)
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def create_machine_label(system: SystemCharacteristics) -> str:
|
|
984
|
+
if system.accelerator_type == AcceleratorType.TPU:
|
|
985
|
+
return (
|
|
986
|
+
f'{AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].machine_label}:'
|
|
987
|
+
f' {system.topology}'
|
|
988
|
+
)
|
|
989
|
+
return ''
|
xpk/core/system_characteristics_test.py
CHANGED
|
@@ -14,7 +14,15 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
import pytest
|
|
18
|
+
from .system_characteristics import (
|
|
19
|
+
get_tpu_system_characteristics_map,
|
|
20
|
+
generate_tpu_topologies,
|
|
21
|
+
DockerPlatform,
|
|
22
|
+
SystemCharacteristics,
|
|
23
|
+
AcceleratorType,
|
|
24
|
+
GpuConfig,
|
|
25
|
+
)
|
|
18
26
|
|
|
19
27
|
|
|
20
28
|
def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
|
|
@@ -25,6 +33,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
25
33
|
machine_type="test",
|
|
26
34
|
supported_topologies=["1x1"],
|
|
27
35
|
supports_sub_slicing=False,
|
|
36
|
+
docker_platform=DockerPlatform.AMD,
|
|
28
37
|
tpu_type_requires_workload_policy=False,
|
|
29
38
|
)
|
|
30
39
|
|
|
@@ -37,6 +46,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
37
46
|
accelerator_type=AcceleratorType.TPU,
|
|
38
47
|
device_type="test-1",
|
|
39
48
|
supports_sub_slicing=False,
|
|
49
|
+
docker_platform=DockerPlatform.AMD,
|
|
40
50
|
requires_workload_policy=False,
|
|
41
51
|
)
|
|
42
52
|
assert result == {
|
|
@@ -53,6 +63,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
53
63
|
machine_type="test",
|
|
54
64
|
supported_topologies=["2x2"],
|
|
55
65
|
supports_sub_slicing=False,
|
|
66
|
+
docker_platform=DockerPlatform.AMD,
|
|
56
67
|
tpu_type_requires_workload_policy=True,
|
|
57
68
|
)
|
|
58
69
|
|
|
@@ -65,6 +76,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
65
76
|
accelerator_type=AcceleratorType.TPU,
|
|
66
77
|
device_type="test-8",
|
|
67
78
|
supports_sub_slicing=False,
|
|
79
|
+
docker_platform=DockerPlatform.AMD,
|
|
68
80
|
requires_workload_policy=False,
|
|
69
81
|
)
|
|
70
82
|
assert result == {
|
|
@@ -81,6 +93,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
81
93
|
machine_type="test",
|
|
82
94
|
supported_topologies=["2x2x2"],
|
|
83
95
|
supports_sub_slicing=False,
|
|
96
|
+
docker_platform=DockerPlatform.AMD,
|
|
84
97
|
tpu_type_requires_workload_policy=True,
|
|
85
98
|
)
|
|
86
99
|
|
|
@@ -93,6 +106,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
93
106
|
accelerator_type=AcceleratorType.TPU,
|
|
94
107
|
device_type="test-16",
|
|
95
108
|
supports_sub_slicing=False,
|
|
109
|
+
docker_platform=DockerPlatform.AMD,
|
|
96
110
|
requires_workload_policy=True,
|
|
97
111
|
)
|
|
98
112
|
assert result == {
|
|
@@ -109,6 +123,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
|
109
123
|
machine_type="test",
|
|
110
124
|
supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
|
|
111
125
|
supports_sub_slicing=False,
|
|
126
|
+
docker_platform=DockerPlatform.AMD,
|
|
112
127
|
default_topologies=set(["4x8x16"]),
|
|
113
128
|
)
|
|
114
129
|
|
|
@@ -146,3 +161,36 @@ def test_generate_tpu_topologies_contains_sub_cube_slices():
|
|
|
146
161
|
one_cube = generate_tpu_topologies(max_cubes=1)
|
|
147
162
|
|
|
148
163
|
assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
|
|
167
|
+
"""Tests that __post_init__ correctly sets requires_workload_policy for GPUs."""
|
|
168
|
+
gpu_system = SystemCharacteristics(
|
|
169
|
+
topology="N/A",
|
|
170
|
+
vms_per_slice=1,
|
|
171
|
+
gke_accelerator="nvidia-l4",
|
|
172
|
+
gce_machine_type="g2-standard-12",
|
|
173
|
+
chips_per_vm=1,
|
|
174
|
+
accelerator_type=AcceleratorType.GPU,
|
|
175
|
+
device_type="l4-1",
|
|
176
|
+
supports_sub_slicing=False,
|
|
177
|
+
docker_platform=DockerPlatform.AMD,
|
|
178
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
179
|
+
)
|
|
180
|
+
assert gpu_system.requires_workload_policy is True
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def test_system_characteristics_post_init_throws_for_gpu_without_config():
|
|
184
|
+
"""Tests that __post_init__ raises ValueError for GPU without gpu_config."""
|
|
185
|
+
with pytest.raises(ValueError, match="'gpu_config' was not provided"):
|
|
186
|
+
SystemCharacteristics(
|
|
187
|
+
topology="N/A",
|
|
188
|
+
vms_per_slice=1,
|
|
189
|
+
gke_accelerator="nvidia-l4",
|
|
190
|
+
gce_machine_type="g2-standard-12",
|
|
191
|
+
chips_per_vm=1,
|
|
192
|
+
accelerator_type=AcceleratorType.GPU,
|
|
193
|
+
device_type="l4-1",
|
|
194
|
+
supports_sub_slicing=False,
|
|
195
|
+
docker_platform=DockerPlatform.AMD,
|
|
196
|
+
)
|
xpk/core/telemetry.py
CHANGED
|
@@ -27,7 +27,7 @@ import requests
|
|
|
27
27
|
from enum import Enum
|
|
28
28
|
from typing import Any
|
|
29
29
|
from dataclasses import dataclass
|
|
30
|
-
from .config import
|
|
30
|
+
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
|
|
31
31
|
from ..utils.execution_context import is_dry_run
|
|
32
32
|
from ..utils.user_agent import get_user_agent
|
|
33
33
|
from ..utils.feature_flags import FeatureFlags
|
|
@@ -36,7 +36,7 @@ from ..utils.feature_flags import FeatureFlags
|
|
|
36
36
|
def should_send_telemetry():
|
|
37
37
|
return (
|
|
38
38
|
FeatureFlags.TELEMETRY_ENABLED
|
|
39
|
-
and
|
|
39
|
+
and get_config().get(SEND_TELEMETRY_KEY) != "false"
|
|
40
40
|
)
|
|
41
41
|
|
|
42
42
|
|
|
@@ -254,10 +254,10 @@ def _get_session_id() -> str:
|
|
|
254
254
|
|
|
255
255
|
def _ensure_client_id() -> str:
|
|
256
256
|
"""Generates Client ID and stores in configuration if not already present."""
|
|
257
|
-
current_client_id =
|
|
257
|
+
current_client_id = get_config().get(CLIENT_ID_KEY)
|
|
258
258
|
if current_client_id is not None:
|
|
259
259
|
return current_client_id
|
|
260
260
|
|
|
261
261
|
new_client_id = str(uuid.uuid4())
|
|
262
|
-
|
|
262
|
+
get_config().set(CLIENT_ID_KEY, new_client_id)
|
|
263
263
|
return new_client_id
|
xpk/core/telemetry_test.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import pytest
|
|
18
18
|
import json
|
|
19
|
-
from .config import
|
|
19
|
+
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
|
|
20
20
|
from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
|
|
21
21
|
from ..utils.execution_context import set_dry_run
|
|
22
22
|
from ..utils.feature_flags import FeatureFlags
|
|
@@ -31,9 +31,9 @@ def setup_mocks(mocker: MockerFixture):
|
|
|
31
31
|
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
32
32
|
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
33
33
|
set_dry_run(False)
|
|
34
|
-
|
|
34
|
+
get_config().set(CLIENT_ID_KEY, 'client_id')
|
|
35
35
|
yield
|
|
36
|
-
|
|
36
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
@pytest.mark.parametrize(
|
|
@@ -48,13 +48,13 @@ def setup_mocks(mocker: MockerFixture):
|
|
|
48
48
|
def test_should_send_telemetry_returns_correct_value(
|
|
49
49
|
feature_flag: bool, config_value: str, expected: bool
|
|
50
50
|
):
|
|
51
|
-
|
|
51
|
+
get_config().set(SEND_TELEMETRY_KEY, config_value)
|
|
52
52
|
FeatureFlags.TELEMETRY_ENABLED = feature_flag
|
|
53
53
|
assert should_send_telemetry() is expected
|
|
54
54
|
|
|
55
55
|
|
|
56
56
|
def test_metrics_collector_generates_client_id_if_not_present():
|
|
57
|
-
|
|
57
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
58
58
|
MetricsCollector.log_start(command='test')
|
|
59
59
|
payload = json.loads(MetricsCollector.flush())
|
|
60
60
|
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
@@ -79,12 +79,12 @@ def test_metrics_collector_logs_start_event_correctly():
|
|
|
79
79
|
],
|
|
80
80
|
'event_name': 'start',
|
|
81
81
|
'event_type': 'commands',
|
|
82
|
-
'release_version': 'v0.
|
|
82
|
+
'release_version': 'v0.0.0',
|
|
83
83
|
}
|
|
84
84
|
|
|
85
85
|
|
|
86
86
|
def test_metrics_collector_generates_client_id_when_not_present():
|
|
87
|
-
|
|
87
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
88
88
|
MetricsCollector.log_start(command='test')
|
|
89
89
|
payload = json.loads(MetricsCollector.flush())
|
|
90
90
|
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
@@ -109,7 +109,7 @@ def test_metrics_collector_logs_complete_event_correctly():
|
|
|
109
109
|
],
|
|
110
110
|
'event_name': 'complete',
|
|
111
111
|
'event_type': 'commands',
|
|
112
|
-
'release_version': 'v0.
|
|
112
|
+
'release_version': 'v0.0.0',
|
|
113
113
|
}
|
|
114
114
|
|
|
115
115
|
|
|
@@ -132,7 +132,7 @@ def test_metrics_collector_logs_custom_event_correctly():
|
|
|
132
132
|
],
|
|
133
133
|
'event_name': 'test',
|
|
134
134
|
'event_type': 'custom',
|
|
135
|
-
'release_version': 'v0.
|
|
135
|
+
'release_version': 'v0.0.0',
|
|
136
136
|
}
|
|
137
137
|
|
|
138
138
|
|
xpk/core/vertex.py
CHANGED
|
@@ -15,7 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..utils.console import xpk_print
|
|
18
|
-
from .resources import
|
|
18
|
+
from .resources import ConfigMapType, get_cluster_configmap
|
|
19
19
|
|
|
20
20
|
DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance'
|
|
21
21
|
|
|
@@ -65,8 +65,9 @@ def create_vertex_experiment(args) -> dict | None:
|
|
|
65
65
|
tensorboard,
|
|
66
66
|
)
|
|
67
67
|
|
|
68
|
-
|
|
69
|
-
|
|
68
|
+
cluster_config_map = get_cluster_configmap(
|
|
69
|
+
args.cluster, ConfigMapType.METADATA
|
|
70
|
+
)
|
|
70
71
|
|
|
71
72
|
if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
|
|
72
73
|
xpk_print(
|