xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
|
@@ -14,7 +14,15 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
import pytest
|
|
18
|
+
from .system_characteristics import (
|
|
19
|
+
get_tpu_system_characteristics_map,
|
|
20
|
+
generate_tpu_topologies,
|
|
21
|
+
DockerPlatform,
|
|
22
|
+
SystemCharacteristics,
|
|
23
|
+
AcceleratorType,
|
|
24
|
+
GpuConfig,
|
|
25
|
+
)
|
|
18
26
|
|
|
19
27
|
|
|
20
28
|
def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
|
|
@@ -25,7 +33,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
25
33
|
machine_type="test",
|
|
26
34
|
supported_topologies=["1x1"],
|
|
27
35
|
supports_sub_slicing=False,
|
|
28
|
-
|
|
36
|
+
docker_platform=DockerPlatform.AMD,
|
|
37
|
+
tpu_type_requires_workload_policy=False,
|
|
29
38
|
)
|
|
30
39
|
|
|
31
40
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -37,7 +46,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
37
46
|
accelerator_type=AcceleratorType.TPU,
|
|
38
47
|
device_type="test-1",
|
|
39
48
|
supports_sub_slicing=False,
|
|
40
|
-
|
|
49
|
+
docker_platform=DockerPlatform.AMD,
|
|
50
|
+
requires_workload_policy=False,
|
|
41
51
|
)
|
|
42
52
|
assert result == {
|
|
43
53
|
"test-1": expected_system_characteristics,
|
|
@@ -53,7 +63,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
53
63
|
machine_type="test",
|
|
54
64
|
supported_topologies=["2x2"],
|
|
55
65
|
supports_sub_slicing=False,
|
|
56
|
-
|
|
66
|
+
docker_platform=DockerPlatform.AMD,
|
|
67
|
+
tpu_type_requires_workload_policy=True,
|
|
57
68
|
)
|
|
58
69
|
|
|
59
70
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -65,9 +76,121 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
65
76
|
accelerator_type=AcceleratorType.TPU,
|
|
66
77
|
device_type="test-8",
|
|
67
78
|
supports_sub_slicing=False,
|
|
68
|
-
|
|
79
|
+
docker_platform=DockerPlatform.AMD,
|
|
80
|
+
requires_workload_policy=False,
|
|
69
81
|
)
|
|
70
82
|
assert result == {
|
|
71
83
|
"test-8": expected_system_characteristics,
|
|
72
84
|
"test-2x2": expected_system_characteristics,
|
|
73
85
|
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_topology():
|
|
89
|
+
result = get_tpu_system_characteristics_map(
|
|
90
|
+
prefix="test",
|
|
91
|
+
tensorcores_per_chip=2,
|
|
92
|
+
gke_accelerator="test",
|
|
93
|
+
machine_type="test",
|
|
94
|
+
supported_topologies=["2x2x2"],
|
|
95
|
+
supports_sub_slicing=False,
|
|
96
|
+
docker_platform=DockerPlatform.AMD,
|
|
97
|
+
tpu_type_requires_workload_policy=True,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
expected_system_characteristics = SystemCharacteristics(
|
|
101
|
+
topology="2x2x2",
|
|
102
|
+
vms_per_slice=2,
|
|
103
|
+
gke_accelerator="test",
|
|
104
|
+
gce_machine_type="test",
|
|
105
|
+
chips_per_vm=4,
|
|
106
|
+
accelerator_type=AcceleratorType.TPU,
|
|
107
|
+
device_type="test-16",
|
|
108
|
+
supports_sub_slicing=False,
|
|
109
|
+
docker_platform=DockerPlatform.AMD,
|
|
110
|
+
requires_workload_policy=True,
|
|
111
|
+
)
|
|
112
|
+
assert result == {
|
|
113
|
+
"test-16": expected_system_characteristics,
|
|
114
|
+
"test-2x2x2": expected_system_characteristics,
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
119
|
+
result = get_tpu_system_characteristics_map(
|
|
120
|
+
prefix="test",
|
|
121
|
+
tensorcores_per_chip=2,
|
|
122
|
+
gke_accelerator="test",
|
|
123
|
+
machine_type="test",
|
|
124
|
+
supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
|
|
125
|
+
supports_sub_slicing=False,
|
|
126
|
+
docker_platform=DockerPlatform.AMD,
|
|
127
|
+
default_topologies=set(["4x8x16"]),
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
assert result["test-128"].topology == "4x4x4"
|
|
131
|
+
assert result["test-1024"].topology == "4x8x16"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_generate_tpu_topologies_returns_correct_number_of_values_for_TPU_platforms():
|
|
135
|
+
v4 = generate_tpu_topologies(max_cubes=64, enforce_nondecreasing=False)
|
|
136
|
+
v5p = generate_tpu_topologies(max_cubes=140)
|
|
137
|
+
tpu7x = generate_tpu_topologies(max_cubes=144)
|
|
138
|
+
|
|
139
|
+
assert len(v4) == 800
|
|
140
|
+
assert len(v5p) == 414
|
|
141
|
+
assert len(tpu7x) == 432
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_generate_tpu_topologies_respects_constraints():
|
|
145
|
+
ordered_6_cubes = generate_tpu_topologies(
|
|
146
|
+
max_cubes=6, enforce_nondecreasing=True
|
|
147
|
+
)
|
|
148
|
+
non_ordered_6_cubes = generate_tpu_topologies(
|
|
149
|
+
max_cubes=6, enforce_nondecreasing=False
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
assert "8x4x4" not in ordered_6_cubes
|
|
153
|
+
assert "8x4x4" in non_ordered_6_cubes
|
|
154
|
+
assert "4x8x12" in ordered_6_cubes # exactly 6 cubes
|
|
155
|
+
assert "4x8x12" in non_ordered_6_cubes # exactly 6 cubes
|
|
156
|
+
assert "4x8x16" not in ordered_6_cubes # too many cubes (8)
|
|
157
|
+
assert "4x8x16" not in non_ordered_6_cubes # too many cubes (8)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_generate_tpu_topologies_contains_sub_cube_slices():
|
|
161
|
+
one_cube = generate_tpu_topologies(max_cubes=1)
|
|
162
|
+
|
|
163
|
+
assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
|
|
167
|
+
"""Tests that __post_init__ correctly sets requires_workload_policy for GPUs."""
|
|
168
|
+
gpu_system = SystemCharacteristics(
|
|
169
|
+
topology="N/A",
|
|
170
|
+
vms_per_slice=1,
|
|
171
|
+
gke_accelerator="nvidia-l4",
|
|
172
|
+
gce_machine_type="g2-standard-12",
|
|
173
|
+
chips_per_vm=1,
|
|
174
|
+
accelerator_type=AcceleratorType.GPU,
|
|
175
|
+
device_type="l4-1",
|
|
176
|
+
supports_sub_slicing=False,
|
|
177
|
+
docker_platform=DockerPlatform.AMD,
|
|
178
|
+
gpu_config=GpuConfig(requires_topology=False),
|
|
179
|
+
)
|
|
180
|
+
assert gpu_system.requires_workload_policy is True
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def test_system_characteristics_post_init_throws_for_gpu_without_config():
|
|
184
|
+
"""Tests that __post_init__ raises ValueError for GPU without gpu_config."""
|
|
185
|
+
with pytest.raises(ValueError, match="'gpu_config' was not provided"):
|
|
186
|
+
SystemCharacteristics(
|
|
187
|
+
topology="N/A",
|
|
188
|
+
vms_per_slice=1,
|
|
189
|
+
gke_accelerator="nvidia-l4",
|
|
190
|
+
gce_machine_type="g2-standard-12",
|
|
191
|
+
chips_per_vm=1,
|
|
192
|
+
accelerator_type=AcceleratorType.GPU,
|
|
193
|
+
device_type="l4-1",
|
|
194
|
+
supports_sub_slicing=False,
|
|
195
|
+
docker_platform=DockerPlatform.AMD,
|
|
196
|
+
)
|
xpk/core/telemetry.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import platform
|
|
18
|
+
import uuid
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
import sys
|
|
23
|
+
import importlib
|
|
24
|
+
import subprocess
|
|
25
|
+
import tempfile
|
|
26
|
+
import requests
|
|
27
|
+
from enum import Enum
|
|
28
|
+
from typing import Any
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
|
|
31
|
+
from ..utils.execution_context import is_dry_run
|
|
32
|
+
from ..utils.user_agent import get_user_agent
|
|
33
|
+
from ..utils.feature_flags import FeatureFlags
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def should_send_telemetry():
|
|
37
|
+
return (
|
|
38
|
+
FeatureFlags.TELEMETRY_ENABLED
|
|
39
|
+
and get_config().get(SEND_TELEMETRY_KEY) != "false"
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def send_clearcut_payload(data: str, wait_to_complete: bool = False) -> None:
|
|
44
|
+
"""Sends payload to clearcut endpoint."""
|
|
45
|
+
try:
|
|
46
|
+
file_path = _store_payload_in_temp_file(data)
|
|
47
|
+
if not _schedule_clearcut_background_flush(file_path, wait_to_complete):
|
|
48
|
+
_clearcut_flush(file_path)
|
|
49
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
50
|
+
pass
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _store_payload_in_temp_file(data: str) -> str:
|
|
54
|
+
with tempfile.NamedTemporaryFile(
|
|
55
|
+
mode="w", delete=False, encoding="utf-8"
|
|
56
|
+
) as file:
|
|
57
|
+
json.dump(
|
|
58
|
+
{
|
|
59
|
+
"data": data,
|
|
60
|
+
"url": "https://play.googleapis.com/log",
|
|
61
|
+
"params": {"format": "json_proto"},
|
|
62
|
+
"headers": {"User-Agent": get_user_agent()},
|
|
63
|
+
"method": "POST",
|
|
64
|
+
},
|
|
65
|
+
file,
|
|
66
|
+
)
|
|
67
|
+
return file.name
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _schedule_clearcut_background_flush(
|
|
71
|
+
file_path: str, wait_to_complete: bool
|
|
72
|
+
) -> bool:
|
|
73
|
+
"""Schedules clearcut background flush.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
file_path: path to the temporary file where the events are stored.
|
|
77
|
+
wait_to_complete: whenever to wait for the background script completion.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
True if successful and False otherwise
|
|
81
|
+
"""
|
|
82
|
+
with importlib.resources.path("xpk", "telemetry_uploader.py") as path:
|
|
83
|
+
if not os.path.exists(path):
|
|
84
|
+
return False
|
|
85
|
+
|
|
86
|
+
kwargs: dict[str, Any] = {}
|
|
87
|
+
if sys.platform == "win32":
|
|
88
|
+
kwargs["creationflags"] = (
|
|
89
|
+
subprocess.DETACHED_PROCESS | subprocess.CREATE_NO_WINDOW
|
|
90
|
+
)
|
|
91
|
+
else:
|
|
92
|
+
kwargs["start_new_session"] = True
|
|
93
|
+
|
|
94
|
+
process = subprocess.Popen(
|
|
95
|
+
args=[
|
|
96
|
+
sys.executable,
|
|
97
|
+
str(path),
|
|
98
|
+
file_path,
|
|
99
|
+
],
|
|
100
|
+
stdout=sys.stdout if wait_to_complete else subprocess.DEVNULL,
|
|
101
|
+
stderr=sys.stderr if wait_to_complete else subprocess.DEVNULL,
|
|
102
|
+
**kwargs,
|
|
103
|
+
)
|
|
104
|
+
if wait_to_complete:
|
|
105
|
+
process.wait()
|
|
106
|
+
return True
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _clearcut_flush(file_path: str) -> None:
|
|
110
|
+
with open(file_path, mode="r", encoding="utf-8") as file:
|
|
111
|
+
kwargs = json.load(file)
|
|
112
|
+
requests.request(**kwargs)
|
|
113
|
+
os.remove(file_path)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class MetricsEventMetadataKey(Enum):
|
|
117
|
+
SESSION_ID = "XPK_SESSION_ID"
|
|
118
|
+
DRY_RUN = "XPK_DRY_RUN"
|
|
119
|
+
PYTHON_VERSION = "XPK_PYTHON_VERSION"
|
|
120
|
+
ZONE = "XPK_ZONE"
|
|
121
|
+
SYSTEM_CHARACTERISTICS = "XPK_SYSTEM_CHARACTERISTICS"
|
|
122
|
+
PROVISIONING_MODE = "XPK_PROVISIONING_MODE"
|
|
123
|
+
COMMAND = "XPK_COMMAND"
|
|
124
|
+
EXIT_CODE = "XPK_EXIT_CODE"
|
|
125
|
+
RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
|
|
126
|
+
RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
|
|
130
|
+
class _MetricsEvent:
|
|
131
|
+
time: float
|
|
132
|
+
type: str
|
|
133
|
+
name: str
|
|
134
|
+
metadata: dict[MetricsEventMetadataKey, str]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class _MetricsCollector:
|
|
138
|
+
"""Metrics collector for collecting various metrics and events across application."""
|
|
139
|
+
|
|
140
|
+
_events: list[_MetricsEvent] = []
|
|
141
|
+
|
|
142
|
+
def log_start(self, command: str) -> None:
|
|
143
|
+
"""Logs start event."""
|
|
144
|
+
self._events.append(
|
|
145
|
+
_MetricsEvent(
|
|
146
|
+
time=time.time(),
|
|
147
|
+
type="commands",
|
|
148
|
+
name="start",
|
|
149
|
+
metadata={MetricsEventMetadataKey.COMMAND: command},
|
|
150
|
+
)
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
def log_complete(self, exit_code: int) -> None:
|
|
154
|
+
"""Logs complete event."""
|
|
155
|
+
self._events.append(
|
|
156
|
+
_MetricsEvent(
|
|
157
|
+
time=time.time(),
|
|
158
|
+
type="commands",
|
|
159
|
+
name="complete",
|
|
160
|
+
metadata={MetricsEventMetadataKey.EXIT_CODE: str(exit_code)},
|
|
161
|
+
)
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
def log_custom(
|
|
165
|
+
self,
|
|
166
|
+
name: str,
|
|
167
|
+
metadata: dict[MetricsEventMetadataKey, str] | None = None,
|
|
168
|
+
) -> None:
|
|
169
|
+
"""Logs custom event."""
|
|
170
|
+
self._events.append(
|
|
171
|
+
_MetricsEvent(
|
|
172
|
+
time=time.time(),
|
|
173
|
+
type="custom",
|
|
174
|
+
name=name,
|
|
175
|
+
metadata=metadata if metadata is not None else {},
|
|
176
|
+
)
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
def flush(self) -> str:
|
|
180
|
+
"""Flushes collected events into concord payload."""
|
|
181
|
+
result = _generate_payload(self._events)
|
|
182
|
+
self._events.clear()
|
|
183
|
+
return result
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
MetricsCollector = _MetricsCollector()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _generate_payload(events: list[_MetricsEvent]) -> str:
|
|
190
|
+
base_concord_event = _get_base_concord_event()
|
|
191
|
+
base_event_metadata = _get_base_event_metadata()
|
|
192
|
+
serialized_events = []
|
|
193
|
+
for event in events:
|
|
194
|
+
metadata = {
|
|
195
|
+
**base_event_metadata,
|
|
196
|
+
**event.metadata,
|
|
197
|
+
}
|
|
198
|
+
serialized_events.append({
|
|
199
|
+
"event_time_ms": int(event.time * 1000),
|
|
200
|
+
"source_extension_json": json.dumps({
|
|
201
|
+
**base_concord_event,
|
|
202
|
+
"event_type": event.type,
|
|
203
|
+
"event_name": event.name,
|
|
204
|
+
"event_metadata": [
|
|
205
|
+
{"key": key.value, "value": value}
|
|
206
|
+
for key, value in metadata.items()
|
|
207
|
+
],
|
|
208
|
+
}),
|
|
209
|
+
})
|
|
210
|
+
|
|
211
|
+
return json.dumps({
|
|
212
|
+
"client_info": {"client_type": "XPK"},
|
|
213
|
+
"log_source_name": "CONCORD",
|
|
214
|
+
"request_time_ms": int(time.time() * 1000),
|
|
215
|
+
"log_event": serialized_events,
|
|
216
|
+
})
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
|
|
220
|
+
return {
|
|
221
|
+
MetricsEventMetadataKey.SESSION_ID: _get_session_id(),
|
|
222
|
+
MetricsEventMetadataKey.DRY_RUN: str(is_dry_run()).lower(),
|
|
223
|
+
MetricsEventMetadataKey.PYTHON_VERSION: platform.python_version(),
|
|
224
|
+
MetricsEventMetadataKey.RUNNING_AS_PIP: str(_is_running_as_pip()).lower(),
|
|
225
|
+
MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
|
|
226
|
+
_is_running_from_source()
|
|
227
|
+
).lower(),
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _get_base_concord_event() -> dict[str, str]:
|
|
232
|
+
return {
|
|
233
|
+
"release_version": xpk_version,
|
|
234
|
+
"console_type": "XPK",
|
|
235
|
+
"client_install_id": _ensure_client_id(),
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _is_running_as_pip() -> bool:
|
|
240
|
+
return os.path.basename(sys.argv[0]) == "xpk"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _is_running_from_source() -> bool:
|
|
244
|
+
current_path = os.path.abspath(os.path.realpath(__file__))
|
|
245
|
+
return (
|
|
246
|
+
"site-packages" not in current_path
|
|
247
|
+
and "dist-packages" not in current_path
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _get_session_id() -> str:
|
|
252
|
+
return str(uuid.uuid4())
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _ensure_client_id() -> str:
|
|
256
|
+
"""Generates Client ID and stores in configuration if not already present."""
|
|
257
|
+
current_client_id = get_config().get(CLIENT_ID_KEY)
|
|
258
|
+
if current_client_id is not None:
|
|
259
|
+
return current_client_id
|
|
260
|
+
|
|
261
|
+
new_client_id = str(uuid.uuid4())
|
|
262
|
+
get_config().set(CLIENT_ID_KEY, new_client_id)
|
|
263
|
+
return new_client_id
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
import json
|
|
19
|
+
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
|
|
20
|
+
from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
|
|
21
|
+
from ..utils.execution_context import set_dry_run
|
|
22
|
+
from ..utils.feature_flags import FeatureFlags
|
|
23
|
+
from pytest_mock import MockerFixture
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture(autouse=True)
|
|
27
|
+
def setup_mocks(mocker: MockerFixture):
|
|
28
|
+
mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
|
|
29
|
+
mocker.patch('time.time', return_value=0)
|
|
30
|
+
mocker.patch('platform.python_version', return_value='99.99.99')
|
|
31
|
+
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
32
|
+
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
33
|
+
set_dry_run(False)
|
|
34
|
+
get_config().set(CLIENT_ID_KEY, 'client_id')
|
|
35
|
+
yield
|
|
36
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize(
|
|
40
|
+
argnames='feature_flag,config_value,expected',
|
|
41
|
+
argvalues=[
|
|
42
|
+
(True, 'true', True),
|
|
43
|
+
(False, 'true', False),
|
|
44
|
+
(True, None, True),
|
|
45
|
+
(True, 'false', False),
|
|
46
|
+
],
|
|
47
|
+
)
|
|
48
|
+
def test_should_send_telemetry_returns_correct_value(
|
|
49
|
+
feature_flag: bool, config_value: str, expected: bool
|
|
50
|
+
):
|
|
51
|
+
get_config().set(SEND_TELEMETRY_KEY, config_value)
|
|
52
|
+
FeatureFlags.TELEMETRY_ENABLED = feature_flag
|
|
53
|
+
assert should_send_telemetry() is expected
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_metrics_collector_generates_client_id_if_not_present():
|
|
57
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
58
|
+
MetricsCollector.log_start(command='test')
|
|
59
|
+
payload = json.loads(MetricsCollector.flush())
|
|
60
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
61
|
+
assert extension_json['client_install_id'] is not None
|
|
62
|
+
assert len(extension_json['client_install_id']) > 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_metrics_collector_logs_start_event_correctly():
|
|
66
|
+
MetricsCollector.log_start(command='test')
|
|
67
|
+
payload = json.loads(MetricsCollector.flush())
|
|
68
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
69
|
+
assert extension_json == {
|
|
70
|
+
'client_install_id': 'client_id',
|
|
71
|
+
'console_type': 'XPK',
|
|
72
|
+
'event_metadata': [
|
|
73
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
74
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
75
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
76
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
77
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
78
|
+
{'key': 'XPK_COMMAND', 'value': 'test'},
|
|
79
|
+
],
|
|
80
|
+
'event_name': 'start',
|
|
81
|
+
'event_type': 'commands',
|
|
82
|
+
'release_version': 'v0.0.0',
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_metrics_collector_generates_client_id_when_not_present():
|
|
87
|
+
get_config().set(CLIENT_ID_KEY, None)
|
|
88
|
+
MetricsCollector.log_start(command='test')
|
|
89
|
+
payload = json.loads(MetricsCollector.flush())
|
|
90
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
91
|
+
assert extension_json['client_install_id'] is not None
|
|
92
|
+
assert len(extension_json['client_install_id']) > 0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_metrics_collector_logs_complete_event_correctly():
|
|
96
|
+
MetricsCollector.log_complete(exit_code=2)
|
|
97
|
+
payload = json.loads(MetricsCollector.flush())
|
|
98
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
99
|
+
assert extension_json == {
|
|
100
|
+
'client_install_id': 'client_id',
|
|
101
|
+
'console_type': 'XPK',
|
|
102
|
+
'event_metadata': [
|
|
103
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
104
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
105
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
106
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
107
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
108
|
+
{'key': 'XPK_EXIT_CODE', 'value': '2'},
|
|
109
|
+
],
|
|
110
|
+
'event_name': 'complete',
|
|
111
|
+
'event_type': 'commands',
|
|
112
|
+
'release_version': 'v0.0.0',
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_metrics_collector_logs_custom_event_correctly():
|
|
117
|
+
MetricsCollector.log_custom(
|
|
118
|
+
name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
|
|
119
|
+
)
|
|
120
|
+
payload = json.loads(MetricsCollector.flush())
|
|
121
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
122
|
+
assert extension_json == {
|
|
123
|
+
'client_install_id': 'client_id',
|
|
124
|
+
'console_type': 'XPK',
|
|
125
|
+
'event_metadata': [
|
|
126
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
127
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
128
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
129
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
130
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
131
|
+
{'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
|
|
132
|
+
],
|
|
133
|
+
'event_name': 'test',
|
|
134
|
+
'event_type': 'custom',
|
|
135
|
+
'release_version': 'v0.0.0',
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_metrics_collector_logs_correct_envelope():
|
|
140
|
+
MetricsCollector.log_start(command='test')
|
|
141
|
+
MetricsCollector.log_custom(
|
|
142
|
+
name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
|
|
143
|
+
)
|
|
144
|
+
MetricsCollector.log_complete(exit_code=2)
|
|
145
|
+
payload = json.loads(MetricsCollector.flush())
|
|
146
|
+
assert payload['client_info'] == {'client_type': 'XPK'}
|
|
147
|
+
assert payload['log_source_name'] == 'CONCORD'
|
|
148
|
+
assert payload['request_time_ms'] == 0
|
|
149
|
+
assert len(payload['log_event']) == 3
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_metrics_collector_does_not_flush_event_twice():
|
|
153
|
+
MetricsCollector.log_start(command='test')
|
|
154
|
+
MetricsCollector.flush()
|
|
155
|
+
MetricsCollector.log_start(command='version')
|
|
156
|
+
payload = json.loads(MetricsCollector.flush())
|
|
157
|
+
assert len(payload['log_event']) == 1
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@pytest.mark.parametrize(
|
|
161
|
+
argnames='dry_run,expected', argvalues=[(False, 'false'), (True, 'true')]
|
|
162
|
+
)
|
|
163
|
+
def test_metrics_collector_logs_correct_dry_run_value(
|
|
164
|
+
dry_run: bool, expected: str
|
|
165
|
+
):
|
|
166
|
+
set_dry_run(dry_run)
|
|
167
|
+
MetricsCollector.log_start(command='test')
|
|
168
|
+
payload = MetricsCollector.flush()
|
|
169
|
+
assert _get_metadata_value(payload, 'XPK_DRY_RUN') == expected
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@pytest.mark.parametrize(
|
|
173
|
+
argnames='basename,expected',
|
|
174
|
+
argvalues=[
|
|
175
|
+
('xpk', 'true'),
|
|
176
|
+
('xpk.py', 'false'),
|
|
177
|
+
],
|
|
178
|
+
)
|
|
179
|
+
def test_metrics_collectors_logs_correct_running_as_pip_value(
|
|
180
|
+
basename: str, expected: str, mocker: MockerFixture
|
|
181
|
+
):
|
|
182
|
+
mocker.patch('os.path.basename', return_value=basename)
|
|
183
|
+
MetricsCollector.log_start(command='test')
|
|
184
|
+
payload = MetricsCollector.flush()
|
|
185
|
+
assert _get_metadata_value(payload, 'XPK_RUNNING_AS_PIP') == expected
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@pytest.mark.parametrize(
|
|
189
|
+
argnames='abspath,expected',
|
|
190
|
+
argvalues=[
|
|
191
|
+
('/site-packages/', 'false'),
|
|
192
|
+
('/dist-packages/', 'false'),
|
|
193
|
+
('/home/xpk_user', 'true'),
|
|
194
|
+
],
|
|
195
|
+
)
|
|
196
|
+
def test_metrics_collectors_logs_correct_running_from_source_value(
|
|
197
|
+
abspath: str, expected: str, mocker: MockerFixture
|
|
198
|
+
):
|
|
199
|
+
mocker.patch('os.path.abspath', return_value=abspath)
|
|
200
|
+
MetricsCollector.log_start(command='test')
|
|
201
|
+
payload = MetricsCollector.flush()
|
|
202
|
+
assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _get_metadata_value(payload_str: str, key: str) -> str | None:
|
|
206
|
+
payload = json.loads(payload_str)
|
|
207
|
+
metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
|
|
208
|
+
'event_metadata'
|
|
209
|
+
]
|
|
210
|
+
matching = (item['value'] for item in metadata if item['key'] == key)
|
|
211
|
+
return next(matching, None)
|
xpk/core/vertex.py
CHANGED
|
@@ -15,7 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..utils.console import xpk_print
|
|
18
|
-
from .resources import
|
|
18
|
+
from .resources import ConfigMapType, get_cluster_configmap
|
|
19
19
|
|
|
20
20
|
DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance'
|
|
21
21
|
|
|
@@ -65,8 +65,9 @@ def create_vertex_experiment(args) -> dict | None:
|
|
|
65
65
|
tensorboard,
|
|
66
66
|
)
|
|
67
67
|
|
|
68
|
-
|
|
69
|
-
|
|
68
|
+
cluster_config_map = get_cluster_configmap(
|
|
69
|
+
args.cluster, ConfigMapType.METADATA
|
|
70
|
+
)
|
|
70
71
|
|
|
71
72
|
if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
|
|
72
73
|
xpk_print(
|
|
@@ -39,12 +39,16 @@ def decorate_job(job_manifest: dict) -> dict:
|
|
|
39
39
|
return job_manifest
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
def decorate_jobset(
|
|
42
|
+
def decorate_jobset( # pylint: disable=dangerous-default-value
|
|
43
|
+
jobset_manifest_str: str,
|
|
44
|
+
sub_networks: list[str] = [], # pylint: disable=unused-argument
|
|
45
|
+
) -> str:
|
|
43
46
|
"""
|
|
44
47
|
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
|
|
45
48
|
|
|
46
49
|
Args:
|
|
47
50
|
jobset_manifest_str: The JobSet manifest as a YAML string.
|
|
51
|
+
sub_networks: This parameter is accepted for interface consistency but is not used.
|
|
48
52
|
|
|
49
53
|
Returns:
|
|
50
54
|
The modified JobSet manifest as a YAML string.
|