xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics, AcceleratorType
+import pytest
+from .system_characteristics import (
+    get_tpu_system_characteristics_map,
+    generate_tpu_topologies,
+    DockerPlatform,
+    SystemCharacteristics,
+    AcceleratorType,
+    GpuConfig,
+)
 
 
 def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
@@ -25,7 +33,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       machine_type="test",
       supported_topologies=["1x1"],
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      docker_platform=DockerPlatform.AMD,
+      tpu_type_requires_workload_policy=False,
   )
 
   expected_system_characteristics = SystemCharacteristics(
@@ -37,7 +46,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-1",
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      docker_platform=DockerPlatform.AMD,
+      requires_workload_policy=False,
   )
   assert result == {
       "test-1": expected_system_characteristics,
@@ -53,7 +63,8 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       machine_type="test",
       supported_topologies=["2x2"],
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      docker_platform=DockerPlatform.AMD,
+      tpu_type_requires_workload_policy=True,
   )
 
   expected_system_characteristics = SystemCharacteristics(
@@ -65,9 +76,121 @@
       accelerator_type=AcceleratorType.TPU,
       device_type="test-8",
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      docker_platform=DockerPlatform.AMD,
+      requires_workload_policy=False,
   )
   assert result == {
       "test-8": expected_system_characteristics,
       "test-2x2": expected_system_characteristics,
   }
+
+
+def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_topology():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["2x2x2"],
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
+      tpu_type_requires_workload_policy=True,
+  )
+
+  expected_system_characteristics = SystemCharacteristics(
+      topology="2x2x2",
+      vms_per_slice=2,
+      gke_accelerator="test",
+      gce_machine_type="test",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="test-16",
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
+      requires_workload_policy=True,
+  )
+  assert result == {
+      "test-16": expected_system_characteristics,
+      "test-2x2x2": expected_system_characteristics,
+  }
+
+
+def test_get_tpu_system_characteristics_map_prefers_default_topologies():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
+      default_topologies=set(["4x8x16"]),
+  )
+
+  assert result["test-128"].topology == "4x4x4"
+  assert result["test-1024"].topology == "4x8x16"
+
+
+def test_generate_tpu_topologies_returns_correct_number_of_values_for_TPU_platforms():
+  v4 = generate_tpu_topologies(max_cubes=64, enforce_nondecreasing=False)
+  v5p = generate_tpu_topologies(max_cubes=140)
+  tpu7x = generate_tpu_topologies(max_cubes=144)
+
+  assert len(v4) == 800
+  assert len(v5p) == 414
+  assert len(tpu7x) == 432
+
+
+def test_generate_tpu_topologies_respects_constraints():
+  ordered_6_cubes = generate_tpu_topologies(
+      max_cubes=6, enforce_nondecreasing=True
+  )
+  non_ordered_6_cubes = generate_tpu_topologies(
+      max_cubes=6, enforce_nondecreasing=False
+  )
+
+  assert "8x4x4" not in ordered_6_cubes
+  assert "8x4x4" in non_ordered_6_cubes
+  assert "4x8x12" in ordered_6_cubes  # exactly 6 cubes
+  assert "4x8x12" in non_ordered_6_cubes  # exactly 6 cubes
+  assert "4x8x16" not in ordered_6_cubes  # too many cubes (8)
+  assert "4x8x16" not in non_ordered_6_cubes  # too many cubes (8)
+
+
+def test_generate_tpu_topologies_contains_sub_cube_slices():
+  one_cube = generate_tpu_topologies(max_cubes=1)
+
+  assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]
+
+
+def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
+  """Tests that __post_init__ correctly sets requires_workload_policy for GPUs."""
+  gpu_system = SystemCharacteristics(
+      topology="N/A",
+      vms_per_slice=1,
+      gke_accelerator="nvidia-l4",
+      gce_machine_type="g2-standard-12",
+      chips_per_vm=1,
+      accelerator_type=AcceleratorType.GPU,
+      device_type="l4-1",
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
+      gpu_config=GpuConfig(requires_topology=False),
+  )
+  assert gpu_system.requires_workload_policy is True
+
+
+def test_system_characteristics_post_init_throws_for_gpu_without_config():
+  """Tests that __post_init__ raises ValueError for GPU without gpu_config."""
+  with pytest.raises(ValueError, match="'gpu_config' was not provided"):
+    SystemCharacteristics(
+        topology="N/A",
+        vms_per_slice=1,
+        gke_accelerator="nvidia-l4",
+        gce_machine_type="g2-standard-12",
+        chips_per_vm=1,
+        accelerator_type=AcceleratorType.GPU,
+        device_type="l4-1",
+        supports_sub_slicing=False,
+        docker_platform=DockerPlatform.AMD,
+    )
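The added tests above pin down the arithmetic that links a TPU topology string to the derived device type, VM count, and cube count. The helper below is an illustrative sketch only, not code from this diff; the function name and the 64-chip (4x4x4) cube size are assumptions inferred from the test data and the "exactly 6 cubes" comments.

# Illustrative sketch (not from the xpk source): reproduces the arithmetic
# implied by the new test cases above.
from math import prod


def describe_topology(
    prefix: str,
    topology: str,
    tensorcores_per_chip: int,
    chips_per_vm: int,
) -> tuple[str, int, float]:
  """Returns (device_type, vms_per_slice, cubes) for a topology like '2x2x2'."""
  dims = [int(d) for d in topology.split("x")]
  chips = prod(dims)                                        # 2x2x2 -> 8 chips
  device_type = f"{prefix}-{chips * tensorcores_per_chip}"  # -> "test-16"
  vms_per_slice = chips // chips_per_vm                     # 8 chips / 4 per VM -> 2
  cubes = chips / 64                                        # assuming a 4x4x4 cube of 64 chips
  return device_type, vms_per_slice, cubes


assert describe_topology("test", "2x2x2", 2, 4) == ("test-16", 2, 0.125)
assert describe_topology("test", "4x8x16", 2, 4)[0] == "test-1024"
assert describe_topology("test", "4x8x12", 2, 4)[2] == 6.0  # "exactly 6 cubes"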
xpk/core/telemetry.py ADDED
@@ -0,0 +1,263 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import platform
+import uuid
+import json
+import os
+import time
+import sys
+import importlib
+import subprocess
+import tempfile
+import requests
+from enum import Enum
+from typing import Any
+from dataclasses import dataclass
+from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
+from ..utils.execution_context import is_dry_run
+from ..utils.user_agent import get_user_agent
+from ..utils.feature_flags import FeatureFlags
+
+
+def should_send_telemetry():
+  return (
+      FeatureFlags.TELEMETRY_ENABLED
+      and get_config().get(SEND_TELEMETRY_KEY) != "false"
+  )
+
+
+def send_clearcut_payload(data: str, wait_to_complete: bool = False) -> None:
+  """Sends payload to clearcut endpoint."""
+  try:
+    file_path = _store_payload_in_temp_file(data)
+    if not _schedule_clearcut_background_flush(file_path, wait_to_complete):
+      _clearcut_flush(file_path)
+  except Exception:  # pylint: disable=broad-exception-caught
+    pass
+
+
+def _store_payload_in_temp_file(data: str) -> str:
+  with tempfile.NamedTemporaryFile(
+      mode="w", delete=False, encoding="utf-8"
+  ) as file:
+    json.dump(
+        {
+            "data": data,
+            "url": "https://play.googleapis.com/log",
+            "params": {"format": "json_proto"},
+            "headers": {"User-Agent": get_user_agent()},
+            "method": "POST",
+        },
+        file,
+    )
+  return file.name
+
+
+def _schedule_clearcut_background_flush(
+    file_path: str, wait_to_complete: bool
+) -> bool:
+  """Schedules clearcut background flush.
+
+  Args:
+    file_path: path to the temporary file where the events are stored.
+    wait_to_complete: whenever to wait for the background script completion.
+
+  Returns:
+    True if successful and False otherwise
+  """
+  with importlib.resources.path("xpk", "telemetry_uploader.py") as path:
+    if not os.path.exists(path):
+      return False
+
+    kwargs: dict[str, Any] = {}
+    if sys.platform == "win32":
+      kwargs["creationflags"] = (
+          subprocess.DETACHED_PROCESS | subprocess.CREATE_NO_WINDOW
+      )
+    else:
+      kwargs["start_new_session"] = True
+
+    process = subprocess.Popen(
+        args=[
+            sys.executable,
+            str(path),
+            file_path,
+        ],
+        stdout=sys.stdout if wait_to_complete else subprocess.DEVNULL,
+        stderr=sys.stderr if wait_to_complete else subprocess.DEVNULL,
+        **kwargs,
+    )
+    if wait_to_complete:
+      process.wait()
+    return True
+
+
+def _clearcut_flush(file_path: str) -> None:
+  with open(file_path, mode="r", encoding="utf-8") as file:
+    kwargs = json.load(file)
+  requests.request(**kwargs)
+  os.remove(file_path)
+
+
+class MetricsEventMetadataKey(Enum):
+  SESSION_ID = "XPK_SESSION_ID"
+  DRY_RUN = "XPK_DRY_RUN"
+  PYTHON_VERSION = "XPK_PYTHON_VERSION"
+  ZONE = "XPK_ZONE"
+  SYSTEM_CHARACTERISTICS = "XPK_SYSTEM_CHARACTERISTICS"
+  PROVISIONING_MODE = "XPK_PROVISIONING_MODE"
+  COMMAND = "XPK_COMMAND"
+  EXIT_CODE = "XPK_EXIT_CODE"
+  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
+  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
+
+
+@dataclass
+class _MetricsEvent:
+  time: float
+  type: str
+  name: str
+  metadata: dict[MetricsEventMetadataKey, str]
+
+
+class _MetricsCollector:
+  """Metrics collector for collecting various metrics and events across application."""
+
+  _events: list[_MetricsEvent] = []
+
+  def log_start(self, command: str) -> None:
+    """Logs start event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="commands",
+            name="start",
+            metadata={MetricsEventMetadataKey.COMMAND: command},
+        )
+    )
+
+  def log_complete(self, exit_code: int) -> None:
+    """Logs complete event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="commands",
+            name="complete",
+            metadata={MetricsEventMetadataKey.EXIT_CODE: str(exit_code)},
+        )
+    )
+
+  def log_custom(
+      self,
+      name: str,
+      metadata: dict[MetricsEventMetadataKey, str] | None = None,
+  ) -> None:
+    """Logs custom event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="custom",
+            name=name,
+            metadata=metadata if metadata is not None else {},
+        )
+    )
+
+  def flush(self) -> str:
+    """Flushes collected events into concord payload."""
+    result = _generate_payload(self._events)
+    self._events.clear()
+    return result
+
+
+MetricsCollector = _MetricsCollector()
+
+
+def _generate_payload(events: list[_MetricsEvent]) -> str:
+  base_concord_event = _get_base_concord_event()
+  base_event_metadata = _get_base_event_metadata()
+  serialized_events = []
+  for event in events:
+    metadata = {
+        **base_event_metadata,
+        **event.metadata,
+    }
+    serialized_events.append({
+        "event_time_ms": int(event.time * 1000),
+        "source_extension_json": json.dumps({
+            **base_concord_event,
+            "event_type": event.type,
+            "event_name": event.name,
+            "event_metadata": [
+                {"key": key.value, "value": value}
+                for key, value in metadata.items()
+            ],
+        }),
+    })
+
+  return json.dumps({
+      "client_info": {"client_type": "XPK"},
+      "log_source_name": "CONCORD",
+      "request_time_ms": int(time.time() * 1000),
+      "log_event": serialized_events,
+  })
+
+
+def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
+  return {
+      MetricsEventMetadataKey.SESSION_ID: _get_session_id(),
+      MetricsEventMetadataKey.DRY_RUN: str(is_dry_run()).lower(),
+      MetricsEventMetadataKey.PYTHON_VERSION: platform.python_version(),
+      MetricsEventMetadataKey.RUNNING_AS_PIP: str(_is_running_as_pip()).lower(),
+      MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
+          _is_running_from_source()
+      ).lower(),
+  }
+
+
+def _get_base_concord_event() -> dict[str, str]:
+  return {
+      "release_version": xpk_version,
+      "console_type": "XPK",
+      "client_install_id": _ensure_client_id(),
+  }
+
+
+def _is_running_as_pip() -> bool:
+  return os.path.basename(sys.argv[0]) == "xpk"
+
+
+def _is_running_from_source() -> bool:
+  current_path = os.path.abspath(os.path.realpath(__file__))
+  return (
+      "site-packages" not in current_path
+      and "dist-packages" not in current_path
+  )
+
+
+def _get_session_id() -> str:
+  return str(uuid.uuid4())
+
+
+def _ensure_client_id() -> str:
+  """Generates Client ID and stores in configuration if not already present."""
+  current_client_id = get_config().get(CLIENT_ID_KEY)
+  if current_client_id is not None:
+    return current_client_id
+
+  new_client_id = str(uuid.uuid4())
+  get_config().set(CLIENT_ID_KEY, new_client_id)
+  return new_client_id
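The module above buffers events in MetricsCollector, serializes them into a Clearcut/Concord payload via flush(), and ships the payload through a detached uploader process in send_clearcut_payload(), silently dropping any failures. The wrapper below is a minimal sketch of how a CLI entry point could tie these pieces together; run_with_telemetry and the handler callable are hypothetical, and this is not necessarily how xpk/main.py wires it.

# Hypothetical wrapper around a command handler; not taken from xpk/main.py.
from typing import Callable

from xpk.core.telemetry import (
    MetricsCollector,
    send_clearcut_payload,
    should_send_telemetry,
)


def run_with_telemetry(command: str, handler: Callable[[], int]) -> int:
  """Runs a command handler and reports start/complete telemetry events."""
  MetricsCollector.log_start(command=command)
  exit_code = handler()
  MetricsCollector.log_complete(exit_code=exit_code)
  if should_send_telemetry():
    # flush() drains the buffered events into a single JSON payload;
    # send_clearcut_payload() schedules the background upload and swallows errors.
    send_clearcut_payload(MetricsCollector.flush())
  return exit_code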
xpk/core/telemetry_test.py ADDED
@@ -0,0 +1,211 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+import json
+from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
+from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
+from ..utils.execution_context import set_dry_run
+from ..utils.feature_flags import FeatureFlags
+from pytest_mock import MockerFixture
+
+
+@pytest.fixture(autouse=True)
+def setup_mocks(mocker: MockerFixture):
+  mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
+  mocker.patch('time.time', return_value=0)
+  mocker.patch('platform.python_version', return_value='99.99.99')
+  mocker.patch('os.path.basename', return_value='xpk.py')
+  mocker.patch('os.path.abspath', return_value='/home/xpk_user')
+  set_dry_run(False)
+  get_config().set(CLIENT_ID_KEY, 'client_id')
+  yield
+  get_config().set(CLIENT_ID_KEY, None)
+
+
+@pytest.mark.parametrize(
+    argnames='feature_flag,config_value,expected',
+    argvalues=[
+        (True, 'true', True),
+        (False, 'true', False),
+        (True, None, True),
+        (True, 'false', False),
+    ],
+)
+def test_should_send_telemetry_returns_correct_value(
+    feature_flag: bool, config_value: str, expected: bool
+):
+  get_config().set(SEND_TELEMETRY_KEY, config_value)
+  FeatureFlags.TELEMETRY_ENABLED = feature_flag
+  assert should_send_telemetry() is expected
+
+
+def test_metrics_collector_generates_client_id_if_not_present():
+  get_config().set(CLIENT_ID_KEY, None)
+  MetricsCollector.log_start(command='test')
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
+  assert extension_json['client_install_id'] is not None
+  assert len(extension_json['client_install_id']) > 0
+
+
+def test_metrics_collector_logs_start_event_correctly():
+  MetricsCollector.log_start(command='test')
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
+  assert extension_json == {
+      'client_install_id': 'client_id',
+      'console_type': 'XPK',
+      'event_metadata': [
+          {'key': 'XPK_SESSION_ID', 'value': '321231'},
+          {'key': 'XPK_DRY_RUN', 'value': 'false'},
+          {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
+          {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
+          {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+          {'key': 'XPK_COMMAND', 'value': 'test'},
+      ],
+      'event_name': 'start',
+      'event_type': 'commands',
+      'release_version': 'v0.0.0',
+  }
+
+
+def test_metrics_collector_generates_client_id_when_not_present():
+  get_config().set(CLIENT_ID_KEY, None)
+  MetricsCollector.log_start(command='test')
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
+  assert extension_json['client_install_id'] is not None
+  assert len(extension_json['client_install_id']) > 0
+
+
+def test_metrics_collector_logs_complete_event_correctly():
+  MetricsCollector.log_complete(exit_code=2)
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
+  assert extension_json == {
+      'client_install_id': 'client_id',
+      'console_type': 'XPK',
+      'event_metadata': [
+          {'key': 'XPK_SESSION_ID', 'value': '321231'},
+          {'key': 'XPK_DRY_RUN', 'value': 'false'},
+          {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
+          {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
+          {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+          {'key': 'XPK_EXIT_CODE', 'value': '2'},
+      ],
+      'event_name': 'complete',
+      'event_type': 'commands',
+      'release_version': 'v0.0.0',
+  }
+
+
+def test_metrics_collector_logs_custom_event_correctly():
+  MetricsCollector.log_custom(
+      name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
+  )
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
+  assert extension_json == {
+      'client_install_id': 'client_id',
+      'console_type': 'XPK',
+      'event_metadata': [
+          {'key': 'XPK_SESSION_ID', 'value': '321231'},
+          {'key': 'XPK_DRY_RUN', 'value': 'false'},
+          {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
+          {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
+          {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+          {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
+      ],
+      'event_name': 'test',
+      'event_type': 'custom',
+      'release_version': 'v0.0.0',
+  }
+
+
+def test_metrics_collector_logs_correct_envelope():
+  MetricsCollector.log_start(command='test')
+  MetricsCollector.log_custom(
+      name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
+  )
+  MetricsCollector.log_complete(exit_code=2)
+  payload = json.loads(MetricsCollector.flush())
+  assert payload['client_info'] == {'client_type': 'XPK'}
+  assert payload['log_source_name'] == 'CONCORD'
+  assert payload['request_time_ms'] == 0
+  assert len(payload['log_event']) == 3
+
+
+def test_metrics_collector_does_not_flush_event_twice():
+  MetricsCollector.log_start(command='test')
+  MetricsCollector.flush()
+  MetricsCollector.log_start(command='version')
+  payload = json.loads(MetricsCollector.flush())
+  assert len(payload['log_event']) == 1
+
+
+@pytest.mark.parametrize(
+    argnames='dry_run,expected', argvalues=[(False, 'false'), (True, 'true')]
+)
+def test_metrics_collector_logs_correct_dry_run_value(
+    dry_run: bool, expected: str
+):
+  set_dry_run(dry_run)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_DRY_RUN') == expected
+
+
+@pytest.mark.parametrize(
+    argnames='basename,expected',
+    argvalues=[
+        ('xpk', 'true'),
+        ('xpk.py', 'false'),
+    ],
+)
+def test_metrics_collectors_logs_correct_running_as_pip_value(
+    basename: str, expected: str, mocker: MockerFixture
+):
+  mocker.patch('os.path.basename', return_value=basename)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_RUNNING_AS_PIP') == expected
+
+
+@pytest.mark.parametrize(
+    argnames='abspath,expected',
+    argvalues=[
+        ('/site-packages/', 'false'),
+        ('/dist-packages/', 'false'),
+        ('/home/xpk_user', 'true'),
+    ],
+)
+def test_metrics_collectors_logs_correct_running_from_source_value(
+    abspath: str, expected: str, mocker: MockerFixture
+):
+  mocker.patch('os.path.abspath', return_value=abspath)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
+
+
+def _get_metadata_value(payload_str: str, key: str) -> str | None:
+  payload = json.loads(payload_str)
+  metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
+      'event_metadata'
+  ]
+  matching = (item['value'] for item in metadata if item['key'] == key)
+  return next(matching, None)
xpk/core/vertex.py CHANGED
@@ -15,7 +15,7 @@ limitations under the License.
 """
 
 from ..utils.console import xpk_print
-from .resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
+from .resources import ConfigMapType, get_cluster_configmap
 
 DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance'
 
@@ -65,8 +65,9 @@ def create_vertex_experiment(args) -> dict | None:
       tensorboard,
   )
 
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
 
   if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
     xpk_print(
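The vertex.py hunk above is part of a wider call-site migration: instead of hand-assembling the ConfigMap name from the cluster name and a constant, callers now pass the cluster name plus a ConfigMapType member and let get_cluster_configmap resolve it. A small before/after sketch, using only names visible in this diff; 'my-cluster' and the print call are placeholders:

# Sketch of the call-site migration shown above; not a verified xpk snippet.
from xpk.core.resources import ConfigMapType, get_cluster_configmap

# 0.14.4 style (removed above): callers built the ConfigMap name by hand.
#   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
#   cluster_config_map = get_cluster_configmap(metadata_configmap_name)

# 0.16.0 style (added above): pass the cluster and the ConfigMap kind.
cluster_config_map = get_cluster_configmap('my-cluster', ConfigMapType.METADATA)
if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
  print('Cluster metadata ConfigMap has no tensorboard_name entry.')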
xpk/core/workload_decorators/tcpx_decorator.py CHANGED
@@ -39,12 +39,16 @@ def decorate_job(job_manifest: dict) -> dict:
   return job_manifest
 
 
-def decorate_jobset(jobset_manifest_str: str) -> str:
+def decorate_jobset(  # pylint: disable=dangerous-default-value
+    jobset_manifest_str: str,
+    sub_networks: list[str] = [],  # pylint: disable=unused-argument
+) -> str:
   """
   Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
 
   Args:
     jobset_manifest_str: The JobSet manifest as a YAML string.
+    sub_networks: This parameter is accepted for interface consistency but is not used.
 
   Returns:
     The modified JobSet manifest as a YAML string.
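The sub_networks parameter added above exists so that every workload decorator exposes the same decorate_jobset(manifest, sub_networks) signature, even when a given decorator ignores it. The helper below is a hypothetical sketch of the kind of uniform dispatch this enables; apply_decorator and the registry dict are not code from xpk.

# Hypothetical dispatch over decorators that all share one signature.
from typing import Callable

JobsetDecorator = Callable[[str, list[str]], str]


def apply_decorator(
    decorators: dict[str, JobsetDecorator],
    device_type: str,
    manifest_yaml: str,
    sub_networks: list[str],
) -> str:
  """Applies the decorator registered for a device type, if any."""
  decorate = decorators.get(device_type)
  if decorate is None:
    return manifest_yaml
  # Every decorator accepts (manifest, sub_networks), so the caller does not
  # need to special-case decorators that ignore sub_networks.
  return decorate(manifest_yaml, sub_networks)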