xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/commands/cluster.py +57 -21
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +11 -2
- xpk/commands/cluster_test.py +233 -12
- xpk/commands/config.py +3 -5
- xpk/commands/kind.py +1 -1
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +28 -11
- xpk/commands/workload_test.py +3 -3
- xpk/core/blueprint/blueprint_generator.py +70 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/capacity.py +46 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +37 -57
- xpk/core/cluster_test.py +95 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +9 -2
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +6 -9
- xpk/core/kueue_manager.py +192 -32
- xpk/core/kueue_manager_test.py +132 -4
- xpk/core/nodepool.py +21 -29
- xpk/core/nodepool_test.py +17 -15
- xpk/core/scheduling.py +16 -1
- xpk/core/scheduling_test.py +85 -6
- xpk/core/system_characteristics.py +77 -19
- xpk/core/system_characteristics_test.py +80 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/main.py +31 -13
- xpk/parser/cluster.py +48 -9
- xpk/parser/cluster_test.py +42 -3
- xpk/parser/workload.py +12 -0
- xpk/parser/workload_test.py +4 -4
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +7 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +0 -11
- xpk/utils/versions.py +31 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,8 @@ from dataclasses import dataclass
|
|
|
18
18
|
from ..utils.topology import get_topology_product
|
|
19
19
|
from enum import Enum
|
|
20
20
|
|
|
21
|
+
# Topologies passed as `supported_topologies` for the v6e entry that is
# registered with `supports_sub_slicing=True`.
SUB_SLICING_TOPOLOGIES = ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
|
|
22
|
+
|
|
21
23
|
|
|
22
24
|
class AcceleratorType(Enum):
|
|
23
25
|
TPU = 1
|
|
@@ -131,6 +133,33 @@ def get_system_characteristics_by_device_type(
|
|
|
131
133
|
return None, 1
|
|
132
134
|
|
|
133
135
|
|
|
136
|
+
def generate_tpu_topologies(
    max_cubes: int, enforce_nondecreasing: bool = True
) -> list[str]:
  """Generates a list of unique TPU topologies formatted as strings "AxBxC".

  The list will contain all triplets (A, B, C) such that:
    - A, B and C are integers in range 4..256 (including 4 and 256)
    - A, B and C are divisible by 4
    - (A/4) * (B/4) * (C/4) <= max_cubes
    - if enforce_nondecreasing: A <= B <= C
  Additionally, the list always starts with the following triplets:
    2x2x1, 2x2x2, 2x2x4, 2x4x4

  The generated triplets follow the sub-cube ones in ascending (A, B, C)
  order.

  Args:
    max_cubes: maximum number of cubes supported by a TPU platform
    enforce_nondecreasing: whether to enforce A <= B <= C or not

  Returns:
    The four sub-cube topologies followed by every matching "AxBxC" topology.
  """
  topologies = ['2x2x1', '2x2x2', '2x2x4', '2x4x4']
  # Work in cube units: a side of 4 * k chips spans k cubes, k in 1..64.
  max_side_cubes = 256 // 4
  for a in range(1, max_side_cubes + 1):
    # The cube product only grows with each factor, so once a prefix exceeds
    # the budget no later value on that axis can fit either. Breaking early
    # skips only iterations that would never append, so the output (and its
    # order) is the same as a full scan of the 64^3 grid.
    if a > max_cubes:
      break
    for b in range(a if enforce_nondecreasing else 1, max_side_cubes + 1):
      if a * b > max_cubes:
        break
      for c in range(b if enforce_nondecreasing else 1, max_side_cubes + 1):
        if a * b * c > max_cubes:
          break
        topologies.append(f'{4 * a}x{4 * b}x{4 * c}')
  return topologies
|
|
161
|
+
|
|
162
|
+
|
|
134
163
|
def get_tpu_system_characteristics_map(
|
|
135
164
|
prefix: str,
|
|
136
165
|
tensorcores_per_chip: int,
|
|
@@ -138,13 +167,17 @@ def get_tpu_system_characteristics_map(
|
|
|
138
167
|
machine_type: str,
|
|
139
168
|
supported_topologies: list[str],
|
|
140
169
|
supports_sub_slicing: bool,
|
|
141
|
-
|
|
170
|
+
tpu_type_requires_workload_policy: bool = False,
|
|
171
|
+
default_topologies: set[str] | None = None,
|
|
142
172
|
) -> dict[str, SystemCharacteristics]:
|
|
143
173
|
system_characteristics_map = {}
|
|
174
|
+
if default_topologies is None:
|
|
175
|
+
default_topologies = set()
|
|
144
176
|
for topology in supported_topologies:
|
|
145
177
|
chips_per_vm = compute_chips_per_vm(topology)
|
|
146
178
|
vms_per_slice = compute_vms_per_slice(topology)
|
|
147
179
|
num_tensorcores = compute_num_tensorcores(tensorcores_per_chip, topology)
|
|
180
|
+
device_type = f'{prefix}-{num_tensorcores}'
|
|
148
181
|
system = SystemCharacteristics(
|
|
149
182
|
topology=topology,
|
|
150
183
|
vms_per_slice=vms_per_slice,
|
|
@@ -152,12 +185,17 @@ def get_tpu_system_characteristics_map(
|
|
|
152
185
|
gce_machine_type=machine_type,
|
|
153
186
|
chips_per_vm=chips_per_vm,
|
|
154
187
|
accelerator_type=AcceleratorType.TPU,
|
|
155
|
-
device_type=
|
|
156
|
-
requires_workload_policy=
|
|
188
|
+
device_type=device_type,
|
|
189
|
+
requires_workload_policy=tpu_type_requires_workload_policy
|
|
190
|
+
and vms_per_slice > 1,
|
|
157
191
|
supports_sub_slicing=supports_sub_slicing,
|
|
158
192
|
)
|
|
159
193
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
160
|
-
|
|
194
|
+
if (
|
|
195
|
+
topology in default_topologies
|
|
196
|
+
or device_type not in system_characteristics_map
|
|
197
|
+
):
|
|
198
|
+
system_characteristics_map[device_type] = system
|
|
161
199
|
|
|
162
200
|
return system_characteristics_map
|
|
163
201
|
|
|
@@ -334,7 +372,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
334
372
|
gke_accelerator='tpu7x',
|
|
335
373
|
machine_type='tpu7x-standard-1t',
|
|
336
374
|
supported_topologies=['1x1x1'],
|
|
337
|
-
|
|
375
|
+
tpu_type_requires_workload_policy=True,
|
|
338
376
|
supports_sub_slicing=False,
|
|
339
377
|
),
|
|
340
378
|
**get_tpu_system_characteristics_map(
|
|
@@ -342,9 +380,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
342
380
|
tensorcores_per_chip=2,
|
|
343
381
|
gke_accelerator='tpu7x',
|
|
344
382
|
machine_type='tpu7x-standard-4t',
|
|
345
|
-
|
|
383
|
+
tpu_type_requires_workload_policy=True,
|
|
346
384
|
supports_sub_slicing=False,
|
|
347
|
-
supported_topologies=
|
|
385
|
+
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
386
|
+
default_topologies=set([
|
|
348
387
|
'12x12x12',
|
|
349
388
|
'12x12x16',
|
|
350
389
|
'12x12x20',
|
|
@@ -443,7 +482,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
443
482
|
'8x8x76',
|
|
444
483
|
'8x8x8',
|
|
445
484
|
'8x8x92',
|
|
446
|
-
],
|
|
485
|
+
]),
|
|
447
486
|
),
|
|
448
487
|
**get_tpu_system_characteristics_map(
|
|
449
488
|
prefix='v6e',
|
|
@@ -458,24 +497,27 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
458
497
|
tensorcores_per_chip=1,
|
|
459
498
|
gke_accelerator='tpu-v6e-slice',
|
|
460
499
|
machine_type='ct6e-standard-4t',
|
|
461
|
-
supports_sub_slicing=
|
|
500
|
+
supports_sub_slicing=False,
|
|
462
501
|
supported_topologies=[
|
|
463
502
|
'2x2',
|
|
464
|
-
'2x4',
|
|
465
|
-
'4x4',
|
|
466
|
-
'4x8',
|
|
467
|
-
'8x8',
|
|
468
|
-
'8x16',
|
|
469
|
-
'16x16',
|
|
470
503
|
],
|
|
471
504
|
),
|
|
505
|
+
**get_tpu_system_characteristics_map(
|
|
506
|
+
prefix='v6e',
|
|
507
|
+
tensorcores_per_chip=1,
|
|
508
|
+
gke_accelerator='tpu-v6e-slice',
|
|
509
|
+
machine_type='ct6e-standard-4t',
|
|
510
|
+
supports_sub_slicing=True,
|
|
511
|
+
supported_topologies=SUB_SLICING_TOPOLOGIES,
|
|
512
|
+
),
|
|
472
513
|
**get_tpu_system_characteristics_map(
|
|
473
514
|
prefix='v5p',
|
|
474
515
|
tensorcores_per_chip=2,
|
|
475
516
|
gke_accelerator='tpu-v5p-slice',
|
|
476
517
|
machine_type='ct5p-hightpu-4t',
|
|
477
518
|
supports_sub_slicing=False,
|
|
478
|
-
supported_topologies=
|
|
519
|
+
supported_topologies=generate_tpu_topologies(max_cubes=140),
|
|
520
|
+
default_topologies=set([
|
|
479
521
|
'2x2x1',
|
|
480
522
|
'2x2x2',
|
|
481
523
|
'2x2x4',
|
|
@@ -572,7 +614,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
572
614
|
'16x16x24',
|
|
573
615
|
'12x24x24',
|
|
574
616
|
'16x20x28',
|
|
575
|
-
],
|
|
617
|
+
]),
|
|
576
618
|
),
|
|
577
619
|
**get_tpu_system_characteristics_map(
|
|
578
620
|
prefix='v5litepod',
|
|
@@ -588,7 +630,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
588
630
|
gke_accelerator='tpu-v4-podslice',
|
|
589
631
|
machine_type='ct4p-hightpu-4t',
|
|
590
632
|
supports_sub_slicing=False,
|
|
591
|
-
supported_topologies=
|
|
633
|
+
supported_topologies=generate_tpu_topologies(
|
|
634
|
+
max_cubes=64, enforce_nondecreasing=False
|
|
635
|
+
),
|
|
636
|
+
default_topologies=set([
|
|
592
637
|
'2x2x1',
|
|
593
638
|
'2x2x2',
|
|
594
639
|
'2x2x4',
|
|
@@ -600,7 +645,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
600
645
|
'8x8x12',
|
|
601
646
|
'8x8x16',
|
|
602
647
|
'8x16x16',
|
|
603
|
-
],
|
|
648
|
+
]),
|
|
604
649
|
),
|
|
605
650
|
# CPU system characteristics.
|
|
606
651
|
# Note that chips_per_vm is actually the number of vCPUs in that CPU.
|
|
@@ -750,3 +795,16 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
750
795
|
}
|
|
751
796
|
""" If you modify UserFacingNameToSystemCharacteristics you should also modify
|
|
752
797
|
the corresponding Map in MaxText/accelerator_to_spec_map.py """
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def get_system_characteristics_keys_by_accelerator_type(
    accelerators: list[AcceleratorType] | None = None,
) -> list[str]:
  """Lists UserFacingNameToSystemCharacteristics keys for the given accelerators.

  Args:
    accelerators: accelerator types to keep; when None, every AcceleratorType
      member is accepted.

  Returns:
    Keys whose entry has an accelerator_type contained in `accelerators`, in
    the iteration order of UserFacingNameToSystemCharacteristics.
  """
  wanted = list(AcceleratorType) if accelerators is None else accelerators
  matching_keys = []
  for name, characteristics in UserFacingNameToSystemCharacteristics.items():
    if characteristics.accelerator_type in wanted:
      matching_keys.append(name)
  return matching_keys
|
|
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics, AcceleratorType
|
|
17
|
+
from .system_characteristics import get_tpu_system_characteristics_map, generate_tpu_topologies, SystemCharacteristics, AcceleratorType
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
|
|
@@ -25,7 +25,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
25
25
|
machine_type="test",
|
|
26
26
|
supported_topologies=["1x1"],
|
|
27
27
|
supports_sub_slicing=False,
|
|
28
|
-
|
|
28
|
+
tpu_type_requires_workload_policy=False,
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -37,7 +37,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
37
37
|
accelerator_type=AcceleratorType.TPU,
|
|
38
38
|
device_type="test-1",
|
|
39
39
|
supports_sub_slicing=False,
|
|
40
|
-
requires_workload_policy=
|
|
40
|
+
requires_workload_policy=False,
|
|
41
41
|
)
|
|
42
42
|
assert result == {
|
|
43
43
|
"test-1": expected_system_characteristics,
|
|
@@ -53,7 +53,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
53
53
|
machine_type="test",
|
|
54
54
|
supported_topologies=["2x2"],
|
|
55
55
|
supports_sub_slicing=False,
|
|
56
|
-
|
|
56
|
+
tpu_type_requires_workload_policy=True,
|
|
57
57
|
)
|
|
58
58
|
|
|
59
59
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -65,9 +65,84 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
65
65
|
accelerator_type=AcceleratorType.TPU,
|
|
66
66
|
device_type="test-8",
|
|
67
67
|
supports_sub_slicing=False,
|
|
68
|
-
requires_workload_policy=
|
|
68
|
+
requires_workload_policy=False,
|
|
69
69
|
)
|
|
70
70
|
assert result == {
|
|
71
71
|
"test-8": expected_system_characteristics,
|
|
72
72
|
"test-2x2": expected_system_characteristics,
|
|
73
73
|
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_topology():
  """A 2x2x2 slice is registered under both its device-type and topology keys."""
  expected = SystemCharacteristics(
      topology="2x2x2",
      vms_per_slice=2,
      gke_accelerator="test",
      gce_machine_type="test",
      chips_per_vm=4,
      accelerator_type=AcceleratorType.TPU,
      device_type="test-16",
      supports_sub_slicing=False,
      requires_workload_policy=True,
  )

  actual = get_tpu_system_characteristics_map(
      prefix="test",
      tensorcores_per_chip=2,
      gke_accelerator="test",
      machine_type="test",
      supported_topologies=["2x2x2"],
      supports_sub_slicing=False,
      tpu_type_requires_workload_policy=True,
  )

  assert actual == {
      "test-16": expected,
      "test-2x2x2": expected,
  }
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_get_tpu_system_characteristics_map_prefers_default_topologies():
  """Device-type keys resolve to the default topology when one is declared."""
  characteristics_by_name = get_tpu_system_characteristics_map(
      prefix="test",
      tensorcores_per_chip=2,
      gke_accelerator="test",
      machine_type="test",
      supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
      supports_sub_slicing=False,
      default_topologies={"4x8x16"},
  )

  # Only one topology yields test-128, so it is used as-is.
  assert characteristics_by_name["test-128"].topology == "4x4x4"
  # Several topologies yield test-1024; the declared default must win.
  assert characteristics_by_name["test-1024"].topology == "4x8x16"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_generate_tpu_topologies_returns_correct_number_of_values_for_TPU_platforms():
  """Pins the topology counts for the v4, v5p and tpu7x generator settings."""
  v4_topologies = generate_tpu_topologies(
      max_cubes=64, enforce_nondecreasing=False
  )
  v5p_topologies = generate_tpu_topologies(max_cubes=140)
  tpu7x_topologies = generate_tpu_topologies(max_cubes=144)

  assert len(v4_topologies) == 800
  assert len(v5p_topologies) == 414
  assert len(tpu7x_topologies) == 432
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_generate_tpu_topologies_respects_constraints():
  """Checks the ordering flag and the max_cubes bound for a 6-cube budget."""
  ordered_6_cubes = generate_tpu_topologies(
      max_cubes=6, enforce_nondecreasing=True
  )
  non_ordered_6_cubes = generate_tpu_topologies(
      max_cubes=6, enforce_nondecreasing=False
  )

  # A decreasing triplet is only produced when ordering is not enforced.
  assert "8x4x4" not in ordered_6_cubes
  assert "8x4x4" in non_ordered_6_cubes

  for generated in (ordered_6_cubes, non_ordered_6_cubes):
    assert "4x8x12" in generated  # exactly 6 cubes
  for generated in (ordered_6_cubes, non_ordered_6_cubes):
    assert "4x8x16" not in generated  # too many cubes (8)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_generate_tpu_topologies_contains_sub_cube_slices():
  """A one-cube budget yields the sub-cube slices plus the single full cube."""
  expected = ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]

  assert generate_tpu_topologies(max_cubes=1) == expected
|
xpk/core/telemetry.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import platform
|
|
18
|
+
import uuid
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
import sys
|
|
23
|
+
import importlib
|
|
24
|
+
import subprocess
|
|
25
|
+
import tempfile
|
|
26
|
+
import requests
|
|
27
|
+
from enum import Enum
|
|
28
|
+
from typing import Any
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
|
|
31
|
+
from ..utils.execution_context import is_dry_run
|
|
32
|
+
from ..utils.user_agent import get_user_agent
|
|
33
|
+
from ..utils.feature_flags import FeatureFlags
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def should_send_telemetry():
  """Tells whether telemetry may be sent.

  Telemetry is sent only when the TELEMETRY_ENABLED feature flag is truthy and
  the SEND_TELEMETRY_KEY configuration value is not the string "false".
  """
  telemetry_enabled = FeatureFlags.TELEMETRY_ENABLED
  if not telemetry_enabled:
    return telemetry_enabled
  return xpk_config.get(SEND_TELEMETRY_KEY) != "false"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def send_clearcut_payload(data: str, wait_to_complete: bool = False) -> None:
  """Sends payload to clearcut endpoint.

  The payload is first written to a temporary file. A background uploader is
  then scheduled for it; if scheduling reports failure, the file is flushed
  synchronously instead. Any exception raised along the way is swallowed.

  Args:
    data: serialized payload to send.
    wait_to_complete: forwarded to the background scheduler; whether to wait
      for the background uploader to finish.
  """
  try:
    payload_path = _store_payload_in_temp_file(data)
    scheduled = _schedule_clearcut_background_flush(
        payload_path, wait_to_complete
    )
    if scheduled:
      return
    _clearcut_flush(payload_path)
  except Exception:  # pylint: disable=broad-exception-caught
    return
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _store_payload_in_temp_file(data: str) -> str:
  """Writes the request description for `data` to a temp file; returns its path.

  The file holds a JSON object with the data, target URL, query parameters,
  headers and HTTP method. It is created with delete=False, so it outlives
  this function.
  """
  with tempfile.NamedTemporaryFile(
      mode="w", delete=False, encoding="utf-8"
  ) as payload_file:
    request_spec = {
        "data": data,
        "url": "https://play.googleapis.com/log",
        "params": {"format": "json_proto"},
        "headers": {"User-Agent": get_user_agent()},
        "method": "POST",
    }
    json.dump(request_spec, payload_file)
  return payload_file.name
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _schedule_clearcut_background_flush(
    file_path: str, wait_to_complete: bool
) -> bool:
  """Schedules clearcut background flush.

  Launches the `telemetry_uploader.py` script of the `xpk` package in a
  separate Python process, passing `file_path` as its only argument. The
  process is detached from the current session (or console on Windows). Its
  output is discarded unless `wait_to_complete` is set.

  Args:
    file_path: path to the temporary file where the events are stored.
    wait_to_complete: whether to wait for the background script completion.

  Returns:
    True if the uploader process was started (and, when requested, waited
    for); False if the uploader script could not be located or the process
    could not be launched.
  """
  # `import importlib` alone does not guarantee that the `resources` submodule
  # has been loaded, so import it explicitly before using it.
  import importlib.resources  # pylint: disable=import-outside-toplevel

  try:
    with importlib.resources.path("xpk", "telemetry_uploader.py") as path:
      if not os.path.exists(path):
        return False

      kwargs: dict[str, Any] = {}
      if sys.platform == "win32":
        kwargs["creationflags"] = (
            subprocess.DETACHED_PROCESS | subprocess.CREATE_NO_WINDOW
        )
      else:
        kwargs["start_new_session"] = True

      process = subprocess.Popen(
          args=[
              sys.executable,
              str(path),
              file_path,
          ],
          stdout=sys.stdout if wait_to_complete else subprocess.DEVNULL,
          stderr=sys.stderr if wait_to_complete else subprocess.DEVNULL,
          **kwargs,
      )
      if wait_to_complete:
        process.wait()
      return True
  except (ImportError, OSError):
    # Honour the documented contract: report failure instead of raising, so
    # that the caller can still flush the payload synchronously.
    return False
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _clearcut_flush(file_path: str) -> None:
  """Synchronously sends the request stored in `file_path`, then deletes it.

  The file is expected to contain a JSON object whose entries are passed as
  keyword arguments to `requests.request`. The file is removed whether or not
  the request succeeds, so failed uploads do not leave temp files behind.

  Args:
    file_path: path to the JSON file describing the request.

  Raises:
    Whatever reading the file, decoding its JSON or performing the request
    raises; the file is still removed in that case.
  """
  try:
    with open(file_path, mode="r", encoding="utf-8") as file:
      kwargs = json.load(file)
    # Without a timeout the request may block forever; telemetry must never
    # hang the command that triggered it. An explicit value in the file wins.
    kwargs.setdefault("timeout", 10)
    requests.request(**kwargs)
  finally:
    if os.path.exists(file_path):
      os.remove(file_path)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class MetricsEventMetadataKey(Enum):
  """Keys of the key/value metadata entries attached to metrics events.

  The member value is the string emitted as "key" in the serialized
  "event_metadata" list of a payload.
  """

  # Keys filled in for every event by the base event metadata.
  SESSION_ID = "XPK_SESSION_ID"
  DRY_RUN = "XPK_DRY_RUN"
  PYTHON_VERSION = "XPK_PYTHON_VERSION"
  ZONE = "XPK_ZONE"
  SYSTEM_CHARACTERISTICS = "XPK_SYSTEM_CHARACTERISTICS"
  PROVISIONING_MODE = "XPK_PROVISIONING_MODE"
  # Set on the "start" event.
  COMMAND = "XPK_COMMAND"
  # Set on the "complete" event.
  EXIT_CODE = "XPK_EXIT_CODE"
  # Keys filled in for every event by the base event metadata.
  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class _MetricsEvent:
  """A single recorded metrics event awaiting serialization."""

  # Timestamp in seconds as returned by time.time(); serialized as milliseconds.
  time: float
  # Event category; the collector uses "commands" and "custom".
  type: str
  # Event name, e.g. "start" or "complete" for command events.
  name: str
  # Event-specific metadata, merged over the base metadata when serialized.
  metadata: dict[MetricsEventMetadataKey, str]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class _MetricsCollector:
  """Metrics collector for collecting various metrics and events across application.

  Events are buffered in memory until `flush` turns them into a payload.
  """

  def __init__(self) -> None:
    # Keep the buffer per instance: a list defined on the class body would be
    # shared (and mutated) by every collector instance.
    self._events: list[_MetricsEvent] = []

  def _record(
      self,
      event_type: str,
      name: str,
      metadata: dict[MetricsEventMetadataKey, str],
  ) -> None:
    """Appends an event stamped with the current time to the buffer."""
    self._events.append(
        _MetricsEvent(
            time=time.time(),
            type=event_type,
            name=name,
            metadata=metadata,
        )
    )

  def log_start(self, command: str) -> None:
    """Logs start event.

    Args:
      command: the command being started; stored under the COMMAND key.
    """
    self._record(
        "commands", "start", {MetricsEventMetadataKey.COMMAND: command}
    )

  def log_complete(self, exit_code: int) -> None:
    """Logs complete event.

    Args:
      exit_code: exit code of the command; stored as a string under EXIT_CODE.
    """
    self._record(
        "commands",
        "complete",
        {MetricsEventMetadataKey.EXIT_CODE: str(exit_code)},
    )

  def log_custom(
      self,
      name: str,
      metadata: dict[MetricsEventMetadataKey, str] | None = None,
  ) -> None:
    """Logs custom event.

    Args:
      name: name of the custom event.
      metadata: optional event metadata; an empty dict is used when omitted.
    """
    self._record("custom", name, metadata if metadata is not None else {})

  def flush(self) -> str:
    """Flushes collected events into concord payload.

    Returns:
      The serialized payload built from all buffered events. The buffer is
      emptied afterwards.
    """
    result = _generate_payload(self._events)
    self._events.clear()
    return result
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Module-level collector instance, created once when this module is imported.
MetricsCollector = _MetricsCollector()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _generate_payload(events: list[_MetricsEvent]) -> str:
  """Serializes events into a JSON payload string.

  Each event becomes one "log_event" entry with its time in milliseconds and a
  JSON-encoded "source_extension_json" object. That object combines the base
  concord fields, the event type and name, and the event metadata merged over
  the base metadata (event values win on key clashes).
  """
  concord_fields = _get_base_concord_event()
  shared_metadata = _get_base_event_metadata()

  def _serialize(event: _MetricsEvent) -> dict:
    merged_metadata = {**shared_metadata, **event.metadata}
    metadata_entries = [
        {"key": key.value, "value": value}
        for key, value in merged_metadata.items()
    ]
    source_extension = {
        **concord_fields,
        "event_type": event.type,
        "event_name": event.name,
        "event_metadata": metadata_entries,
    }
    return {
        "event_time_ms": int(event.time * 1000),
        "source_extension_json": json.dumps(source_extension),
    }

  log_events = [_serialize(event) for event in events]
  payload = {
      "client_info": {"client_type": "XPK"},
      "log_source_name": "CONCORD",
      "request_time_ms": int(time.time() * 1000),
      "log_event": log_events,
  }
  return json.dumps(payload)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
  """Builds the metadata shared by every event of a payload.

  Boolean values are rendered as lowercase strings ("true" / "false").
  """
  session_id = _get_session_id()
  dry_run = str(is_dry_run()).lower()
  python_version = platform.python_version()
  running_as_pip = str(_is_running_as_pip()).lower()
  running_from_source = str(_is_running_from_source()).lower()
  return {
      MetricsEventMetadataKey.SESSION_ID: session_id,
      MetricsEventMetadataKey.DRY_RUN: dry_run,
      MetricsEventMetadataKey.PYTHON_VERSION: python_version,
      MetricsEventMetadataKey.RUNNING_AS_PIP: running_as_pip,
      MetricsEventMetadataKey.RUNNING_FROM_SOURCE: running_from_source,
  }
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _get_base_concord_event() -> dict[str, str]:
  """Builds the fields shared by every concord event of a payload.

  Obtaining the client install id may store a newly generated id in the
  configuration (see `_ensure_client_id`).
  """
  concord_event = {"release_version": xpk_version, "console_type": "XPK"}
  concord_event["client_install_id"] = _ensure_client_id()
  return concord_event
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _is_running_as_pip() -> bool:
  """Tells whether the file name of the launching script is exactly "xpk"."""
  _, script_name = os.path.split(sys.argv[0])
  return script_name == "xpk"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _is_running_from_source() -> bool:
  """Tells whether this module lives outside site-packages / dist-packages.

  The check is done on the resolved absolute path of this file.
  """
  module_path = os.path.abspath(os.path.realpath(__file__))
  installed_markers = ("site-packages", "dist-packages")
  for marker in installed_markers:
    if marker in module_path:
      return False
  return True
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _get_session_id() -> str:
  """Returns a freshly generated random UUID (version 4) as a string."""
  session_uuid = uuid.uuid4()
  return str(session_uuid)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _ensure_client_id() -> str:
  """Returns the configured Client ID, generating and storing one if absent.

  A new random UUID is generated and written to the configuration only when
  no value is stored under CLIENT_ID_KEY yet.
  """
  client_id = xpk_config.get(CLIENT_ID_KEY)
  if client_id is None:
    client_id = str(uuid.uuid4())
    xpk_config.set(CLIENT_ID_KEY, client_id)
  return client_id
|