xpk 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +29 -30
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +22 -8
- xpk/commands/workload_test.py +70 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +10 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/docker_container.py +31 -24
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +4 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +46 -13
- xpk/core/nodepool_test.py +143 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +4 -1
- xpk/core/scheduling_test.py +1 -1
- xpk/core/system_characteristics.py +6 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -0
- xpk/parser/storage.py +3 -3
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/METADATA +37 -21
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/RECORD +47 -54
- xpk-1.1.0.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0.dist-info/top_level.txt +0 -2
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/core/scheduling.py
CHANGED
|
@@ -33,8 +33,11 @@ from .system_characteristics import (
|
|
|
33
33
|
from packaging.version import Version
|
|
34
34
|
|
|
35
35
|
_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
|
|
36
|
-
_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.
|
|
36
|
+
_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
|
|
37
37
|
_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
|
|
38
|
+
ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
|
|
39
|
+
'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
|
|
40
|
+
)
|
|
38
41
|
|
|
39
42
|
|
|
40
43
|
class WorkloadScheduling(Enum):
|
xpk/core/scheduling_test.py
CHANGED
|
@@ -208,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
|
|
|
208
208
|
cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
|
|
209
209
|
# 5 4x4x4 cubes:
|
|
210
210
|
resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
|
|
211
|
-
kueue_version='0.
|
|
211
|
+
kueue_version='0.15.2',
|
|
212
212
|
super_slicing_feature_enabled=True,
|
|
213
213
|
super_slicing_topology_set=True,
|
|
214
214
|
num_slices=1,
|
|
xpk/core/system_characteristics.py
CHANGED
|
@@ -131,6 +131,8 @@ class SystemCharacteristics:
|
|
|
131
131
|
supports_super_slicing: Whether the Super-slicing feature is supported.
|
|
132
132
|
requires_workload_policy: A boolean indicating if a GCE resource
|
|
133
133
|
workload policy is required. This is automatically set to True for GPUs.
|
|
134
|
+
parallel_containers: The number of containers running on a single VM.
|
|
135
|
+
|
|
134
136
|
"""
|
|
135
137
|
|
|
136
138
|
topology: str
|
|
@@ -146,6 +148,7 @@ class SystemCharacteristics:
|
|
|
146
148
|
docker_platform: DockerPlatform
|
|
147
149
|
requires_workload_policy: bool = False
|
|
148
150
|
gpu_config: Optional[GpuConfig] = None
|
|
151
|
+
parallel_containers: int = 1
|
|
149
152
|
|
|
150
153
|
def __post_init__(self):
|
|
151
154
|
if self.accelerator_type == AcceleratorType.GPU:
|
|
@@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map(
|
|
|
239
242
|
default_topologies: set[str] | None = None,
|
|
240
243
|
sub_slicing_topologies: set[str] | None = None,
|
|
241
244
|
super_slicing_topologies: set[str] | None = None,
|
|
245
|
+
parallel_containers: int = 1,
|
|
242
246
|
) -> dict[str, SystemCharacteristics]:
|
|
243
247
|
system_characteristics_map = {}
|
|
244
248
|
default_topologies = default_topologies or set()
|
|
@@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map(
|
|
|
263
267
|
supports_super_slicing=topology in super_slicing_topologies,
|
|
264
268
|
supports_accelerator_network_profile=supports_accelerator_network_profile,
|
|
265
269
|
docker_platform=docker_platform,
|
|
270
|
+
parallel_containers=parallel_containers,
|
|
266
271
|
)
|
|
267
272
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
268
273
|
if (
|
|
@@ -544,6 +549,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
544
549
|
tpu_type_requires_workload_policy=True,
|
|
545
550
|
supports_accelerator_network_profile=False,
|
|
546
551
|
docker_platform=AMD_PLATFORM,
|
|
552
|
+
parallel_containers=2,
|
|
547
553
|
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
548
554
|
super_slicing_topologies=set(['4x4x4']),
|
|
549
555
|
default_topologies=set([
|
xpk/core/telemetry.py
CHANGED
|
@@ -30,7 +30,7 @@ from dataclasses import dataclass
|
|
|
30
30
|
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
|
|
31
31
|
from ..utils.execution_context import is_dry_run
|
|
32
32
|
from ..utils.user_agent import get_user_agent
|
|
33
|
-
from ..utils.feature_flags import FeatureFlags
|
|
33
|
+
from ..utils.feature_flags import FeatureFlags, is_tester
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def should_send_telemetry():
|
|
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:
|
|
|
114
114
|
|
|
115
115
|
|
|
116
116
|
class MetricsEventMetadataKey(Enum):
|
|
117
|
+
"""Represents available metadata keys."""
|
|
118
|
+
|
|
117
119
|
SESSION_ID = "XPK_SESSION_ID"
|
|
118
120
|
DRY_RUN = "XPK_DRY_RUN"
|
|
119
121
|
PYTHON_VERSION = "XPK_PYTHON_VERSION"
|
|
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
|
|
|
125
127
|
RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
|
|
126
128
|
RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
|
|
127
129
|
LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
|
|
130
|
+
TESTER = "XPK_TESTER"
|
|
128
131
|
|
|
129
132
|
|
|
130
133
|
@dataclass
|
|
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
|
|
|
230
233
|
MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
|
|
231
234
|
_is_running_from_source()
|
|
232
235
|
).lower(),
|
|
236
|
+
MetricsEventMetadataKey.TESTER: str(
|
|
237
|
+
is_tester() or _is_trash_execution()
|
|
238
|
+
).lower(),
|
|
233
239
|
}
|
|
234
240
|
|
|
235
241
|
|
|
@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
|
|
|
241
247
|
}
|
|
242
248
|
|
|
243
249
|
|
|
250
|
+
def _is_trash_execution() -> bool:
|
|
251
|
+
return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
|
|
252
|
+
|
|
253
|
+
|
|
244
254
|
def _is_running_as_pip() -> bool:
|
|
245
255
|
return os.path.basename(sys.argv[0]) == "xpk"
|
|
246
256
|
|
xpk/core/telemetry_test.py
CHANGED
|
@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
|
|
|
30
30
|
mocker.patch('time.time', side_effect=itertools.count())
|
|
31
31
|
mocker.patch('platform.python_version', return_value='99.99.99')
|
|
32
32
|
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
33
|
+
mocker.patch('os.getenv', return_value='false')
|
|
33
34
|
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
35
|
+
mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
|
|
34
36
|
set_dry_run(False)
|
|
35
37
|
get_config().set(CLIENT_ID_KEY, 'client_id')
|
|
36
38
|
yield
|
|
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
|
|
|
76
78
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
77
79
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
78
80
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
81
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
79
82
|
{'key': 'XPK_COMMAND', 'value': 'test'},
|
|
80
83
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
81
84
|
],
|
|
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
|
|
|
107
110
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
108
111
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
109
112
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
113
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
110
114
|
{'key': 'XPK_EXIT_CODE', 'value': '2'},
|
|
111
115
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
112
116
|
],
|
|
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
|
|
|
131
135
|
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
132
136
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
133
137
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
138
|
+
{'key': 'XPK_TESTER', 'value': 'false'},
|
|
134
139
|
{'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
|
|
135
140
|
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
136
141
|
],
|
|
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
|
|
|
219
224
|
assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
|
|
220
225
|
|
|
221
226
|
|
|
227
|
+
@pytest.mark.parametrize(
|
|
228
|
+
argnames='tester,expected',
|
|
229
|
+
argvalues=[
|
|
230
|
+
(True, 'true'),
|
|
231
|
+
(False, 'false'),
|
|
232
|
+
],
|
|
233
|
+
)
|
|
234
|
+
def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
|
|
235
|
+
tester: bool, expected: str, mocker: MockerFixture
|
|
236
|
+
):
|
|
237
|
+
mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
|
|
238
|
+
MetricsCollector.log_start(command='test')
|
|
239
|
+
payload = MetricsCollector.flush()
|
|
240
|
+
assert _get_metadata_value(payload, 'XPK_TESTER') == expected
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@pytest.mark.parametrize(
|
|
244
|
+
argnames='trash_execution,expected',
|
|
245
|
+
argvalues=[
|
|
246
|
+
('true', 'true'),
|
|
247
|
+
('false', 'false'),
|
|
248
|
+
('', 'false'),
|
|
249
|
+
(None, 'false'),
|
|
250
|
+
],
|
|
251
|
+
)
|
|
252
|
+
def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
|
|
253
|
+
trash_execution: str, expected: str, mocker: MockerFixture
|
|
254
|
+
):
|
|
255
|
+
mocker.patch('os.getenv', return_value=trash_execution)
|
|
256
|
+
MetricsCollector.log_start(command='test')
|
|
257
|
+
payload = MetricsCollector.flush()
|
|
258
|
+
assert _get_metadata_value(payload, 'XPK_TESTER') == expected
|
|
259
|
+
|
|
260
|
+
|
|
222
261
|
def _get_metadata_value(payload_str: str, key: str) -> str | None:
|
|
223
262
|
payload = json.loads(payload_str)
|
|
224
263
|
metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
|
|
xpk/core/testing/commands_tester.py
CHANGED
|
@@ -17,6 +17,8 @@ limitations under the License.
|
|
|
17
17
|
import re
|
|
18
18
|
from pytest_mock import MockerFixture
|
|
19
19
|
|
|
20
|
+
from ..commands import FailedCommand
|
|
21
|
+
|
|
20
22
|
|
|
21
23
|
class CommandsTester:
|
|
22
24
|
"""Tester class useful for mocking and asserting command runs."""
|
|
@@ -27,6 +29,7 @@ class CommandsTester:
|
|
|
27
29
|
run_command_for_value_path: str | None = None,
|
|
28
30
|
run_command_with_updates_path: str | None = None,
|
|
29
31
|
run_command_with_updates_retry_path: str | None = None,
|
|
32
|
+
run_command_batch_path: str | None = None,
|
|
30
33
|
):
|
|
31
34
|
self.__results: dict[re.Pattern, tuple[int, str]] = {}
|
|
32
35
|
self.commands_history: list[str] = []
|
|
@@ -45,6 +48,11 @@ class CommandsTester:
|
|
|
45
48
|
run_command_with_updates_retry_path,
|
|
46
49
|
wraps=self.__fake_run_command_with_updates_retry,
|
|
47
50
|
)
|
|
51
|
+
if run_command_batch_path:
|
|
52
|
+
mocker.patch(
|
|
53
|
+
run_command_batch_path,
|
|
54
|
+
wraps=self.__fake_run_command_batch,
|
|
55
|
+
)
|
|
48
56
|
|
|
49
57
|
def set_result_for_command(
|
|
50
58
|
self, result: tuple[int, str], *command_parts: str
|
|
@@ -111,6 +119,24 @@ class CommandsTester:
|
|
|
111
119
|
) -> tuple[int, str]:
|
|
112
120
|
return self.__common_fake_run_command(command, (0, dry_run_return_val))
|
|
113
121
|
|
|
122
|
+
def __fake_run_command_batch(
|
|
123
|
+
self,
|
|
124
|
+
commands: list[str],
|
|
125
|
+
jobname: str,
|
|
126
|
+
per_command_name: list[str],
|
|
127
|
+
output_logs: list[str],
|
|
128
|
+
) -> FailedCommand | None:
|
|
129
|
+
for i, command in enumerate(commands):
|
|
130
|
+
result = self.__common_fake_run_command(command, (0, ""))[0]
|
|
131
|
+
if result != 0:
|
|
132
|
+
return FailedCommand(
|
|
133
|
+
return_code=result,
|
|
134
|
+
name=per_command_name[i],
|
|
135
|
+
command=command,
|
|
136
|
+
logfile=output_logs[i],
|
|
137
|
+
)
|
|
138
|
+
return None
|
|
139
|
+
|
|
114
140
|
# pylint: enable=unused-argument
|
|
115
141
|
|
|
116
142
|
def __common_fake_run_command(
|
|
xpk/core/testing/commands_tester_test.py
CHANGED
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import pytest
|
|
18
18
|
from pytest_mock import MockerFixture
|
|
19
19
|
|
|
20
|
-
from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
|
|
20
|
+
from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
|
|
21
21
|
from xpk.core.testing.commands_tester import CommandsTester
|
|
22
22
|
|
|
23
23
|
|
|
@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
|
|
|
31
31
|
run_command_with_updates_retry_path=(
|
|
32
32
|
"xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
|
|
33
33
|
),
|
|
34
|
+
run_command_batch_path=(
|
|
35
|
+
"xpk.core.testing.commands_tester_test.run_command_batch"
|
|
36
|
+
),
|
|
34
37
|
)
|
|
35
38
|
|
|
36
39
|
|
|
@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
|
|
|
54
57
|
mock_commands.assert_command_run("cmd", "bar")
|
|
55
58
|
|
|
56
59
|
|
|
60
|
+
def test_run_command_batch_default_result(
|
|
61
|
+
mock_commands: CommandsTester,
|
|
62
|
+
):
|
|
63
|
+
result = run_command_batch(
|
|
64
|
+
commands=["cmd1 foo bar", "cmd2 foo bar"],
|
|
65
|
+
jobname="Test command",
|
|
66
|
+
per_command_name=["cmd1", "cmd2"],
|
|
67
|
+
output_logs=["log1", "log2"],
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
assert result is None
|
|
71
|
+
mock_commands.assert_command_run("foo bar", times=2)
|
|
72
|
+
mock_commands.assert_command_run("cmd1")
|
|
73
|
+
mock_commands.assert_command_run("cmd2")
|
|
74
|
+
|
|
75
|
+
|
|
57
76
|
def test_set_result_for_command(mock_commands: CommandsTester):
|
|
58
77
|
mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")
|
|
59
78
|
|
|
xpk/core/workload_decorators/rdma_decorator.py
CHANGED
|
@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
|
|
|
84
84
|
volumes.append(
|
|
85
85
|
{'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
|
|
86
86
|
)
|
|
87
|
+
volumes.append({
|
|
88
|
+
'name': 'dshm',
|
|
89
|
+
'emptyDir': {
|
|
90
|
+
'medium': 'Memory',
|
|
91
|
+
},
|
|
92
|
+
})
|
|
87
93
|
|
|
88
94
|
|
|
89
95
|
def add_tolerations(job_manifest):
|
|
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
|
|
|
111
117
|
container['volumeMounts'].append(
|
|
112
118
|
{'name': 'gib', 'mountPath': '/usr/local/gib'}
|
|
113
119
|
)
|
|
120
|
+
container['volumeMounts'].append(
|
|
121
|
+
{'name': 'dshm', 'mountPath': '/dev/shm'}
|
|
122
|
+
)
|
xpk/parser/cluster.py
CHANGED
|
@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
338
338
|
add_resource_limits(cluster_create_resource_limits)
|
|
339
339
|
|
|
340
340
|
cluster_create_ray_parser.set_defaults(
|
|
341
|
-
func=cluster_create_ray_cluster,
|
|
341
|
+
func=cluster_create_ray_cluster,
|
|
342
|
+
sub_slicing=False,
|
|
343
|
+
super_slicing=False,
|
|
344
|
+
num_cubes=None,
|
|
342
345
|
)
|
|
343
346
|
|
|
344
347
|
|
|
@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
503
506
|
)
|
|
504
507
|
add_driver_arguments(cluster_adapt_optional_arguments)
|
|
505
508
|
add_shared_arguments(cluster_adapt_optional_arguments)
|
|
509
|
+
add_resource_limits(cluster_adapt_optional_arguments)
|
|
510
|
+
|
|
511
|
+
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
512
|
+
add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
|
|
513
|
+
|
|
514
|
+
if FeatureFlags.SUPER_SLICING_ENABLED:
|
|
515
|
+
add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)
|
|
506
516
|
|
|
507
517
|
cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
|
|
508
518
|
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|
xpk/parser/cluster_test.py
CHANGED
|
@@ -15,8 +15,8 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
|
-
from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser
|
|
19
18
|
import pytest
|
|
19
|
+
from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser, set_cluster_adapt_parser
|
|
20
20
|
from ..utils.feature_flags import FeatureFlags
|
|
21
21
|
|
|
22
22
|
|
|
@@ -261,3 +261,61 @@ def test_cluster_create_num_slices_has_no_default_if_superslicing_feature():
|
|
|
261
261
|
)
|
|
262
262
|
|
|
263
263
|
assert args.num_slices is None
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def test_cluster_adapt_sub_slicing_is_hidden_with_flag_off():
|
|
267
|
+
FeatureFlags.SUB_SLICING_ENABLED = False
|
|
268
|
+
parser = argparse.ArgumentParser()
|
|
269
|
+
|
|
270
|
+
set_cluster_adapt_parser(parser)
|
|
271
|
+
help_str = parser.format_help()
|
|
272
|
+
|
|
273
|
+
assert "--sub-slicing" not in help_str
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def test_cluster_adapt_sub_slicing_is_shown_with_flag_on():
|
|
277
|
+
FeatureFlags.SUB_SLICING_ENABLED = True
|
|
278
|
+
parser = argparse.ArgumentParser()
|
|
279
|
+
|
|
280
|
+
set_cluster_adapt_parser(parser)
|
|
281
|
+
help_str = parser.format_help()
|
|
282
|
+
|
|
283
|
+
assert "--sub-slicing" in help_str
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def test_cluster_adapt_super_slicing_is_hidden_with_flag_off():
|
|
287
|
+
FeatureFlags.SUPER_SLICING_ENABLED = False
|
|
288
|
+
parser = argparse.ArgumentParser()
|
|
289
|
+
|
|
290
|
+
set_cluster_adapt_parser(parser)
|
|
291
|
+
help_str = parser.format_help()
|
|
292
|
+
|
|
293
|
+
assert "--super-slicing" not in help_str
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def test_cluster_adapt_super_slicing_is_shown_with_flag_on():
|
|
297
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
298
|
+
parser = argparse.ArgumentParser()
|
|
299
|
+
|
|
300
|
+
set_cluster_adapt_parser(parser)
|
|
301
|
+
help_str = parser.format_help()
|
|
302
|
+
|
|
303
|
+
assert "--super-slicing" in help_str
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def test_cluster_adapt_memory_limit_is_shown():
|
|
307
|
+
parser = argparse.ArgumentParser()
|
|
308
|
+
|
|
309
|
+
set_cluster_adapt_parser(parser)
|
|
310
|
+
help_str = parser.format_help()
|
|
311
|
+
|
|
312
|
+
assert "--memory-limit" in help_str
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def test_cluster_adapt_cpu_limit_is_shown():
|
|
316
|
+
parser = argparse.ArgumentParser()
|
|
317
|
+
|
|
318
|
+
set_cluster_adapt_parser(parser)
|
|
319
|
+
help_str = parser.format_help()
|
|
320
|
+
|
|
321
|
+
assert "--cpu-limit" in help_str
|
xpk/parser/common.py
CHANGED
|
@@ -83,6 +83,17 @@ def add_shared_arguments(
|
|
|
83
83
|
help='GCE project name, defaults to "gcloud config project."',
|
|
84
84
|
required=required,
|
|
85
85
|
)
|
|
86
|
+
custom_parser_or_group.add_argument(
|
|
87
|
+
'--project-number',
|
|
88
|
+
type=str,
|
|
89
|
+
default=None,
|
|
90
|
+
help=(
|
|
91
|
+
'GCE project number. If provided, skips the Cloud Resource Manager'
|
|
92
|
+
' API call to translate project ID to project number. Useful when'
|
|
93
|
+
' the API is not enabled or you lack permissions.'
|
|
94
|
+
),
|
|
95
|
+
required=False,
|
|
96
|
+
)
|
|
86
97
|
custom_parser_or_group.add_argument(
|
|
87
98
|
'--zone',
|
|
88
99
|
type=str,
|
xpk/parser/storage.py
CHANGED
|
@@ -127,7 +127,7 @@ def add_storage_attach_parser(
|
|
|
127
127
|
type=str,
|
|
128
128
|
help=(
|
|
129
129
|
'(optional) Name of the bucket. If not set, then the "name" parameter'
|
|
130
|
-
' is
|
|
130
|
+
' is inferred as a bucket name.'
|
|
131
131
|
),
|
|
132
132
|
)
|
|
133
133
|
gcsfuse_args.add_argument(
|
|
@@ -165,7 +165,7 @@ def add_storage_attach_parser(
|
|
|
165
165
|
type=str,
|
|
166
166
|
help=(
|
|
167
167
|
'(optional) Name of the filestore instance. If not set, then the'
|
|
168
|
-
' "name" parameter is
|
|
168
|
+
' "name" parameter is inferred as an instance name.'
|
|
169
169
|
),
|
|
170
170
|
)
|
|
171
171
|
|
|
@@ -238,7 +238,7 @@ def add_storage_create_parser(storage_subcommands_parser: Subcommands) -> None:
|
|
|
238
238
|
type=str,
|
|
239
239
|
help=(
|
|
240
240
|
'(optional) Name of the filestore instance. If not set, then the'
|
|
241
|
-
' "name" parameter is
|
|
241
|
+
' "name" parameter is inferred as an instance name.'
|
|
242
242
|
),
|
|
243
243
|
)
|
|
244
244
|
opt_args.add_argument(
|
xpk/utils/console.py
CHANGED
|
@@ -51,7 +51,7 @@ def ask_for_user_consent(
|
|
|
51
51
|
question: str, default_option: Literal["Y", "N"] = "N"
|
|
52
52
|
) -> bool:
|
|
53
53
|
"""Prompts user with the given question, asking for a yes/no answer and returns a relevant boolean.
|
|
54
|
-
Important:
|
|
54
|
+
Important: immediately returns `True` in quiet mode!
|
|
55
55
|
|
|
56
56
|
Example prompt for `question='Continue?'`: `[XPK] Continue? (y/N): `.
|
|
57
57
|
|
xpk/utils/feature_flags.py
CHANGED
|
@@ -17,20 +17,24 @@ limitations under the License.
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
def is_tester() -> bool:
|
|
21
|
+
"""Returns true if user is a tester."""
|
|
22
|
+
return os.getenv("XPK_TESTER", "").lower() == "true"
|
|
23
|
+
|
|
24
|
+
|
|
20
25
|
def _get_boolean_flag(flag: str, default: bool) -> bool:
|
|
21
26
|
experiment_value = os.getenv(flag, "").lower()
|
|
22
27
|
if experiment_value in ["true", "false"]:
|
|
23
28
|
return experiment_value == "true"
|
|
24
29
|
|
|
25
|
-
xpk_tester = os.getenv("XPK_TESTER", "").lower() == "true"
|
|
26
|
-
return xpk_tester or default
|
|
30
|
+
return is_tester() or default
|
|
27
31
|
|
|
28
32
|
|
|
29
33
|
class _FeatureFlags:
|
|
30
34
|
SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
|
|
31
35
|
TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=True)
|
|
32
36
|
SUPER_SLICING_ENABLED = _get_boolean_flag(
|
|
33
|
-
"SUPER_SLICING_ENABLED", default=False
|
|
37
|
+
"SUPER_SLICING_ENABLED", default=True
|
|
34
38
|
)
|
|
35
39
|
|
|
36
40
|
|
|
{xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -93,28 +93,41 @@ XPK supports a variety of hardware accelerators.
|
|
|
93
93
|
|
|
94
94
|
XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
|
|
95
95
|
|
|
96
|
-
| Storage Type | Documentation
|
|
97
|
-
|
|
98
|
-
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse)
|
|
99
|
-
| Filestore | [docs](./docs/usage/storage.md#filestore)
|
|
100
|
-
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore)
|
|
101
|
-
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk)
|
|
96
|
+
| Storage Type | Documentation |
|
|
97
|
+
| ------------------------------------------ | ----------------------------------------------------------------------- |
|
|
98
|
+
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
|
|
99
|
+
| Filestore | [docs](./docs/usage/storage.md#filestore) |
|
|
100
|
+
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
|
|
101
|
+
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
|
|
102
102
|
|
|
103
103
|
# Documentation
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
105
|
+
- [Permissions](./docs/permissions.md)
|
|
106
|
+
- [Installation](./docs/installation.md)
|
|
107
|
+
- Usage:
|
|
108
|
+
- [Clusters](./docs/usage/clusters.md)
|
|
109
|
+
- [GPU](./docs/usage/gpu.md)
|
|
110
|
+
- [CPU](./docs/usage/cpu.md)
|
|
111
|
+
- [Autoprovisioning](./docs/usage/autoprovisioning.md)
|
|
112
|
+
- [Workloads](./docs/usage/workloads.md)
|
|
113
|
+
- [Docker](./docs/usage/docker.md)
|
|
114
|
+
- [Storage](./docs/usage/storage.md)
|
|
115
|
+
- [Advanced](./docs/usage/advanced.md)
|
|
116
|
+
- [Inspector](./docs/usage/inspector.md)
|
|
117
|
+
- [Troubleshooting](./docs/troubleshooting.md)
|
|
118
|
+
|
|
119
|
+
# Dependencies
|
|
120
|
+
|
|
121
|
+
| Dependency | When used |
|
|
122
|
+
| ------------------------------------------------------------------------------------------------------------ | --------------------------- |
|
|
123
|
+
| [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
|
|
124
|
+
| [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
|
|
125
|
+
| [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
|
|
126
|
+
| [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
|
|
127
|
+
| [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
|
|
128
|
+
| [Docker](https://docs.docker.com/engine/install/) | Building workload container |
|
|
129
|
+
| [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster set up |
|
|
130
|
+
| [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
|
|
118
131
|
|
|
119
132
|
# Privacy notice
|
|
120
133
|
|
|
@@ -129,11 +142,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
|
|
|
129
142
|
you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
|
|
130
143
|
[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
|
|
131
144
|
|
|
132
|
-
|
|
133
145
|
# Contributing
|
|
134
146
|
|
|
135
147
|
Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
|
|
136
148
|
|
|
149
|
+
# Get involved
|
|
150
|
+
|
|
151
|
+
We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
|
|
152
|
+
|
|
137
153
|
# License
|
|
138
154
|
|
|
139
155
|
This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details
|