xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/config.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import re

import ruamel.yaml

from ..utils import file
from ..utils.console import xpk_print
from .system_characteristics import AcceleratorType, SystemCharacteristics

# This is the version for XPK PyPI package
__version__ = 'v0.7.1'
XPK_CURRENT_VERSION = __version__

# On-disk location of the xpk configuration file.
# Fix: this constant (and CONFIGS_KEY below) were each defined twice with
# identical values; the duplicates are removed.
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

# Top-level key in the config file under which all settings live.
CONFIGS_KEY = 'configs'

# Allowed configuration keys (see DEFAULT_KEYS below).
CFG_BUCKET_KEY = 'cluster-state-gcs-bucket'
CLUSTER_NAME_KEY = 'cluster-name'
PROJECT_KEY = 'project-id'
ZONE_KEY = 'zone'
KJOB_BATCH_IMAGE = 'batch-image'
KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
KJOB_SHELL_IMAGE = 'shell-image'
KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
DEPENDENCIES_KEY = 'deps-verified-version'

# The complete set of keys XpkConfig will accept for get/set.
DEFAULT_KEYS = [
    CFG_BUCKET_KEY,
    CLUSTER_NAME_KEY,
    PROJECT_KEY,
    ZONE_KEY,
    DEPENDENCIES_KEY,
    KJOB_BATCH_IMAGE,
    KJOB_BATCH_WORKING_DIRECTORY,
    KJOB_SHELL_IMAGE,
    KJOB_SHELL_INTERACTIVE_COMMAND,
    KJOB_SHELL_WORKING_DIRECTORY,
]

# NOTE(review): this is a lexicographic *string* comparison, not a semantic
# version comparison — it evaluates True only because 'v' sorts after '0'.
# If the flag is meant to stay permanently on, a literal `True` would be
# clearer; otherwise use a real version parse. Kept as-is for compatibility.
VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'


# Shared round-trip YAML handler for the config file.
yaml = ruamel.yaml.YAML()
class XpkConfig:
  """Reads and writes xpk settings stored in a YAML config file.

  The file layout is ``{'version': 'v1', 'configs': {key: value, ...}}``.
  Only keys listed in DEFAULT_KEYS are accepted by get/set.
  """

  def __init__(self, custom_config_file: str = XPK_CONFIG_FILE) -> None:
    # Path of the YAML config file this instance operates on.
    self._config = custom_config_file
    # Whitelist of keys accepted by get()/set().
    self._allowed_keys = DEFAULT_KEYS

  def _open_configs(self) -> dict | None:
    """Load the config file; returns None when the file does not exist."""
    # Fix: use os.path.dirname instead of manual '/'-splitting, which broke
    # on Windows paths and on bare filenames with no directory component.
    file.ensure_directory_exists(os.path.dirname(self._config))

    if not os.path.exists(self._config):
      return None

    with open(self._config, encoding='utf-8', mode='r') as stream:
      config_yaml: dict = yaml.load(stream)
    # Fix: an empty (zero-byte) file loads as None, which previously made
    # get()/get_all() crash indexing CONFIGS_KEY. Normalize to the empty
    # structure instead, so an empty file behaves like a fresh one.
    if not config_yaml:
      config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
    return config_yaml

  def _save_configs(self, config_yaml: dict) -> None:
    """Write the full config structure back to disk."""
    with open(self._config, encoding='utf-8', mode='w') as stream:
      yaml.dump(config_yaml, stream)

  def set(self, key: str, value: str) -> None:
    """Store `value` under `key`; rejects keys outside the whitelist."""
    if key not in self._allowed_keys:
      xpk_print(f'Key {key} is not an allowed xpk config key.')
      return

    config_yaml = self._open_configs()
    if config_yaml is None:
      config_yaml = {'version': 'v1', CONFIGS_KEY: {}}

    # setdefault guards against a hand-edited file missing the configs map.
    config_yaml.setdefault(CONFIGS_KEY, {})[key] = value
    self._save_configs(config_yaml)

  def get(self, key: str) -> str | None:
    """Return the stored value for `key`, or None when absent/not allowed."""
    if key not in self._allowed_keys:
      xpk_print(f'Key {key} is not an allowed xpk config key.')
      return None

    config_yaml = self._open_configs()
    if config_yaml is None:
      return None

    # Fix: .get with a default instead of direct indexing, so a file
    # missing the 'configs' mapping no longer raises KeyError.
    vals: dict[str, str] = config_yaml.get(CONFIGS_KEY, {})
    return vals.get(key)

  def get_all(
      self,
  ) -> dict[str, dict[str, str] | str] | None:
    """Return the whole configs mapping, or None when no file exists."""
    config_yaml = self._open_configs()
    if config_yaml is None:
      return None
    return config_yaml.get(CONFIGS_KEY, {})
def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
  """Parses the environment configurations to the jobset config.

  Merges, in increasing precedence: variables from --env-file, then --env
  flags, then XLA/tensorboard variables derived from other args. The result
  is rendered as YAML `name`/`value` entries and written back to `args.env`.

  Args:
    args: user provided arguments for running the command.
    tensorboard_config: configuration of Vertex Tensorboard.
    system: system characteristics.
  """
  env = {}

  env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
  if args.env_file:
    print('Setting container environment from', args.env_file)
    with open(file=args.env_file, mode='r', encoding='utf-8') as f:
      for match in env_pat.finditer(f.read()):
        variable = match.group(1)
        if match.group(2) is not None:
          env[variable] = match.group(2)
        else:
          # A bare `VAR` line (no '=') inherits the caller's environment.
          assert variable in os.environ, (
              f'Variable {variable} is not set in the current '
              'environment, a value must be specified.'
          )
          env[variable] = os.environ[variable]
  if args.env:
    for var in args.env:
      match = env_pat.match(var)
      assert match and match.group(2) is not None, (
          'Invalid environment variable, format must be '
          f'`--env VARIABLE=value`: {var}'
      )
      variable = match.group(1)
      env[variable] = match.group(2)

  if not args.use_pathways:
    if args.debug_dump_gcs:
      if 'XLA_FLAGS' in env:
        raise ValueError(
            'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
            'and environment file. Please choose one way to define '
            'XLA_FLAGS.'
        )
      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'

    if tensorboard_config:
      env['UPLOAD_DATA_TO_TENSORBOARD'] = True
      for key, value in tensorboard_config.items():
        env[key.upper()] = value

  # Fix: both branches below were textually identical even though the comment
  # (and the GPU jobset template's deeper nesting) require two extra spaces
  # for GPU entries; the GPU branch now actually carries the extra indent.
  # NOTE(review): the absolute indentation here must match the YAML template
  # this snippet is spliced into — confirm against the workload templates.
  if system.accelerator_type == AcceleratorType['GPU']:
    # For GPUs, it has two more spaces ahead of name and value respectively
    env_format = '''
                  - name: {key}
                    value: "{value}"'''
  else:
    env_format = '''
                - name: {key}
                  value: "{value}"'''

  args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
18
|
+
from .docker_image import setup_docker_image
|
|
19
|
+
from .docker_resources import (
|
|
20
|
+
add_container_ports,
|
|
21
|
+
add_image_pull_policy_for_pw_or_gpu,
|
|
22
|
+
add_jax_coordinator_port,
|
|
23
|
+
get_env_container,
|
|
24
|
+
get_main_container_resources,
|
|
25
|
+
get_volume_mounts,
|
|
26
|
+
)
|
|
27
|
+
from .monitoring import get_gke_debugging_dashboard
|
|
28
|
+
from .system_characteristics import (
|
|
29
|
+
AcceleratorType,
|
|
30
|
+
AcceleratorTypeToAcceleratorCharacteristics,
|
|
31
|
+
SystemCharacteristics,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_main_and_sidecar_container(args, system, docker_image) -> str:
  """Generate yaml for main and sidecar container.

  The sidecar (`stacktrace-explorer`) tails TPU stack-trace files from a
  shared volume and exits once the main container writes the
  `stacktrace_signal` file.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics
    docker_image: docker image

  Returns:
    str:
      yaml for main and sidecar container
  """
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ].resource_type
  main_container = get_main_container(args, system, docker_image, resource_type)
  # NOTE(review): the indentation inside this template is significant — the
  # string is spliced into a larger Kubernetes jobset YAML document; confirm
  # the indent levels against that enclosing template.
  yaml = """- name: stacktrace-explorer
                image: busybox:1.28
                args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"]
                volumeMounts:
                - name: tpu-stack-trace
                  readOnly: true
                  mountPath: /tmp/debugging
                - name: shared-data
                  mountPath: /shared-volume
              {main_container}
"""
  return yaml.format(main_container=main_container)
def get_main_container(args, system, docker_image, resource_type) -> str:
  """Generate yaml for main container including the xpk command.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics
    docker_image: docker image
    resource_type: The label to describe the resource type for TPUs/GPUs/CPUs.

  Returns:
    str:
      yaml for main container
  """

  xpk_internal_commands = ''
  gsutil_test_command = ''
  if not args.use_pathways and args.debug_dump_gcs:
    # Fail fast (exit code 24) when gsutil is unavailable, before running
    # the workload at all.
    gsutil_test_command = (
        'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil'
        ' is required but not installed. Aborting"; exit 24;};'
    )
    # After the workload finishes, upload the XLA dump to GCS under a
    # per-worker prefix ($HOSTNAME).
    xpk_internal_commands += (
        'WORKER_ID=$HOSTNAME;'
        f'gsutil -m cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID;'
    )

  command = args.command
  if args.enable_debug_logs:
    # Prefix the user command with verbose TPU/TF logging exports.
    command = (
        'export TPU_STDERR_LOG_LEVEL=0 &&'
        ' export TPU_MIN_LOG_LEVEL=0 &&'
        ' export TF_CPP_MIN_LOG_LEVEL=0 &&'
        ' export TPU_VMODULE=real_program_continuator=1 &&'
        f' {args.command}'
    )

  gpu_workload_terminate_command = ''
  if system.accelerator_type == AcceleratorType['GPU']:
    # Signals GPU sidecar containers (via the shared workload volume) that
    # the main application has exited.
    gpu_workload_terminate_command = (
        'echo Main app is done > /usr/share/workload/workload_terminated; '
    )

  tpu_stacktrace_terminate_command = ''
  if (
      not args.use_pathways
      and system.accelerator_type == AcceleratorType['TPU']
      and args.deploy_stacktrace_sidecar
  ):
    # Tells the stacktrace-explorer sidecar to stop tailing and exit.
    tpu_stacktrace_terminate_command = (
        'touch /shared-volume/stacktrace_signal; '
    )

  # The wrapper script traps SIGTERM, forwards it to the user command,
  # polls until the command exits, then runs the cleanup/terminate hooks
  # and propagates the command's exit code.
  # NOTE(review): template indentation is significant — this string is
  # spliced into a larger jobset YAML document; confirm the indent levels
  # against the enclosing template.
  yaml = """- name: {docker_name}
                image: {docker_image}
                {image_pull_policy}
                env: {env}
                ports:
                {container_ports}
                {jax_coordinator_port}
                securityContext:
                  privileged: true
                command:
                - bash
                - -c
                - |
                  echo XPK Start: $(date);
                  _sigterm() (kill -SIGTERM $! 2>/dev/null;);
                  trap _sigterm SIGTERM;
                  {gsutil_test_command}
                  ({command}) & PID=$!;
                  while kill -0 $PID 2>/dev/null;
                      do sleep 5;
                  done;
                  wait $PID;
                  EXIT_CODE=$?;
                  {xpk_internal_commands}
                  echo XPK End: $(date);
                  echo EXIT_CODE=$EXIT_CODE;
                  {tpu_stacktrace_terminate_command}
                  {gpu_workload_terminate_command}
                  exit $EXIT_CODE
                resources:
                  limits:
                    {resources}
"""
  volume_mounts = get_volume_mounts(args, system)
  if volume_mounts != '':
    yaml += """
                volumeMounts:
                {volume_mounts}
"""
  return yaml.format(
      args=args,
      system=system,
      image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system),
      env=get_env_container(args, system),
      container_ports=add_container_ports(args, system),
      jax_coordinator_port=add_jax_coordinator_port(system),
      docker_name=get_main_container_docker_image(args, system),
      docker_image=docker_image,
      gsutil_test_command=gsutil_test_command,
      command=command,
      tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
      gpu_workload_terminate_command=gpu_workload_terminate_command,
      xpk_internal_commands=xpk_internal_commands,
      resources=get_main_container_resources(args, system, resource_type),
      volume_mounts=volume_mounts,
  )
def get_user_workload_container(args, system: SystemCharacteristics):
  """Deploy user workload container

  Args:
    args: user provided args.
    system: system characteristics.

  Returns:
    container: main container
    debugging_dashboard_id: id of the GKE dashboard
  """
  image_code, docker_image = setup_docker_image(args)
  if image_code != 0:
    xpk_exit(image_code)

  debugging_dashboard_id = None
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ].resource_type

  # A stack-trace sidecar is attached only for non-Pathways TPU workloads
  # that explicitly request it.
  wants_sidecar = (
      not args.use_pathways
      and system.accelerator_type == AcceleratorType['TPU']
      and args.deploy_stacktrace_sidecar
  )
  if wants_sidecar:
    xpk_print(
        'Sidecar container to display stack traces for TPU workloads will also'
        ' be deployed.'
    )
    container = get_main_and_sidecar_container(args, system, docker_image)
    # The GKE debugging dashboard only applies when the sidecar is deployed.
    debugging_dashboard_id = get_gke_debugging_dashboard(args)
  else:
    container = get_main_container(args, system, docker_image, resource_type)

  return container, debugging_dashboard_id
def get_main_container_docker_image(args, system: SystemCharacteristics) -> str:
  """Docker name for the main container.

  Args:
    args: user provided args.
    system: system characteristics.

  Returns:
    str:
      Workload docker image as a YAML string
  """
  # GPU workloads always use the fixed container name expected by the GPU
  # jobset template; every other accelerator uses the user-supplied name.
  is_gpu = system.accelerator_type == AcceleratorType['GPU']
  return 'gpu-image' if is_gpu else f'{args.docker_name}'
|
xpk/core/docker_image.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import datetime
|
|
18
|
+
import os
|
|
19
|
+
import random
|
|
20
|
+
import string
|
|
21
|
+
|
|
22
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
23
|
+
from ..utils.file import write_tmp_file
|
|
24
|
+
from .commands import run_command_with_updates
|
|
25
|
+
|
|
26
|
+
DEFAULT_DOCKER_IMAGE = 'python:3.10'
|
|
27
|
+
DEFAULT_SCRIPT_DIR = os.getcwd()
|
|
28
|
+
PLATFORM = 'linux/amd64'
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_docker_image(docker_image, args) -> int:
  """Validates that the user provided docker image exists in your project.

  Only images hosted on Google registries (gcr.io / docker.pkg.dev) are
  checked via `gcloud container images describe`; any other image
  reference is accepted without validation.

  Args:
    docker_image: The docker image to verify.
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """

  project = args.project

  # Images outside Google registries cannot be described with gcloud; skip.
  if not any(repo in docker_image for repo in ['gcr.io', 'docker.pkg.dev']):
    return 0

  command = (
      f'gcloud container images describe {docker_image} --project {project}'
  )
  return_code = run_command_with_updates(
      command, 'Validate Docker Image', args, verbose=False
  )
  if return_code != 0:
    # Fix: error message previously read "maybe be missing".
    xpk_print(
        'Failed to validate your docker image, check that the docker image'
        f' exists. You may be able to find the {docker_image} in {project}.'
        ' If the docker image exists, the service account of this'
        ' project may be missing the permissions to access the docker image.'
    )
  # return_code is 0 on success, so the redundant else-branch was dropped.
  return return_code
+
def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
  """Adds script dir to the base docker image and uploads the image.

  Args:
    args: user provided arguments for running the command.
    verbose: whether subcommand output should be streamed.

  Returns:
    Tuple of:
      0 if successful and 1 otherwise.
      Name of the Docker image created.
  """
  # Derive the local image name from the current user so concurrent users
  # in the same project do not clobber each other's images.
  docker_name = f"{os.getenv('USER', 'unknown')}-runner"

  dockerfile_template = """FROM {base_docker_image}

# Set the working directory in the container
WORKDIR /app

# Copy all files from local workspace into docker container
COPY . .

WORKDIR /app
"""
  dockerfile = write_tmp_file(
      dockerfile_template.format(base_docker_image=args.base_docker_image)
  )

  build_command = (
      f'docker buildx build --platform={PLATFORM} -f'
      f' {str(dockerfile.file.name)} -t {docker_name} {args.script_dir}'
  )
  xpk_print(f'Building {args.script_dir} into docker image.')
  build_code = run_command_with_updates(
      build_command,
      'Building script_dir into docker image',
      args,
      verbose=verbose,
  )
  if build_code != 0:
    xpk_print(
        'Failed to add script_dir to docker image, check the base docker image.'
        f' You should be able to navigate to the URL {args.base_docker_image}'
        f' in {args.project}.'
    )
    xpk_exit(1)

  # Tag = 4 random lowercase letters + timestamp: unique yet sortable.
  random_part = ''.join(random.choices(string.ascii_lowercase, k=4))
  time_part = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
  tag_name = f'{random_part}-{time_part}'
  cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
  xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')

  tag_code = run_command_with_updates(
      f'docker tag {docker_name} {cloud_docker_image}',
      'Tag Docker Image',
      args,
      verbose=verbose,
  )
  if tag_code != 0:
    xpk_print(
        f'Failed to tag docker image with tag: {tag_name}.'
        f' You should be able to navigate to the URL {cloud_docker_image} in'
        f' {args.project}.'
    )
    xpk_exit(1)

  push_code = run_command_with_updates(
      f'docker push {cloud_docker_image}',
      'Upload Docker Image',
      args,
      verbose=verbose,
  )
  if push_code != 0:
    xpk_print(
        'Failed to upload docker image.'
        f' You should be able to navigate to the URL {cloud_docker_image} in'
        f' {args.project}.'
    )
    xpk_exit(1)

  return push_code, cloud_docker_image
def setup_docker_image(args) -> tuple[int, str]:
  """Does steps to verify docker args, check image, and build image (if asked).

  Args:
    args: user provided arguments for running the command.

  Returns:
    tuple:
      0 if successful and 1 otherwise.
      Name of the docker image to use.
  """
  if use_base_docker_image_or_docker_image(args):
    # Build a fresh image on top of the base image with script_dir baked in.
    docker_image = args.base_docker_image
    code = validate_docker_image(docker_image, args)
    if code != 0:
      xpk_exit(code)
    code, docker_image = build_docker_image_from_base_image(args)
    if code != 0:
      xpk_exit(code)
  else:
    # Use the prebuilt image the user supplied directly.
    docker_image = args.docker_image
    code = validate_docker_image(docker_image, args)
    if code != 0:
      xpk_exit(code)

  return 0, docker_image
def use_base_docker_image_or_docker_image(args) -> bool:
  """Checks for correct docker image arguments.

  `--docker-image` is mutually exclusive with `--script-dir` and
  `--base-docker-image`; supplying either alongside it exits with an error.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if intended to use base docker image, False to use docker image.
  """
  # No explicit --docker-image: build from the base image.
  if args.docker_image is None:
    return True

  # Fix: the original used `is not` on strings, which compares object
  # identity and only detected "left at default" when the parser happened to
  # reuse the exact default constant objects. Compare by value instead.
  if args.script_dir != DEFAULT_SCRIPT_DIR:
    xpk_print(
        '`--script-dir` and --docker-image can not be used together. Please'
        ' see `--help` command for more details.'
    )
    xpk_exit(1)
  if args.base_docker_image != DEFAULT_DOCKER_IMAGE:
    xpk_print(
        '`--base-docker-image` and --docker-image can not be used together.'
        ' Please see `--help` command for more details.'
    )
    xpk_exit(1)
  return False