konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/cli.py
ADDED
|
@@ -0,0 +1,1945 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""The 'konduktor' command line tool.
|
|
14
|
+
|
|
15
|
+
Example usage:
|
|
16
|
+
|
|
17
|
+
# See available commands.
|
|
18
|
+
>> konduktor
|
|
19
|
+
|
|
20
|
+
# Run a task, described in a yaml file.
|
|
21
|
+
>> konduktor launch task.yaml
|
|
22
|
+
|
|
23
|
+
# Show the list of scheduled jobs
|
|
24
|
+
>> konduktor status
|
|
25
|
+
|
|
26
|
+
# Tear down a specific job.
|
|
27
|
+
>> konduktor down cluster_name
|
|
28
|
+
|
|
29
|
+
# Tear down all scheduled jobs
|
|
30
|
+
>> konduktor down -a
|
|
31
|
+
|
|
32
|
+
NOTE: the order of command definitions in this file corresponds to how they are
|
|
33
|
+
listed in "konduktor --help". Take care to put logically connected commands close to
|
|
34
|
+
each other.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import difflib
|
|
38
|
+
import fnmatch
|
|
39
|
+
import os
|
|
40
|
+
import pathlib
|
|
41
|
+
import shlex
|
|
42
|
+
from base64 import b64encode
|
|
43
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
44
|
+
|
|
45
|
+
import click
|
|
46
|
+
import colorama
|
|
47
|
+
import dotenv
|
|
48
|
+
import prettytable
|
|
49
|
+
import yaml # type: ignore
|
|
50
|
+
from rich.progress import track
|
|
51
|
+
|
|
52
|
+
import konduktor
|
|
53
|
+
from konduktor import check as konduktor_check
|
|
54
|
+
from konduktor import logging
|
|
55
|
+
from konduktor.backends import constants as backend_constants
|
|
56
|
+
from konduktor.backends import deployment_utils, jobset_utils
|
|
57
|
+
from konduktor.utils import (
|
|
58
|
+
base64_utils,
|
|
59
|
+
common_utils,
|
|
60
|
+
kubernetes_utils,
|
|
61
|
+
log_utils,
|
|
62
|
+
ux_utils,
|
|
63
|
+
validator,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
|
|
67
|
+
|
|
68
|
+
logger = logging.get_logger(__name__)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _parse_env_var(env_var: str) -> Tuple[str, str]:
|
|
72
|
+
"""Parse env vars into a (KEY, VAL) pair."""
|
|
73
|
+
if '=' not in env_var:
|
|
74
|
+
value = os.environ.get(env_var)
|
|
75
|
+
if value is None:
|
|
76
|
+
raise click.UsageError(f'{env_var} is not set in local environment.')
|
|
77
|
+
return (env_var, value)
|
|
78
|
+
ret = tuple(env_var.split('=', 1))
|
|
79
|
+
if len(ret) != 2:
|
|
80
|
+
raise click.UsageError(
|
|
81
|
+
f'Invalid env var: {env_var}. Must be in the form of KEY=VALUE'
|
|
82
|
+
)
|
|
83
|
+
return ret[0], ret[1]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _merge_env_vars(
|
|
87
|
+
env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]]
|
|
88
|
+
) -> List[Tuple[str, str]]:
|
|
89
|
+
"""Merges all values from env_list into env_dict."""
|
|
90
|
+
if not env_dict:
|
|
91
|
+
return env_list
|
|
92
|
+
for key, value in env_list:
|
|
93
|
+
env_dict[key] = value
|
|
94
|
+
return list(env_dict.items())
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _make_task_with_overrides(
    entrypoint: Tuple[str, ...],
    *,
    entrypoint_name: str = 'konduktor.Task',
    name: Optional[str] = None,
    workdir: Optional[str] = None,
    cloud: Optional[str] = None,
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    instance_type: Optional[str] = None,
    num_nodes: Optional[int] = None,
    max_restarts: Optional[int] = None,
    completions: Optional[int] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
    env: Optional[List[Tuple[str, str]]] = None,
    field_to_ignore: Optional[List[str]] = None,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    ports: Optional[int] = None,
    probe: Optional[str] = None,
) -> konduktor.Task:
    """Creates a task from an entrypoint with overrides.

    The entrypoint argv tuple is joined and must resolve to a YAML task
    spec; a non-YAML entrypoint raises ValueError. CLI flags are then
    layered on top of the YAML config.

    Args:
        entrypoint: CLI argv tail; joined with spaces and treated as a
            YAML path.
        field_to_ignore: Resource-override keys to drop (with a warning)
            before applying.
        cloud, instance_type: Accepted but not applied in this function
            body (no reference below the signature).

    Returns:
        konduktor.Task

    Raises:
        ValueError: If the entrypoint is empty or not a valid YAML spec.
    """
    entrypoint = ' '.join(entrypoint)
    is_yaml, _ = _check_yaml(entrypoint)
    entrypoint: Optional[str]
    if is_yaml:
        # Treat entrypoint as a yaml.
        click.secho(f'{entrypoint_name} from YAML spec: ', fg='yellow', nl=False)
        click.secho(entrypoint, bold=True)
    else:
        # Non-YAML entrypoints are unsupported: both branches below raise.
        if entrypoint is not None and len(entrypoint) == 0:
            raise ValueError(
                'no entrypoint specified, run with \n' '`konduktor launch task.yaml'
            )
        raise ValueError(f'{entrypoint} is not a valid YAML spec,')

    # Resource-level overrides (accelerators/cpus/memory/image/disk).
    override_params = _parse_override_params(
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        image_id=image_id,
        disk_size=disk_size,
    )

    # Serving-level overrides (replicas/ports/probe), applied only when
    # the task declares a serving section (see task.serving check below).
    serving_override_params = _parse_serving_override_params(
        num_nodes=num_nodes,
        min_replicas=min_replicas,
        max_replicas=max_replicas,
        ports=ports,
        probe=probe,
    )

    if field_to_ignore is not None:
        _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore)

    assert entrypoint is not None
    task_configs = common_utils.read_yaml_all(entrypoint)
    assert len(task_configs) == 1, 'Only single tasks are supported'
    task = konduktor.Task.from_yaml_config(task_configs[0], env)
    # Override.
    if workdir is not None:
        task.workdir = workdir

    # perform overrides from CLI
    if override_params:
        task.set_resources_override(override_params)
    if task.serving:
        task.set_serving_override(serving_override_params)

    if max_restarts is not None:
        assert task.resources is not None
        task.resources.job_config['max_restarts'] = max_restarts
    if completions is not None:
        assert task.resources is not None
        task.resources.job_config['completions'] = completions
    if num_nodes is not None:
        # num_nodes also feeds serving_override_params above; here it
        # additionally overrides the task's own node count.
        task.num_nodes = num_nodes
    if name is not None:
        task.name = name
    return task
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# Reusable click option decorators shared by task-launching commands;
# applied to a command function via _add_click_options(_TASK_OPTIONS).
_TASK_OPTIONS = [
    click.option(
        '--workdir',
        required=False,
        type=click.Path(exists=True, file_okay=False),
        help=(
            'If specified, sync this dir to the remote working directory, '
            'where the task will be invoked. '
            'Overrides the "workdir" config in the YAML if both are supplied.'
        ),
    ),
    click.option(
        '--cloud',
        required=False,
        type=str,
        help=(
            'The cloud to use. If specified, overrides the "resources.cloud" '
            'config. Passing "none" resets the config. [defunct] currently '
            'only supports a single cloud'
        ),
    ),
    click.option(
        '--num-nodes',
        required=False,
        type=int,
        help=(
            'Number of nodes to execute the task on. '
            'Overrides the "num_nodes" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--max-restarts',
        required=False,
        type=int,
        help=(
            'Maximum number of jobset restarts allowed. Overrides YAML.'
            'Overrides the "max_restarts" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--completions',
        required=False,
        type=int,
        help=(
            'Number of successful completions required. Overrides YAML.'
            'Overrides the "completions" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--cpus',
        default=None,
        type=str,
        required=False,
        help=(
            'Number of vCPUs each instance must have (e.g., '
            '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
            'This is used to automatically select the instance type.'
        ),
    ),
    click.option(
        '--memory',
        default=None,
        type=str,
        required=False,
        help=(
            'Amount of memory each instance must have in GB (e.g., '
            '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'
        ),
    ),
    click.option(
        '--disk-size',
        default=None,
        type=int,
        required=False,
        help=('OS disk size in GBs.'),
    ),
    click.option(
        '--image-id',
        required=False,
        default=None,
        help=(
            'Custom image id for launching the instances. '
            'Passing "none" resets the config.'
        ),
    ),
    # NOTE: type=dotenv.dotenv_values makes click parse the file path into
    # a dict of env vars at option-conversion time.
    click.option(
        '--env-file',
        required=False,
        type=dotenv.dotenv_values,
        help="""\
Path to a dotenv file with environment variables to set on the remote
node.

If any values from ``--env-file`` conflict with values set by
``--env``, the ``--env`` value will be preferred.""",
    ),
    # NOTE: type=_parse_env_var converts each --env occurrence to a
    # (KEY, VALUE) tuple; bare KEY reads the local environment.
    click.option(
        '--env',
        required=False,
        type=_parse_env_var,
        multiple=True,
        help="""\
Environment variable to set on the remote node.
It can be specified multiple times.
Examples:

\b
1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.

2. ``--env MY_ENV2=$HOME``: set ``$MY_ENV2`` on the cluster to be the
same value of ``$HOME`` in the local environment where the CLI command
is run.

3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
same value of ``$MY_ENV3`` in the local environment.""",
    ),
]
|
|
305
|
+
# _TASK_OPTIONS plus a leading --name/-n option, for commands that let the
# user rename the task from the CLI.
_TASK_OPTIONS_WITH_NAME = [
    click.option(
        '--name',
        '-n',
        required=False,
        type=str,
        help=(
            'Task name. Overrides the "name" '
            'config in the YAML if both are supplied.'
        ),
    ),
] + _TASK_OPTIONS
|
|
317
|
+
# Accelerator-related click options, kept separate from _TASK_OPTIONS so
# commands can opt in to GPU selection independently.
_EXTRA_RESOURCES_OPTIONS = [
    click.option(
        '--gpus',
        required=False,
        type=str,
        help=(
            'Type and number of GPUs to use. Example values: '
            '"V100:8", "V100" (short for a count of 1)'
            'If a new cluster is being launched by this command, this is the '
            'resources to provision. If an existing cluster is being reused, this'
            " is seen as the task demand, which must fit the cluster's total "
            'resources and is used for scheduling the task. '
            'Overrides the "accelerators" '
            'config in the YAML if both are supplied. '
            'Passing "none" resets the config.'
        ),
    ),
]
|
|
335
|
+
# Serving-specific click options (replica bounds, service port, health
# probe path); consumed via _parse_serving_override_params.
_EXTRA_SERVING_OPTIONS = [
    click.option(
        '--min-replicas',
        required=False,
        type=int,
        help=(
            'Minimum number of replicas to run for the service. '
            'Overrides the "min_replicas" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--max-replicas',
        required=False,
        type=int,
        help=(
            'Maximum number of replicas to allow for the service. '
            'Overrides the "max_replicas" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--ports',
        required=False,
        type=int,
        help=(
            'The container port on which your service will listen for HTTP '
            'traffic. Overrides the "ports" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--probe',
        required=False,
        type=str,
        help=(
            'The HTTP path to use for health checks (liveness, readiness, and '
            'startup probes). Overrides the "probe" field in the YAML '
            'if both are supplied. The service should respond with HTTP 200 on '
            'this path when healthy.'
        ),
    ),
]
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _get_click_major_version():
    """Return click's major version number as an int (e.g. 8 for '8.1.7')."""
    major, _, _ = click.__version__.partition('.')
    return int(major)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# Shell commands for re-sourcing rc files; presumably shown to users as
# hints by commands outside this chunk (no reference in the visible code).
_RELOAD_ZSH_CMD = 'source ~/.zshrc'
_RELOAD_BASH_CMD = 'source ~/.bashrc'
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _add_click_options(options: List[click.Option]):
    """A decorator for adding a list of click option decorators."""

    def _apply_options(func):
        # Apply in reverse so the options render in their declared order
        # (each decorator prepends its option to the command's list).
        for opt in options[::-1]:
            func = opt(func)
        return func

    return _apply_options
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _parse_override_params(
|
|
400
|
+
gpus: Optional[str] = None,
|
|
401
|
+
cpus: Optional[str] = None,
|
|
402
|
+
memory: Optional[str] = None,
|
|
403
|
+
image_id: Optional[str] = None,
|
|
404
|
+
disk_size: Optional[int] = None,
|
|
405
|
+
) -> Dict[str, Any]:
|
|
406
|
+
"""Parses the override parameters into a dictionary."""
|
|
407
|
+
override_params: Dict[str, Any] = {}
|
|
408
|
+
if gpus is not None:
|
|
409
|
+
if gpus.lower() == 'none':
|
|
410
|
+
override_params['accelerators'] = None
|
|
411
|
+
else:
|
|
412
|
+
override_params['accelerators'] = gpus
|
|
413
|
+
if cpus is not None:
|
|
414
|
+
if cpus.lower() == 'none':
|
|
415
|
+
override_params['cpus'] = None
|
|
416
|
+
else:
|
|
417
|
+
override_params['cpus'] = cpus
|
|
418
|
+
if memory is not None:
|
|
419
|
+
if memory.lower() == 'none':
|
|
420
|
+
override_params['memory'] = None
|
|
421
|
+
else:
|
|
422
|
+
override_params['memory'] = memory
|
|
423
|
+
if image_id is not None:
|
|
424
|
+
if image_id.lower() == 'none':
|
|
425
|
+
override_params['image_id'] = None
|
|
426
|
+
else:
|
|
427
|
+
# Validate Docker image before adding to override params
|
|
428
|
+
validator.validate_and_warn_image(image_id, 'task')
|
|
429
|
+
override_params['image_id'] = image_id
|
|
430
|
+
if disk_size is not None:
|
|
431
|
+
override_params['disk_size'] = disk_size
|
|
432
|
+
return override_params
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _parse_serving_override_params(
|
|
436
|
+
num_nodes: Optional[int] = None,
|
|
437
|
+
min_replicas: Optional[int] = None,
|
|
438
|
+
max_replicas: Optional[int] = None,
|
|
439
|
+
ports: Optional[int] = None,
|
|
440
|
+
probe: Optional[str] = None,
|
|
441
|
+
) -> Dict[str, Any]:
|
|
442
|
+
"""Parses the relevant serving override parameters into a dictionary."""
|
|
443
|
+
override_params: Dict[str, Any] = {}
|
|
444
|
+
if num_nodes is not None:
|
|
445
|
+
override_params['num_nodes'] = num_nodes
|
|
446
|
+
if min_replicas is not None:
|
|
447
|
+
override_params['min_replicas'] = min_replicas
|
|
448
|
+
if max_replicas is not None:
|
|
449
|
+
override_params['max_replicas'] = max_replicas
|
|
450
|
+
if ports is not None:
|
|
451
|
+
override_params['ports'] = ports
|
|
452
|
+
if probe is not None:
|
|
453
|
+
override_params['probe'] = probe
|
|
454
|
+
|
|
455
|
+
return override_params
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _launch_with_confirm(
    task: konduktor.Task,
    *,
    dryrun: bool,
    detach_run: bool,
    no_confirm: bool,
    serving: bool,
):
    """Launch a cluster with a Task.

    Unless ``no_confirm`` is set, shows an interactive confirmation prompt
    (aborting the CLI on "no") before delegating to ``konduktor.launch``.

    Args:
        task: The task to launch.
        dryrun: Passed through to ``konduktor.launch``.
        detach_run: Passed through to ``konduktor.launch``.
        no_confirm: Skip the interactive prompt.
        serving: Changes only the wording of the prompt/status message
            (deployment vs. job).
    """

    confirm_shown = False
    if not no_confirm:
        # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
        # it exists but is STOPPED.
        if serving:
            prompt = (
                f'Launching a new deployment {colorama.Style.BRIGHT}'
                f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
                'Proceed?'
            )
        else:
            prompt = (
                f'Launching a new job {colorama.Style.BRIGHT}'
                f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
                'Proceed?'
            )
        # NOTE(review): prompt is assigned on both branches above, so this
        # condition is always true here; abort=True exits the CLI on "no".
        if prompt is not None:
            confirm_shown = True
            click.confirm(prompt, default=True, abort=True, show_default=True)

    if not confirm_shown:
        # No prompt was shown (--yes path): print a status line instead.
        if serving:
            click.secho(f'Creating deployment {task.name}...', fg='yellow')
        else:
            click.secho(f'Running task {task.name}...', fg='yellow')
    return konduktor.launch(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
    )
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Checks if entrypoint is a readable YAML file.

    Args:
        entrypoint: Path to a YAML file.

    Returns:
        (is_yaml, result): ``is_yaml`` is True when the file opened and
        parsed as YAML (and the first document is not a bare string);
        ``result`` is the first parsed YAML document ({} for an empty
        file, None when parsing/opening failed).
    """
    is_yaml = True
    config: Optional[List[Dict[str, Any]]] = None
    result = None
    shell_splits = shlex.split(entrypoint)
    # Heuristic: a single shell token ending in yaml/.yml means the user
    # intended a YAML path; used below to decide whether to warn/confirm.
    yaml_file_provided = len(shell_splits) == 1 and (
        shell_splits[0].endswith('yaml') or shell_splits[0].endswith('.yml')
    )
    invalid_reason = ''
    try:
        with open(entrypoint, 'r', encoding='utf-8') as f:
            try:
                # Multi-document load; only the first document is returned.
                config = list(yaml.safe_load_all(f))
                if config:
                    result = config[0]
                else:
                    result = {}
                if isinstance(result, str):
                    # 'konduktor exec cluster ./my_script.sh'
                    is_yaml = False
            except yaml.YAMLError as e:
                if yaml_file_provided:
                    logger.debug(e)
                    detailed_error = f'\nYAML Error: {e}\n'
                    invalid_reason = (
                        'contains an invalid configuration. '
                        'Please check syntax.\n'
                        f'{detailed_error}'
                    )
                is_yaml = False

    except OSError:
        # open() failed: diagnose why, but only bother when the user
        # clearly meant to pass a YAML path.
        if yaml_file_provided:
            entry_point_path = os.path.expanduser(entrypoint)
            if not os.path.exists(entry_point_path):
                invalid_reason = (
                    'does not exist. Please check if the path' ' is correct.'
                )
            elif not os.path.isfile(entry_point_path):
                invalid_reason = (
                    'is not a file. Please check if the path' ' is correct.'
                )
            else:
                invalid_reason = (
                    'yaml.safe_load() failed. Please check if the' ' path is correct.'
                )
        is_yaml = False
    if not is_yaml:
        if yaml_file_provided:
            # Interactive escape hatch: aborts the CLI unless the user
            # confirms the entrypoint should run as a remote command.
            click.confirm(
                f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n'
                'It will be treated as a command to be run remotely. Continue?',
                abort=True,
            )
    return is_yaml, result
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def _pop_and_ignore_fields_in_override_params(
|
|
563
|
+
params: Dict[str, Any], field_to_ignore: List[str]
|
|
564
|
+
) -> None:
|
|
565
|
+
"""Pops and ignores fields in override params.
|
|
566
|
+
|
|
567
|
+
Args:
|
|
568
|
+
params: Override params.
|
|
569
|
+
field_to_ignore: Fields to ignore.
|
|
570
|
+
|
|
571
|
+
Returns:
|
|
572
|
+
Override params with fields ignored.
|
|
573
|
+
"""
|
|
574
|
+
if field_to_ignore is not None:
|
|
575
|
+
for field in field_to_ignore:
|
|
576
|
+
field_value = params.pop(field, None)
|
|
577
|
+
if field_value is not None:
|
|
578
|
+
click.secho(
|
|
579
|
+
f'Override param {field}={field_value} is ignored.', fg='yellow'
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
class _NaturalOrderGroup(click.Group):
    """Lists commands in the order defined in this script.

    Reference: https://github.com/pallets/click/issues/513
    """

    def list_commands(self, ctx):
        # click.Group sorts commands alphabetically by default; returning
        # the registration-order keys preserves definition order instead.
        # (The previous no-op `invoke` override that merely delegated to
        # super().invoke(ctx) has been removed; behavior is unchanged.)
        return self.commands.keys()
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
class _DocumentedCodeCommand(click.Command):
    """Corrects help strings for documented commands such that --help displays
    properly and code blocks are rendered in the official web documentation.
    """

    def get_help(self, ctx):
        help_str = ctx.command.help
        # NOTE(review): this rewrites ctx.command.help in place, so the
        # substitution persists beyond this call (idempotent after the
        # first invocation since the directive is gone).
        ctx.command.help = help_str.replace('.. code-block:: bash\n', '\b')
        return super().get_help(ctx)
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
# Root command group. _NaturalOrderGroup keeps `konduktor --help` listing
# subcommands in definition order. Two version_option decorators register
# separate eager flags: --version/-v (package version) and --commit/-c
# (build commit hash).
@click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
@click.version_option(konduktor.__version__, '--version', '-v', prog_name='konduktor')
@click.version_option(
    konduktor.__commit__,
    '--commit',
    '-c',
    prog_name='konduktor',
    message='%(prog)s, commit %(version)s',
    help='Show the commit hash and exit',
)
def cli():
    # Intentionally empty: subcommands attach via @cli.command().
    # (No docstring on purpose — click would surface it as help text.)
    pass
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
@cli.command()
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all jobs, including those not owned by the current user.',
)
@click.option(
    '--limit',
    '-l',
    default=None,
    type=int,
    help='Maximum number of jobs to display (e.g., --limit 100)',
)
@click.option(
    '--after',
    default=None,
    type=str,
    help=(
        'Show jobs created after this timestamp '
        '(e.g., --after "08/06/25 03:54PM", --after "08/06/25", --after "03:54PM")'
    ),
)
@click.option(
    '--before',
    default=None,
    type=str,
    help=(
        'Show jobs created before this timestamp '
        '(e.g., --before "08/06/25 03:54PM", --before "08/06/25", --before "03:54PM")'
    ),
)
# pylint: disable=redefined-builtin
def status(
    all_users: bool, limit: Optional[int], after: Optional[str], before: Optional[str]
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Shows list of all the jobs with optional filtering and pagination.

    \b
    Examples:
      konduktor status --limit 10
      konduktor status --before "08/06/25 03:53PM"
      konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"

    \b
    Notes:
      • When using --before or --after timestamps, "08/06/25"
        is equivalent to "08/06/25 00:00".
      • "03:53PM" is equivalent to "03:53:00PM".
      • Timestamps shown in "konduktor status" are truncated
        and are in the local timezone.
        Example: "03:53:55PM" → "03:53PM" — would show up in
        --after "03:53PM" but not in --before "03:53PM".
    """
    # Resolve the namespace from the active kubeconfig context.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    # Header line: per-user hash, or the literal 'All' with --all-users.
    user = common_utils.user_and_hostname_hash() if not all_users else 'All'
    click.secho(f'User: {user}', fg='green', bold=True)
    click.secho('Jobs', fg='cyan', bold=True)
    # Table rendering and timestamp filtering live in jobset_utils.
    jobset_utils.show_status_table(
        namespace, all_users=all_users, limit=limit, after=after, before=before
    )
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
@cli.command()
@click.option(
    '--status',
    is_flag=True,
    default=False,
    help=(
        'If specified, do not show logs but exit with a status code for the '
        "job's status: 0 for succeeded, or 1 for all other statuses."
    ),
)
@click.option(
    '--follow/--no-follow',
    is_flag=True,
    default=True,
    help=(
        'Follow the logs of a job. '
        'If --no-follow is specified, print the log so far and exit. '
        '[default: --follow]'
    ),
)
@click.option(
    '--num-lines',
    '--num_lines',
    '-n',
    default=-1,
    type=int,
    help=(
        'The number of lines to display from the end of the log file. '
        'Default is -1 (no limit).'
    ),
)
@click.option(
    '--node-rank',
    '--node_rank',
    '-N',
    default=0,
    type=int,
    help='The node rank to tail logs from.',
)
@click.option(
    '--start-offset',
    '--start_offset',
    type=str,
    required=False,
    default='1h',
    help=(
        'Choose how much time from now to look back in logs. '
        # Fix: trailing space added so the rendered help text does not read
        # "...Default is 1h.Note: currently...".
        'Examples: 30s, 5m, 2h, 1d. Default is 1h. '
        'Note: currently only applies when streaming (default --follow). '
        'With --no-follow, all available logs are returned.'
    ),
)
@click.argument('job_id', type=str, nargs=1)
# TODO(zhwu): support logs by job name
def logs(
    status: bool,
    job_id: str,
    follow: bool,
    num_lines: int,
    node_rank: int,
    start_offset: str,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Retrieve/tail the log of a job.

    Args:
        status: deprecated flag; passing it raises a UsageError.
        job_id: name of the jobset whose logs to fetch.
        follow: stream logs until interrupted when True; dump what exists
            and exit when False.
        num_lines: number of trailing lines to show (-1 means no limit).
        node_rank: which node (worker) of the job to tail.
        start_offset: how far back from now to start streaming (e.g. '30s',
            '5m', '2h', '1d'); only applies when following.

    Raises:
        click.UsageError: if --status is passed or job_id is empty.
    """
    if status:
        raise click.UsageError('`--status` is being deprecated')

    # Check if the job exists
    # NOTE(review): job_id is a required click argument, so this guard is
    # likely unreachable; kept as defense in depth.
    if not job_id:
        raise click.UsageError('Please provide a job ID.')

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Verify the job exists before attempting to tail logs
    # TODO(asaiacai): unify the 404 logic under jobset_utils
    try:
        _ = jobset_utils.get_jobset(namespace, job_id)
    except jobset_utils.JobNotFoundError:
        message = (
            f"Job '{job_id}' not found in namespace '{namespace}'. "
            f'This may be due to a typo, `konduktor down`, or garbage collected. '
            f'Check your jobs with '
            f'{colorama.Style.BRIGHT}`konduktor status`'
            f'{colorama.Style.RESET_ALL}.'
        )

        # Try to find near string matches to help with typos.
        try:
            job_specs = jobset_utils.list_jobset(namespace)
            job_names = [
                item['metadata']['name'] for item in (job_specs or {}).get('items', [])
            ]
            close_matches = difflib.get_close_matches(
                job_id, job_names, n=3, cutoff=0.4
            )
        except Exception:
            # Suggestions are best-effort only; never mask the original 404.
            close_matches = []

        if close_matches:
            suggestions = ', '.join(
                f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{name}{colorama.Style.NORMAL}'
                for name in close_matches
            )
            message += f'{colorama.Fore.YELLOW} Did you mean: {suggestions}?'

        click.secho(message, fg='yellow')

    # NOTE(review): control intentionally falls through to tail_logs even when
    # the jobset was not found -- logs may still exist in the log store after
    # the jobset is garbage collected. Confirm this is the intended behavior.
    log_utils.tail_logs(
        job_id,
        worker_id=node_rank,
        follow=follow,
        num_logs=num_lines,
        start_offset=start_offset,
    )
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
@click.option(
    '--skip-image-check',
    '-s',
    is_flag=True,
    default=False,
    help='Skip Docker image validation checks for faster startup.',
)
def launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    max_restarts: Optional[int],
    completions: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    yes: bool,
    skip_image_check: bool,
):
    """Launch a task.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    env = _merge_env_vars(env_file, env)

    # Downstream image validation reads this environment variable.
    if skip_image_check:
        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'

    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        max_restarts=max_restarts,
        completions=completions,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
        # serving stuff
        min_replicas=None,
        max_replicas=None,
        ports=None,
        probe=None,
    )

    # Summarize the requested resources before asking for confirmation.
    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    resource_table = log_utils.create_table(
        ['CPUs', 'Mem (GB)', 'GPUs'],
        hrules=prettytable.FRAME,
        vrules=prettytable.NONE,
        border=True,
    )
    assert task.resources is not None
    resource_table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    print(resource_table)

    # A serving spec must go through `konduktor serve launch` instead.
    if task.serving:
        raise click.UsageError(
            'Serving information detected. Use '
            '`konduktor serve launch` instead for serving.'
        )
    try:
        _launch_with_confirm(
            task,
            dryrun=dryrun,
            detach_run=detach_run,
            no_confirm=yes,
            # Always False at this point: a serving task raised above.
            serving=bool(task.serving),
        )
    except KeyboardInterrupt:
        # Ctrl-C while streaming detaches from the job rather than killing it;
        # remind the user how to manage the still-running job.
        click.secho(
            f'Detaching... manage your job {task.name} with the following commands:',
            fg='yellow',
            bold=True,
        )
        click.secho(
            ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, task.name),
            fg='green',
            bold=True,
        )
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def _find_matching_jobs(
    jobs: List[str],
    jobs_response: Dict[str, Any],
    namespace: str,
    all_users: Optional[bool],
):
    """
    Find all jobs matching against the user specified pattern.
    In use in `konduktor down` and `konduktor stop`

    Note(asaiacai): `jobs_response` should be the list of
    all jobsets in this namespace, not necessarily belonging
    to this user.
    """
    job_items = list(jobs_response['items'])

    if all_users:
        # NOTE(review): despite the parameter name, this branch selects only
        # the *current* user's jobs -- confirm the intended semantics.
        assert job_items is not None, f'No jobs found in namespace {namespace}'
        assert len(job_items) > 0, f'No jobs found in namespace {namespace}'
        me = common_utils.get_cleaned_username()
        jobs = [
            item['metadata']['name']
            for item in job_items
            if item['metadata']['labels'][backend_constants.USER_LABEL] == me
        ]
        logger.debug(
            f'Jobs found for user {colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
            f'{common_utils.get_cleaned_username()}{colorama.Style.RESET_ALL}: {jobs}'
        )
        return jobs

    if not jobs:
        raise click.ClickException(
            'No jobs specified. Use --all to specify '
            'all jobs belonging to a user '
            'or specify job names/patterns.'
        )

    # Get all available jobs to match against patterns
    if not job_items:
        raise click.ClickException(f'No jobs found in namespace {namespace}')

    # Map jobset name -> owning user (from the jobset's user label).
    owner_by_name = {
        item['metadata']['name']: item['metadata']['labels'][
            backend_constants.USER_LABEL
        ]
        for item in job_items
    }

    collected: List[str] = []
    for pattern in jobs:
        # fnmatch handles both wildcard and exact pattern matching.
        hits = fnmatch.filter(owner_by_name, pattern)
        if not hits:
            click.secho(
                f'Warning: No jobs found matching pattern "{pattern}"',
                fg='yellow',
                err=True,
            )
        for hit in hits:
            # Warn (but do not block) when acting on another user's job.
            if owner_by_name[hit] != common_utils.get_cleaned_username():
                warning_label = (
                    f'{colorama.Style.BRIGHT}{colorama.Fore.RED}Warning'
                    f'{colorama.Style.RESET_ALL}'
                )
                job_name = (
                    f'{colorama.Style.BRIGHT}{colorama.Fore.WHITE}{hit}'
                    f'{colorama.Style.RESET_ALL}'
                )
                launched_user = (
                    f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
                    f'{owner_by_name[hit]}{colorama.Style.RESET_ALL}'
                )
                current_user = (
                    f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}'
                    f'{common_utils.get_cleaned_username()}'
                    f'{colorama.Style.RESET_ALL}'
                )
                logger.info(
                    f'{warning_label}: job {job_name} was launched by '
                    f'{launched_user}, while the current user is {current_user}',
                )
        collected.extend(hits)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    jobs = list(dict.fromkeys(collected))

    if not jobs:
        raise click.ClickException(
            f'No matching jobs found check status with '
            f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
        )
    return jobs
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for teardown',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def down(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tear down job(s).

    JOB is the name of the job to tear down. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Tearing down a job will delete all associated containers (all billing
    stops), and any data on the containers disks will be lost. Accelerators
    (e.g., GPUs) that are part of the job will be deleted too.

    Wildcard patterns are supported using * characters.
    Examples: "test-*" matches all jobs starting with "test-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Tear down a specific job.
      konduktor down cluster_name
      \b
      # Tear down multiple jobs.
      konduktor down job1 job2
      \b
      # Tear down all jobs matching a pattern.
      konduktor down "test-*"
      \b
      # Tear down all of this users jobs.
      konduktor down -a
      konduktor down --all

      # Tear down all jobs across all users
      konduktor down --all --all-users

    """

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    # Fix: honor `--all`/`-a`. Previously only `all_users` was forwarded, so
    # `konduktor down --all` fell through to the "No jobs specified" error
    # instead of tearing down this user's jobs as documented above.
    filtered_jobs = _find_matching_jobs(
        jobs, jobs_response, namespace, all or all_users
    )

    if not yes:
        # Ask for an explicit confirmation before destroying anything.
        prompt = (
            f'Tearing down job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        click.confirm(prompt, default=True, abort=True, show_default=True)

    for job in track(filtered_jobs, description='Tearing down job(s)...'):
        jobset_utils.delete_jobset(namespace, job)
|
|
1125
|
+
|
|
1126
|
+
|
|
1127
|
+
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Suspend all jobs.')
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for suspension',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def stop(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    """Suspend job(s) (manual/user-initiated).

    JOB is the name of the job to suspend. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Suspending a job will pause execution and mark the job as SUSPENDED (by user).
    The job can be resumed later with `konduktor start`.

    If a job is suspended by the system (e.g., due to queueing),
    it will show as SUSPENDED (by system).

    Wildcard patterns are supported using * characters.
    Examples: "my_job-*" matches all jobs starting with "my_job-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Suspend a specific job.
      konduktor stop my_job
      \b
      # Suspend multiple jobs.
      konduktor stop my_job1 my_job2
      \b
      # Suspend all jobs matching a pattern.
      konduktor stop "my_job-*"
      \b
      # Suspend all of this users jobs.
      konduktor stop -a
      konduktor stop --all

      # Suspend all jobs across all users
      konduktor stop --all --all-users

    """

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    # Fix: honor `--all`/`-a`. Previously only `all_users` was forwarded, so
    # `konduktor stop --all` fell through to the "No jobs specified" error
    # instead of suspending this user's jobs as documented above.
    filtered_jobs = _find_matching_jobs(
        jobs, jobs_response, namespace, all or all_users
    )

    if not yes:
        # Ask for an explicit confirmation before suspending.
        prompt = (
            f'Suspending job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        click.confirm(prompt, default=True, abort=True, show_default=True)

    for job in track(filtered_jobs, description='Suspending job(s)...'):
        jobset_utils.stop_jobset(namespace, job)

    # Show follow-up commands (e.g. how to resume) for the suspended jobs.
    click.secho(
        ux_utils.command_hint_messages(
            ux_utils.CommandHintType.JOB_STOP, filtered_jobs
        ),
        fg='green',
        bold=True,
    )
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option(
    '--all', '-a', default=None, is_flag=True, help='Resume all suspended jobs.'
)
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for resumption',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def start(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    """Resume suspended job(s) (manual/user-initiated).

    JOB is the name of the job to resume. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Resuming a job will restart execution from where it was suspended.
    Only suspended jobs can be resumed.

    This command works for both manually suspended jobs (SUSPENDED by user)
    and system-suspended jobs (SUSPENDED by system).

    Wildcard patterns are supported using * characters.
    Examples: "my_job-*" matches all jobs starting with "my_job-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Resume a specific job.
      konduktor start my_job
      \b
      # Resume multiple jobs.
      konduktor start my_job1 my_job2
      \b
      # Resume all jobs matching a pattern.
      konduktor start "my_job-*"
      \b
      # Resume all of this users suspended jobs.
      konduktor start -a
      konduktor start --all

      # Resume all suspended jobs across all users
      konduktor start --all --all-users

    """

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    job_items = [item for item in jobs_response['items']]

    # NOTE(review): `--all-users` is accepted but not consulted anywhere in
    # this command -- confirm whether resumption should filter by owner.
    def _is_suspended(item):
        # Reads the first replicated-job status entry, mirroring the jobset
        # status shape used elsewhere in this file.
        status = item.get('status', {})
        return status.get('replicatedJobsStatus', [{}])[0].get('suspended', False)

    if all:
        # Only get suspended jobs when using --all
        suspended = [
            item['metadata']['name'] for item in job_items if _is_suspended(item)
        ]
        if not suspended:
            raise click.ClickException(
                f'No suspended jobs found in namespace {namespace}'
            )
        jobs = suspended
    elif jobs:
        # Get all available jobs to match against patterns
        if not job_items:
            raise click.ClickException(f'No jobs found in namespace {namespace}')

        known_names = [item['metadata']['name'] for item in job_items]
        collected = []
        for pattern in jobs:
            # fnmatch handles both wildcard and exact pattern matching.
            hits = fnmatch.filter(known_names, pattern)
            if not hits:
                click.secho(
                    f'Warning: No jobs found matching pattern "{pattern}"',
                    fg='yellow',
                    err=True,
                )
            collected.extend(hits)

        # dict.fromkeys removes duplicates while preserving first-seen order.
        jobs = list(dict.fromkeys(collected))

        if not jobs:
            raise click.ClickException(
                f'No matching jobs found check status with '
                f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
            )
    else:
        raise click.ClickException(
            'No jobs specified. Use --all to resume '
            'all suspended jobs or specify job names/patterns.'
        )

    if not yes:
        # Ask for an explicit confirmation before resuming.
        click.confirm(
            f'Resuming job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?',
            default=True,
            abort=True,
            show_default=True,
        )

    for job in track(jobs, description='Resuming job(s)...'):
        jobset_utils.start_jobset(namespace, job)
|
|
1357
|
+
|
|
1358
|
+
|
|
1359
|
+
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'clouds',
    required=True,
    type=str,
    nargs=-1,
)
def check(clouds: Tuple[str]):
    """Check which clouds are available to use for storage

    This checks storage credentials for a cloud supported by konduktor. If a
    cloud is detected to be inaccessible, the reason and correction steps will
    be shown.

    If CLOUDS are specified, checks credentials for only those clouds.

    The enabled clouds are cached and form the "search space" to be considered
    for each task.

    Examples:

    .. code-block:: bash

      # Check only specific clouds - gs, s3.
      konduktor check gs
      konduktor check s3
    """
    # An empty tuple means "check every supported cloud"; the underlying
    # checker expects None in that case.
    konduktor_check.check(clouds=clouds or None)
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
class KeyValueType(click.ParamType):
    """Click parameter type that parses ``KEY=VALUE`` strings into tuples."""

    name = 'key=value'

    def convert(self, value, param, ctx):
        # Split on the first '=' only, so values may themselves contain '='.
        key, sep, val = value.partition('=')
        if not sep:
            # No '=' at all: reject with click's standard failure mechanism.
            self.fail(f'{value!r} is not a valid key=value pair', param, ctx)
        return key, val
|
|
1398
|
+
|
|
1399
|
+
|
|
1400
|
+
# Shared click options for `konduktor secret create`, applied via
# `_add_click_options`. Exactly one of --inline / --from-file /
# --from-directory is expected; that constraint is validated inside `create`,
# not here.
_SECRET_CREATE_OPTIONS = [
    # Single KEY=VALUE pair, parsed by KeyValueType into a (key, value) tuple.
    click.option(
        '--inline',
        type=KeyValueType(),
        help='Key=value pair to store as an env secret (only valid with --kind env).',
    ),
    # One file whose contents become the secret payload.
    click.option(
        '--from-file',
        '--from_file',
        type=click.Path(dir_okay=False),
        help='Path to a single file to store as a secret.',
    ),
    # A whole directory, zipped and base64-encoded by `create`.
    click.option(
        '--from-directory',
        '--from_directory',
        type=click.Path(file_okay=False),
        help='Path to a directory to store as a multi-file secret.',
    ),
    # Secret flavor; 'git-ssh' and 'env' have extra validation in `create`.
    click.option(
        '--kind',
        default='default',
        type=click.Choice(['default', 'env', 'git-ssh']),
        help='Type of secret being created. More kinds coming soon.',
    ),
]
|
|
1425
|
+
|
|
1426
|
+
|
|
1427
|
+
@cli.group(cls=_NaturalOrderGroup)
def secret():
    """Manage secrets used in Konduktor.

    USAGE: konduktor secret COMMAND

    \b
    Use one of the following COMMANDS:
    create [FLAGS] [NAME]
    delete [NAME]
    list [FLAGS]

    \b
    Examples:
    konduktor secret create --kind git-ssh --from-file=~/.ssh/id_rsa my-ssh-name
    konduktor secret create --kind env --inline FOO=bar my-env-name
    konduktor secret delete my-ssh-name
    konduktor secret list

    \b
    For details on COMMAND ARGS:
    konduktor secret create -h
    konduktor secret list -h
    """
|
|
1451
|
+
|
|
1452
|
+
|
|
1453
|
+
@_add_click_options(_SECRET_CREATE_OPTIONS)
@secret.command()
@click.argument('name', required=True)
def create(kind, from_file, from_directory, inline, name):
    """Create a new secret."""

    # The user-facing name must be a valid Kubernetes resource name, since it
    # becomes part of the Secret object's name.
    if not kubernetes_utils.is_k8s_resource_name_valid(name):
        raise click.BadParameter(
            f'Invalid secret name: {name}. '
            f'Name must consist of lower case alphanumeric characters or -, '
            f'and must start and end with alphanumeric characters.',
        )

    # The stored Secret name is suffixed with the user hash so different users
    # can reuse the same basename without colliding.
    basename = name
    secret_name = f'{basename}-{common_utils.get_user_hash()}'

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Expand '~' up front so all later path checks see absolute-ish paths.
    from_file = os.path.expanduser(from_file) if from_file else None
    from_directory = os.path.expanduser(from_directory) if from_directory else None

    # Exactly one payload source must be given; count which ones are set.
    sources = [bool(from_file), bool(from_directory), bool(inline)]

    if sources.count(True) > 1:
        raise click.UsageError(
            'Only one of --from-file, --from-directory, or --inline can be used.\n'
            'Examples:\n'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name\n{colorama.Style.RESET_ALL}'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    if sources.count(True) == 0:
        raise click.UsageError(
            'You must specify one of --from-file, --from-directory, or --inline.\n'
            'Examples:\n'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name\n{colorama.Style.RESET_ALL}'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    # Validate that the chosen source actually exists on disk.
    if from_file and not os.path.isfile(from_file):
        raise click.BadParameter(
            f'--from-file {from_file} does not exist or is not a file'
        )
    if from_directory and not os.path.isdir(from_directory):
        raise click.BadParameter(
            f'--from-directory {from_directory} does not exist or is not a directory'
        )

    # Kind-specific constraints: git-ssh needs a key file, env needs a pair.
    if kind == 'git-ssh' and not from_file:
        raise click.UsageError(
            '--kind git-ssh requires --from-file (not --from-directory or --inline). \n'
            'Example:\n'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name{colorama.Style.RESET_ALL}'
        )
    if kind == 'env' and not inline:
        raise click.UsageError(
            '--kind env requires --inline (not --from-file or --from-directory). \n'
            'Example:\n'
            f'  {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    # Build the base64-encoded data payload from the chosen source.
    data = {}
    if from_directory:
        click.echo(f'Creating secret from directory: {from_directory}')
        # Use ABSOLUTE directory path so the top-level folder name is preserved
        base_dir_abs = os.path.abspath(os.path.expanduser(from_directory))
        if not os.path.isdir(base_dir_abs):
            raise click.BadParameter(
                f"--from-directory {from_directory} doesn't exist or is not a directory"
            )
        # Ensure there is at least one file inside
        if not any(p.is_file() for p in pathlib.Path(base_dir_abs).rglob('*')):
            raise click.BadParameter(f'--from-directory {from_directory} is empty.')

        # Zip + base64 the WHOLE directory (this preserves the inner structure)
        archive_b64 = base64_utils.zip_base64encode([base_dir_abs])

        # Store as a single key; pod will unzip to the expanded path
        data = {'payload.zip': archive_b64}
    elif from_file:
        click.echo(f'Creating secret from file: {from_file}')
        key = os.path.basename(from_file)
        # git-ssh keys are stored under a fixed key name the pod looks up.
        if kind == 'git-ssh':
            key = 'gitkey'
        try:
            with open(from_file, 'rb') as f:
                data[key] = b64encode(f.read()).decode()
        except OSError as e:
            raise click.ClickException(f'Failed to read {kind} file {from_file}: {e}')
    else:
        click.echo('Creating secret from inline key=value pair')
        # `inline` is a (key, value) tuple produced by KeyValueType.
        key, value = inline
        data = {key: b64encode(value.encode()).decode()}

    secret_metadata = {
        'name': secret_name,
        'labels': {
            'parent': 'konduktor',
            backend_constants.SECRET_OWNER_LABEL: common_utils.get_user_hash(),
            backend_constants.SECRET_BASENAME_LABEL: basename,
            # NOTE(review): `kind` defaults to 'default' and is never falsy
            # here, so `or None` is dead -- confirm and simplify later.
            backend_constants.SECRET_KIND_LABEL: kind or None,
        },
    }

    # Limit --kind git-ssh secret to 1 max per user
    # Overwrites if user trying to create more than 1
    if kind == 'git-ssh':
        user_hash = common_utils.get_user_hash()
        label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
        existing = kubernetes_utils.list_secrets(
            namespace, context, label_filter=label_selector
        )
        for s in existing:
            labels = s.metadata.labels or {}
            if labels.get(backend_constants.SECRET_KIND_LABEL) == 'git-ssh':
                old_name = s.metadata.name
                click.echo(f'Found existing git-ssh secret: {old_name}, deleting it.')
                kubernetes_utils.delete_secret(
                    name=old_name, namespace=namespace, context=context
                )
                break

    # Create (or update) the Secret in the cluster.
    ok, err = kubernetes_utils.set_secret(
        secret_name=secret_name,
        namespace=namespace,
        context=context,
        data=data,
        secret_metadata=secret_metadata,
    )
    if not ok:
        raise click.ClickException(f'Failed to create secret: {err}')
    click.secho(f'Secret {basename} created in namespace {namespace}.', fg='green')
|
|
1592
|
+
|
|
1593
|
+
|
|
1594
|
+
@secret.command()
@click.argument('name', required=True)
def delete(name):
    """Delete a secret by name."""

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Only consider secrets owned by the current user.
    owner_selector = (
        f'{backend_constants.SECRET_OWNER_LABEL}={common_utils.get_user_hash()}'
    )
    owned_secrets = kubernetes_utils.list_secrets(
        namespace, context, label_filter=owner_selector
    )

    # Resolve the user-facing basename to the full secret object(s).
    candidates = []
    for secret_obj in owned_secrets:
        labels = secret_obj.metadata.labels
        if labels and labels.get(backend_constants.SECRET_BASENAME_LABEL) == name:
            candidates.append(secret_obj)

    if not candidates:
        raise click.ClickException(
            f'No secret named "{name}" owned by you found in namespace {namespace}.'
        )
    if len(candidates) > 1:
        raise click.ClickException(f'Multiple secrets with basename "{name}" found.')

    full_name = candidates[0].metadata.name

    ok, err = kubernetes_utils.delete_secret(full_name, namespace, context)
    if not ok:
        raise click.ClickException(f'Failed to delete secret: {err}')
    click.secho(f'Secret {name} deleted from namespace {namespace}.', fg='yellow')
|
|
1628
|
+
|
|
1629
|
+
|
|
1630
|
+
@secret.command(name='list')
@click.option(
    '--all-users',
    '--all_users',
    '-u',
    is_flag=True,
    default=False,
    help='Show all secrets, including those not owned by the current user.',
)
def list_secrets(all_users: bool):
    """List secrets in the namespace.
    Defaults to only your secrets unless --all-users is set."""

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    if all_users:
        secrets = kubernetes_utils.list_secrets(namespace, context)
    else:
        # Filter down to secrets labeled with the current user's hash.
        user_hash = common_utils.get_user_hash()
        username = common_utils.get_cleaned_username()
        owner_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
        secrets = kubernetes_utils.list_secrets(
            namespace, context, label_filter=owner_selector
        )

    if not secrets:
        if all_users:
            click.secho(f'No secrets found in {namespace}.', fg='yellow')
        else:
            click.secho(f'No secrets found for {username} in {namespace}.', fg='yellow')
        return

    header = (
        f'All secrets in {namespace} namespace:\n'
        if all_users
        else f'Secrets in {namespace} namespace owned by you:\n'
    )
    click.secho(header, bold=True)

    for secret_obj in secrets:
        labels = secret_obj.metadata.labels or {}
        # Fall back to the raw secret name when the basename label is absent.
        basename = labels.get(
            backend_constants.SECRET_BASENAME_LABEL, secret_obj.metadata.name
        )
        kind = labels.get(backend_constants.SECRET_KIND_LABEL, '(none)')
        if all_users:
            owner = labels.get(backend_constants.SECRET_OWNER_LABEL, '(none)')
            click.echo(f'{basename:30} kind={kind:10} owner={owner}')
        else:
            click.echo(f'{basename:30} kind={kind:10}')
|
|
1678
|
+
|
|
1679
|
+
|
|
1680
|
+
@cli.group(cls=_NaturalOrderGroup)
def serve():
    """Manage deployment serving with Konduktor.

    USAGE: konduktor serve COMMAND

    \b
    Use one of the following COMMANDS:
    launch
    down
    status

    \b
    Examples:
    konduktor serve launch my-deployment
    konduktor serve down my-deployment
    konduktor serve status

    \b
    For details on COMMAND ARGS:
    konduktor serve launch -h
    konduktor serve down -h
    konduktor serve status -h
    """
    # Group container only: subcommands (launch/down/status) are registered
    # below via @serve.command(...). The docstring above doubles as the
    # `konduktor serve -h` help text, so keep it user-facing.
    pass
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
@serve.command(name='launch')
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(
    _TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS + _EXTRA_SERVING_OPTIONS
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
@click.option(
    '--skip-image-check',
    '-s',
    is_flag=True,
    default=False,
    help='Skip Docker image validation checks for faster startup.',
)
def serve_launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    max_restarts: Optional[int],
    completions: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    min_replicas: Optional[int],
    max_replicas: Optional[int],
    ports: Optional[int],
    probe: Optional[str],
    yes: bool,
    skip_image_check: bool = False,
):
    """Launch a deployment to serve.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    # --env-file values are overridden by any explicit --env flags.
    env = _merge_env_vars(env_file, env)

    if skip_image_check:
        # Downstream image validation reads this env var, so setting it here
        # lets the flag take effect without threading it through every call.
        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'

    # Build the task from the entrypoint (YAML file or bash command) with
    # any CLI overrides applied on top.
    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        max_restarts=max_restarts,
        completions=completions,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
        # serving stuff
        min_replicas=min_replicas,
        max_replicas=max_replicas,
        ports=ports,
        probe=probe,
    )

    # Show the user a summary of the requested resources before launching.
    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    table_kwargs = {
        'hrules': prettytable.FRAME,
        'vrules': prettytable.NONE,
        'border': True,
    }
    headers = ['CPUs', 'Mem (GB)', 'GPUs']
    table = log_utils.create_table(headers, **table_kwargs)
    assert task.resources is not None
    table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    # Use click.echo (not bare print) for output, consistent with the rest
    # of this CLI's output handling.
    click.echo(table)

    # `serve launch` only makes sense for tasks with a serving section;
    # plain workloads should go through `konduktor launch`.
    if not task.serving:
        raise click.UsageError(
            'No serving information detected. '
            'Use `konduktor launch` instead for workloads.'
        )

    job_name = _launch_with_confirm(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
        no_confirm=yes,
        serving=bool(task.serving),
    )

    click.secho(f'Deployment Name: {job_name}', fg='green', bold=True)
|
|
1838
|
+
|
|
1839
|
+
|
|
1840
|
+
@serve.command(name='down')
@click.argument('names', nargs=-1, required=False)
@click.option(
    '--all', '-a', default=False, is_flag=True, help='Tear down all deployments.'
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def serve_down(
    names: List[str],
    all: bool,
    yes: bool,
):
    """Tear down deployments (Deployment, Service, PodAutoscaler).

    Use --all or -a to tear down all deployments.

    Examples:

    \b
    konduktor serve down my-deployment
    konduktor serve down -a
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    deployed = deployment_utils.list_models(namespace)

    if all:
        names = deployed
        if not names:
            logger.warning(
                f'No deployments found in namespace '
                f'{namespace}, but continuing teardown.'
            )
    elif names:
        # Expand shell-style glob patterns against the deployed model names,
        # de-duplicating matches across patterns.
        names = sorted(
            {hit for pattern in names for hit in fnmatch.filter(deployed, pattern)}
        )
        if not names:
            raise click.ClickException(
                f'No matching deployments found. Check with: '
                f'{colorama.Style.BRIGHT}konduktor serve '
                f'status{colorama.Style.RESET_ALL}'
            )
    else:
        raise click.ClickException(
            'No deployments specified. Use --all to tear down all deployments '
            'or pass names/patterns.'
        )

    if not yes:
        prompt = (
            f'Tearing down deployment(s) '
            f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}{names}'
            f'{colorama.Style.RESET_ALL}. '
            f'Proceed?'
        )
        click.confirm(prompt, default=True, abort=True, show_default=True)

    for name in track(names, description='Tearing down deployment(s)...'):
        deployment_utils.delete_serving_specs(name, namespace)
|
|
1908
|
+
|
|
1909
|
+
|
|
1910
|
+
@serve.command(name='status')
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all deployments, including those not owned by the current user.',
)
@click.option(
    '--direct',
    '-d',
    default=False,
    is_flag=True,
    required=False,
    help='Force display of direct IP endpoints instead of trainy.us endpoints.',
)
def serve_status(all_users: bool, direct: bool):
    """Show status of deployments launched via `konduktor serve launch`."""
    # Resolve the active kube context/namespace, then delegate rendering.
    kube_context = kubernetes_utils.get_current_kube_config_context_name()
    kube_namespace = kubernetes_utils.get_kube_config_context_namespace(kube_context)
    deployment_utils.show_status_table(
        kube_namespace, all_users=all_users, force_direct=direct
    )
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
def main():
    """Console-script entry point for the `konduktor` CLI.

    Invokes click with ``standalone_mode=False`` so exceptions propagate to
    the caller, and converts a click ``Abort`` (e.g. a declined or
    interrupted confirmation prompt) into a friendly message.
    """
    try:
        result = cli(standalone_mode=False)
    except click.exceptions.Abort:
        click.secho('Detaching...', fg='yellow', bold=True)
        return None
    return result
|
|
1942
|
+
|
|
1943
|
+
|
|
1944
|
+
# Allow running this module directly (python -m / python cli.py).
if __name__ == '__main__':
    main()
|