konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/cli.py
ADDED
@@ -0,0 +1,790 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The 'konduktor' command line tool.

Example usage:

  # See available commands.
  >> konduktor

  # Run a task, described in a yaml file.
  >> konduktor launch task.yaml

  # Show the list of scheduled jobs.
  >> konduktor status

  # Tear down a specific job.
  >> konduktor down job_name

  # Tear down all scheduled jobs.
  >> konduktor down -a

NOTE: the order of command definitions in this file corresponds to how they are
listed in "konduktor --help". Take care to put logically connected commands
close to each other.
"""

import os
import shlex
from typing import Any, Dict, List, Optional, Tuple

import click
import colorama
import dotenv
import prettytable
import yaml
from rich.progress import track

import konduktor
from konduktor import check as konduktor_check
from konduktor import logging
from konduktor.backends import jobset_utils
from konduktor.utils import (
    common_utils,
    kubernetes_utils,
    log_utils,
    loki_utils,
    ux_utils,
)

_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])

logger = logging.get_logger(__name__)


def _parse_env_var(env_var: str) -> Tuple[str, str]:
    """Parse env vars into a (KEY, VAL) pair."""
    if '=' not in env_var:
        value = os.environ.get(env_var)
        if value is None:
            raise click.UsageError(f'{env_var} is not set in local environment.')
        return (env_var, value)
    ret = tuple(env_var.split('=', 1))
    if len(ret) != 2:
        raise click.UsageError(
            f'Invalid env var: {env_var}. Must be in the form of KEY=VAL or KEY.'
        )
    return ret[0], ret[1]
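

# A sketch of how _parse_env_var resolves its input (values illustrative;
# assumes FOO=bar is set in the local environment):
#
#   _parse_env_var('KEY=VAL')  # -> ('KEY', 'VAL')
#   _parse_env_var('KEY=a=b')  # -> ('KEY', 'a=b'); splits on the first '=' only
#   _parse_env_var('FOO')      # -> ('FOO', 'bar'), read from os.environ
#   _parse_env_var('MISSING')  # raises click.UsageError if MISSING is unset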


def _merge_env_vars(
    env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merges all values from env_list into env_dict."""
    if not env_dict:
        return env_list
    for key, value in env_list:
        env_dict[key] = value
    return list(env_dict.items())
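

# For example (a sketch): values passed with ``--env`` win over ``--env-file``,
# since the dotenv dict is filled first and then overwritten by the CLI list:
#
#   _merge_env_vars({'MY_ENV': 'from_file'}, [('MY_ENV', 'from_cli')])
#   # -> [('MY_ENV', 'from_cli')]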


def _make_task_with_overrides(
    entrypoint: Tuple[str, ...],
    *,
    entrypoint_name: str = 'konduktor.Task',
    name: Optional[str] = None,
    workdir: Optional[str] = None,
    cloud: Optional[str] = None,
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    instance_type: Optional[str] = None,
    num_nodes: Optional[int] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
    env: Optional[List[Tuple[str, str]]] = None,
    field_to_ignore: Optional[List[str]] = None,
) -> konduktor.Task:
    """Creates a task or a dag from an entrypoint with overrides.

    Returns:
        konduktor.Task
    """
    entrypoint = ' '.join(entrypoint)
    is_yaml, _ = _check_yaml(entrypoint)
    entrypoint: Optional[str]
    if is_yaml:
        # Treat entrypoint as a yaml.
        click.secho(f'{entrypoint_name} from YAML spec: ', fg='yellow', nl=False)
        click.secho(entrypoint, bold=True)
    else:
        if entrypoint is not None and len(entrypoint) == 0:
            raise ValueError(
                'no entrypoint specified, run with\n`konduktor launch task.yaml`'
            )
        raise ValueError(f'{entrypoint} is not a valid YAML spec.')

    override_params = _parse_override_params(
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        image_id=image_id,
        disk_size=disk_size,
    )

    if field_to_ignore is not None:
        _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore)

    assert entrypoint is not None
    task_configs = common_utils.read_yaml_all(entrypoint)
    assert len(task_configs) == 1, 'Only single tasks are supported'
    task = konduktor.Task.from_yaml_config(task_configs[0], env)
    # Override.
    if workdir is not None:
        task.workdir = workdir

    task.set_resources_override(override_params)

    if num_nodes is not None:
        task.num_nodes = num_nodes
    if name is not None:
        task.name = name
    return task


_TASK_OPTIONS = [
    click.option(
        '--workdir',
        required=False,
        type=click.Path(exists=True, file_okay=False),
        help=(
            'If specified, sync this dir to the remote working directory, '
            'where the task will be invoked. '
            'Overrides the "workdir" config in the YAML if both are supplied.'
        ),
    ),
    click.option(
        '--cloud',
        required=False,
        type=str,
        help=(
            'The cloud to use. If specified, overrides the "resources.cloud" '
            'config. Passing "none" resets the config. [defunct] currently '
            'only supports a single cloud'
        ),
    ),
    click.option(
        '--num-nodes',
        required=False,
        type=int,
        help=(
            'Number of nodes to execute the task on. '
            'Overrides the "num_nodes" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--cpus',
        default=None,
        type=str,
        required=False,
        help=(
            'Number of vCPUs each instance must have (e.g., '
            '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
            'This is used to automatically select the instance type.'
        ),
    ),
    click.option(
        '--memory',
        default=None,
        type=str,
        required=False,
        help=(
            'Amount of memory each instance must have in GB (e.g., '
            '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'
        ),
    ),
    click.option(
        '--disk-size',
        default=None,
        type=int,
        required=False,
        help='OS disk size in GBs.',
    ),
    click.option(
        '--image-id',
        required=False,
        default=None,
        help=(
            'Custom image id for launching the instances. '
            'Passing "none" resets the config.'
        ),
    ),
    click.option(
        '--env-file',
        required=False,
        type=dotenv.dotenv_values,
        help="""\
        Path to a dotenv file with environment variables to set on the remote
        node.

        If any values from ``--env-file`` conflict with values set by
        ``--env``, the ``--env`` value will be preferred.""",
    ),
    click.option(
        '--env',
        required=False,
        type=_parse_env_var,
        multiple=True,
        help="""\
        Environment variable to set on the remote node.
        It can be specified multiple times.
        Examples:

        \b
        1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.

        2. ``--env MY_ENV2=$HOME``: set ``$MY_ENV2`` on the cluster to be the
        same value of ``$HOME`` in the local environment where the CLI command
        is run.

        3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
        same value of ``$MY_ENV3`` in the local environment.""",
    ),
]
_TASK_OPTIONS_WITH_NAME = [
    click.option(
        '--name',
        '-n',
        required=False,
        type=str,
        help=(
            'Task name. Overrides the "name" '
            'config in the YAML if both are supplied.'
        ),
    ),
] + _TASK_OPTIONS
_EXTRA_RESOURCES_OPTIONS = [
    click.option(
        '--gpus',
        required=False,
        type=str,
        help=(
            'Type and number of GPUs to use. Example values: '
            '"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
            '(fractional counts are supported by the scheduling framework). '
            'If a new cluster is being launched by this command, this is the '
            'resources to provision. If an existing cluster is being reused, this'
            " is seen as the task demand, which must fit the cluster's total "
            'resources and is used for scheduling the task. '
            'Overrides the "accelerators" '
            'config in the YAML if both are supplied. '
            'Passing "none" resets the config.'
        ),
    ),
]


def _get_click_major_version():
    return int(click.__version__.split('.', maxsplit=1)[0])


_RELOAD_ZSH_CMD = 'source ~/.zshrc'
_RELOAD_BASH_CMD = 'source ~/.bashrc'


def _add_click_options(options: List[click.Option]):
    """A decorator for adding a list of click option decorators."""

    def _add_options(func):
        for option in reversed(options):
            func = option(func)
        return func

    return _add_options
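

# Usage sketch (``my_command`` is hypothetical): one decorator applies a whole
# shared option list instead of stacking a dozen @click.option lines:
#
#   @cli.command()
#   @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
#   def my_command(name, workdir, cloud, num_nodes, cpus, memory, disk_size,
#                  image_id, env_file, env, gpus):
#       ...
#
# The list is applied in reversed() order so that ``--help`` shows the options
# in the same order they are declared in the list.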


def _parse_override_params(
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
) -> Dict[str, Any]:
    """Parses the override parameters into a dictionary."""
    override_params: Dict[str, Any] = {}
    if gpus is not None:
        if gpus.lower() == 'none':
            override_params['accelerators'] = None
        else:
            override_params['accelerators'] = gpus
    if cpus is not None:
        if cpus.lower() == 'none':
            override_params['cpus'] = None
        else:
            override_params['cpus'] = cpus
    if memory is not None:
        if memory.lower() == 'none':
            override_params['memory'] = None
        else:
            override_params['memory'] = memory
    if image_id is not None:
        if image_id.lower() == 'none':
            override_params['image_id'] = None
        else:
            override_params['image_id'] = image_id
    if disk_size is not None:
        override_params['disk_size'] = disk_size
    return override_params
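

# Mapping sketch: a flag's value becomes a resources override entry, and the
# literal string 'none' clears the field so a YAML value can be reset:
#
#   _parse_override_params(gpus='V100:8', cpus='none')
#   # -> {'accelerators': 'V100:8', 'cpus': None}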


def _launch_with_confirm(
    task: konduktor.Task,
    *,
    dryrun: bool,
    detach_run: bool,
    no_confirm: bool,
):
    """Launch a job with a Task."""

    confirm_shown = False
    if not no_confirm:
        # Confirm with the user before launching.
        prompt = (
            f'Launching a new job {colorama.Style.BRIGHT}'
            f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        if prompt is not None:
            confirm_shown = True
            click.confirm(prompt, default=True, abort=True, show_default=True)

    if not confirm_shown:
        click.secho(f'Running task {task.name}...', fg='yellow')
    return konduktor.launch(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
    )


def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Checks if entrypoint is a readable YAML file.

    Args:
        entrypoint: Path to a YAML file.
    """
    is_yaml = True
    config: Optional[List[Dict[str, Any]]] = None
    result = None
    shell_splits = shlex.split(entrypoint)
    yaml_file_provided = len(shell_splits) == 1 and (
        shell_splits[0].endswith('yaml') or shell_splits[0].endswith('.yml')
    )
    invalid_reason = ''
    try:
        with open(entrypoint, 'r', encoding='utf-8') as f:
            try:
                config = list(yaml.safe_load_all(f))
                if config:
                    result = config[0]
                else:
                    result = {}
                if isinstance(result, str):
                    # 'konduktor exec cluster ./my_script.sh'
                    is_yaml = False
            except yaml.YAMLError as e:
                if yaml_file_provided:
                    logger.debug(e)
                    detailed_error = f'\nYAML Error: {e}\n'
                    invalid_reason = (
                        'contains an invalid configuration. '
                        'Please check syntax.\n'
                        f'{detailed_error}'
                    )
                is_yaml = False

    except OSError:
        if yaml_file_provided:
            entry_point_path = os.path.expanduser(entrypoint)
            if not os.path.exists(entry_point_path):
                invalid_reason = (
                    'does not exist. Please check if the path is correct.'
                )
            elif not os.path.isfile(entry_point_path):
                invalid_reason = (
                    'is not a file. Please check if the path is correct.'
                )
            else:
                invalid_reason = (
                    'yaml.safe_load() failed. Please check if the path is correct.'
                )
        is_yaml = False
    if not is_yaml:
        if yaml_file_provided:
            click.confirm(
                f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n'
                'It will be treated as a command to be run remotely. Continue?',
                abort=True,
            )
    return is_yaml, result
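

# Behavior sketch (paths illustrative):
#
#   _check_yaml('task.yaml')     # -> (True, <parsed config>) for readable YAML
#   _check_yaml('echo hello')    # -> (False, None); open() raises OSError and
#                                #    the string is treated as a command
#   _check_yaml('missing.yaml')  # -> (False, None) after click.confirm() warns
#                                #    that the path does not exist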


def _pop_and_ignore_fields_in_override_params(
    params: Dict[str, Any], field_to_ignore: List[str]
) -> None:
    """Pops and ignores fields in override params.

    Args:
        params: Override params.
        field_to_ignore: Fields to ignore.

    Returns:
        None; ``params`` is modified in place.
    """
    if field_to_ignore is not None:
        for field in field_to_ignore:
            field_value = params.pop(field, None)
            if field_value is not None:
                click.secho(
                    f'Override param {field}={field_value} is ignored.', fg='yellow'
                )
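

# For instance (a sketch): override fields a code path does not support are
# dropped with a warning instead of raising:
#
#   params = {'cpus': '4', 'image_id': 'ubuntu'}
#   _pop_and_ignore_fields_in_override_params(params, ['image_id'])
#   # prints 'Override param image_id=ubuntu is ignored.'; params == {'cpus': '4'}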


class _NaturalOrderGroup(click.Group):
    """Lists commands in the order defined in this script.

    Reference: https://github.com/pallets/click/issues/513
    """

    def list_commands(self, ctx):
        return self.commands.keys()

    def invoke(self, ctx):
        return super().invoke(ctx)


class _DocumentedCodeCommand(click.Command):
    """Corrects help strings for documented commands such that --help displays
    properly and code blocks are rendered in the official web documentation.
    """

    def get_help(self, ctx):
        help_str = ctx.command.help
        ctx.command.help = help_str.replace('.. code-block:: bash\n', '\b')
        return super().get_help(ctx)


@click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
@click.version_option(konduktor.__version__, '--version', '-v', prog_name='konduktor')
@click.version_option(
    konduktor.__commit__,
    '--commit',
    '-c',
    prog_name='konduktor',
    message='%(prog)s, commit %(version)s',
    help='Show the commit hash and exit',
)
def cli():
    pass


@cli.command()
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all jobs, including those not owned by the current user.',
)
# pylint: disable=redefined-builtin
def status(all_users: bool):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Shows the list of all jobs.

    Args:
        all_users (bool): whether to show all jobs in this namespace,
        regardless of the user that submitted them.
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    user = common_utils.user_and_hostname_hash() if not all_users else 'All'
    click.secho(f'User: {user}', fg='green', bold=True)
    click.secho('Jobs', fg='cyan', bold=True)
    jobset_utils.show_status_table(namespace, all_users=all_users)


@cli.command()
@click.option(
    '--status',
    is_flag=True,
    default=False,
    help=(
        'If specified, do not show logs but exit with a status code for the '
        "job's status: 0 for succeeded, or 1 for all other statuses."
    ),
)
@click.option(
    '--follow/--no-follow',
    is_flag=True,
    default=True,
    help=(
        'Follow the logs of a job. '
        'If --no-follow is specified, print the log so far and exit. '
        '[default: --follow]'
    ),
)
@click.option(
    '--tail',
    default=1000,
    type=int,
    help=(
        'The number of lines to display from the end of the log file. '
        'Default is 1000.'
    ),
)
@click.argument('job_id', type=str, nargs=1)
# TODO(zhwu): support logs by job name
def logs(
    status: bool,
    job_id: str,
    follow: bool,
    tail: int,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tail the log of a job."""
    if status:
        raise click.UsageError('`--status` is being deprecated.')

    # Check if the job exists
    if not job_id:
        raise click.UsageError('Please provide a job ID.')

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Verify the job exists before attempting to tail logs
    # TODO(asaiacai): unify the 404 logic under jobset_utils
    try:
        jobset_utils.get_jobset(namespace, job_id)
    except jobset_utils.JobNotFoundError:
        raise click.UsageError(
            f"Job '{job_id}' not found in namespace "
            f"'{namespace}'. Check your jobs with "
            f'{colorama.Style.BRIGHT}`konduktor status`'
            f'{colorama.Style.RESET_ALL}.'
        )

    click.secho(
        'Logs are tailed from 1 hour ago; to see more logs, check Grafana.',
        fg='yellow',
    )
    loki_utils.tail_loki_logs_ws(job_id, follow=follow, num_logs=tail)
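

# Shell usage sketch (the job name is hypothetical):
#
#   konduktor logs my-job                         # follow, last 1000 lines
#   konduktor logs --no-follow --tail 200 my-job  # print 200 lines and exit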


@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
def launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    yes: bool,
):
    """Launch a task.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    env = _merge_env_vars(env_file, env)

    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
    )

    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    table_kwargs = {
        'hrules': prettytable.FRAME,
        'vrules': prettytable.NONE,
        'border': True,
    }
    headers = ['CPUs', 'Mem (GB)', 'GPUs']
    table = log_utils.create_table(headers, **table_kwargs)
    assert task.resources is not None
    table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    print(table)

    job_name = _launch_with_confirm(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
        no_confirm=yes,
    )
    click.secho(
        ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, job_name),
        fg='green',
        bold=True,
    )


@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def down(
    jobs: List[str],
    all: Optional[bool],  # pylint: disable=redefined-builtin
    yes: bool,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tear down job(s).

    JOB is the name of the job to tear down. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Tearing down a job will delete all associated containers (all billing
    stops), and any data on the container disks will be lost. Accelerators
    (e.g., GPUs) that are part of the job will be deleted too.


    Examples:

    .. code-block:: bash

      # Tear down a specific job.
      konduktor down job_name
      \b
      # Tear down multiple jobs.
      konduktor down job1 job2
      \b
      # Tear down all existing jobs.
      konduktor down -a

    """

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    if all:
        jobs_specs = jobset_utils.list_jobset(namespace)
        assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
        assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
        jobs = [job['metadata']['name'] for job in jobs_specs['items']]
    if not yes:
        # Confirm with the user before deleting anything.
        prompt = (
            f'Tearing down job(s) {colorama.Style.BRIGHT}'
            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        if prompt is not None:
            click.confirm(prompt, default=True, abort=True, show_default=True)

    for job in track(jobs, description='Tearing down job(s)...'):
        jobset_utils.delete_jobset(namespace, job)


@cli.command(cls=_DocumentedCodeCommand)
@click.argument('clouds', required=True, type=str, nargs=-1)
def check(clouds: Tuple[str]):
    """Check which clouds are available to use for storage.

    This checks storage credentials for a cloud supported by konduktor. If a
    cloud is detected to be inaccessible, the reason and correction steps will
    be shown.

    If CLOUDS are specified, checks credentials for only those clouds.

    The enabled clouds are cached and form the "search space" to be considered
    for each task.

    Examples:

    .. code-block:: bash

      # Check credentials for all supported clouds.
      konduktor check
      # Check only specific clouds - GCP.
      konduktor check gcp
    """
    clouds_arg = clouds if len(clouds) > 0 else None
    konduktor_check.check(clouds=clouds_arg)


def main():
    return cli()


if __name__ == '__main__':
    main()