konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/cli.py ADDED
@@ -0,0 +1,790 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """The 'konduktor' command line tool.
14
+
15
+ Example usage:
16
+
17
+ # See available commands.
18
+ >> konduktor
19
+
20
+ # Run a task, described in a yaml file.
21
+ >> konduktor launch task.yaml
22
+
23
+ # Show the list of scheduled jobs
24
+ >> konduktor status
25
+
26
+ # Tear down a specific job.
27
+ >> konduktor down cluster_name
28
+
29
+ # Tear down all scheduled jobs
30
+ >> konduktor down -a
31
+
32
+ NOTE: the order of command definitions in this file corresponds to how they are
33
+ listed in "konduktor --help". Take care to put logically connected commands close to
34
+ each other.
35
+ """
36
+
37
+ import os
38
+ import shlex
39
+ from typing import Any, Dict, List, Optional, Tuple
40
+
41
+ import click
42
+ import colorama
43
+ import dotenv
44
+ import prettytable
45
+ import yaml
46
+ from rich.progress import track
47
+
48
+ import konduktor
49
+ from konduktor import check as konduktor_check
50
+ from konduktor import logging
51
+ from konduktor.backends import jobset_utils
52
+ from konduktor.utils import (
53
+ common_utils,
54
+ kubernetes_utils,
55
+ log_utils,
56
+ loki_utils,
57
+ ux_utils,
58
+ )
59
+
60
# Let `-h` work as a synonym for `--help` on every command in the group.
_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])

# NOTE: `logging` here is `konduktor.logging` (imported above), which shadows
# the stdlib module of the same name inside this file.
logger = logging.get_logger(__name__)
63
+
64
+
65
+ def _parse_env_var(env_var: str) -> Tuple[str, str]:
66
+ """Parse env vars into a (KEY, VAL) pair."""
67
+ if '=' not in env_var:
68
+ value = os.environ.get(env_var)
69
+ if value is None:
70
+ raise click.UsageError(f'{env_var} is not set in local environment.')
71
+ return (env_var, value)
72
+ ret = tuple(env_var.split('=', 1))
73
+ if len(ret) != 2:
74
+ raise click.UsageError(
75
+ f'Invalid env var: {env_var}. Must be in the form of KEY=VAL ' 'or KEY.'
76
+ )
77
+ return ret[0], ret[1]
78
+
79
+
80
+ def _merge_env_vars(
81
+ env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]]
82
+ ) -> List[Tuple[str, str]]:
83
+ """Merges all values from env_list into env_dict."""
84
+ if not env_dict:
85
+ return env_list
86
+ for key, value in env_list:
87
+ env_dict[key] = value
88
+ return list(env_dict.items())
89
+
90
+
91
def _make_task_with_overrides(
    entrypoint: Tuple[str, ...],
    *,
    entrypoint_name: str = 'konduktor.Task',
    name: Optional[str] = None,
    workdir: Optional[str] = None,
    cloud: Optional[str] = None,
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    instance_type: Optional[str] = None,
    num_nodes: Optional[int] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
    env: Optional[List[Tuple[str, str]]] = None,
    field_to_ignore: Optional[List[str]] = None,
) -> konduktor.Task:
    """Creates a task from a YAML entrypoint with CLI overrides applied.

    Args:
        entrypoint: CLI words naming the task YAML file.
        entrypoint_name: Label used when echoing the YAML source.
        name/workdir/num_nodes: Direct task-field overrides.
        gpus/cpus/memory/image_id/disk_size: Resource overrides (see
            _parse_override_params for 'none'-reset semantics).
        env: (KEY, VAL) pairs forwarded to Task.from_yaml_config.
        field_to_ignore: Override keys to drop with a warning.
        cloud/instance_type: Accepted for interface compatibility;
            not applied here (cloud selection is currently defunct).

    Returns:
        konduktor.Task

    Raises:
        ValueError: if no entrypoint was given or it is not a valid YAML file.
    """
    entrypoint = ' '.join(entrypoint)
    is_yaml, _ = _check_yaml(entrypoint)
    entrypoint: Optional[str]
    if is_yaml:
        # Treat entrypoint as a yaml.
        click.secho(f'{entrypoint_name} from YAML spec: ', fg='yellow', nl=False)
        click.secho(entrypoint, bold=True)
    else:
        if not entrypoint:
            # Fix: message previously lacked the closing backtick.
            raise ValueError(
                'no entrypoint specified, run with \n'
                '`konduktor launch task.yaml`'
            )
        raise ValueError(f'{entrypoint} is not a valid YAML spec.')

    override_params = _parse_override_params(
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        image_id=image_id,
        disk_size=disk_size,
    )

    if field_to_ignore is not None:
        _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore)

    assert entrypoint is not None
    task_configs = common_utils.read_yaml_all(entrypoint)
    assert len(task_configs) == 1, 'Only single tasks are supported'
    task = konduktor.Task.from_yaml_config(task_configs[0], env)
    # Override.
    if workdir is not None:
        task.workdir = workdir

    task.set_resources_override(override_params)

    if num_nodes is not None:
        task.num_nodes = num_nodes
    if name is not None:
        task.name = name
    return task
153
+
154
+
155
# Options shared by every task-accepting command. Applied in one shot via
# _add_click_options so each command declares them exactly once.
_TASK_OPTIONS = [
    click.option(
        '--workdir',
        required=False,
        type=click.Path(exists=True, file_okay=False),
        help=(
            'If specified, sync this dir to the remote working directory, '
            'where the task will be invoked. '
            'Overrides the "workdir" config in the YAML if both are supplied.'
        ),
    ),
    click.option(
        '--cloud',
        required=False,
        type=str,
        help=(
            'The cloud to use. If specified, overrides the "resources.cloud" '
            'config. Passing "none" resets the config. [defunct] currently '
            'only supports a single cloud'
        ),
    ),
    click.option(
        '--num-nodes',
        required=False,
        type=int,
        help=(
            'Number of nodes to execute the task on. '
            'Overrides the "num_nodes" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--cpus',
        default=None,
        type=str,
        required=False,
        help=(
            'Number of vCPUs each instance must have (e.g., '
            '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
            'This is used to automatically select the instance type.'
        ),
    ),
    click.option(
        '--memory',
        default=None,
        type=str,
        required=False,
        help=(
            'Amount of memory each instance must have in GB (e.g., '
            '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'
        ),
    ),
    click.option(
        '--disk-size',
        default=None,
        type=int,
        required=False,
        help=('OS disk size in GBs.'),
    ),
    click.option(
        '--image-id',
        required=False,
        default=None,
        help=(
            'Custom image id for launching the instances. '
            'Passing "none" resets the config.'
        ),
    ),
    # --env-file: dotenv.dotenv_values parses the file at option-parse time,
    # so the command receives an already-built {KEY: VAL} mapping.
    click.option(
        '--env-file',
        required=False,
        type=dotenv.dotenv_values,
        help="""\
        Path to a dotenv file with environment variables to set on the remote
        node.

        If any values from ``--env-file`` conflict with values set by
        ``--env``, the ``--env`` value will be preferred.""",
    ),
    # --env: _parse_env_var converts each occurrence to a (KEY, VAL) tuple.
    click.option(
        '--env',
        required=False,
        type=_parse_env_var,
        multiple=True,
        help="""\
        Environment variable to set on the remote node.
        It can be specified multiple times.
        Examples:

        \b
        1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.

        2. ``--env MY_ENV2=$HOME``: set ``$MY_ENV2`` on the cluster to be the
        same value of ``$HOME`` in the local environment where the CLI command
        is run.

        3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
        same value of ``$MY_ENV3`` in the local environment.""",
    ),
]
255
# Same as _TASK_OPTIONS, with a leading `--name/-n` override added.
_TASK_OPTIONS_WITH_NAME = [
    click.option(
        '--name',
        '-n',
        required=False,
        type=str,
        help=(
            'Task name. Overrides the "name" '
            'config in the YAML if both are supplied.'
        ),
    ),
] + _TASK_OPTIONS
267
# Accelerator-specific options, kept separate so commands that cannot use
# GPUs can include _TASK_OPTIONS without them.
_EXTRA_RESOURCES_OPTIONS = [
    click.option(
        '--gpus',
        required=False,
        type=str,
        help=(
            'Type and number of GPUs to use. Example values: '
            '"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
            '(fractional counts are supported by the scheduling framework). '
            'If a new cluster is being launched by this command, this is the '
            'resources to provision. If an existing cluster is being reused, this'
            " is seen as the task demand, which must fit the cluster's total "
            'resources and is used for scheduling the task. '
            'Overrides the "accelerators" '
            'config in the YAML if both are supplied. '
            'Passing "none" resets the config.'
        ),
    ),
]
286
+
287
+
288
def _get_click_major_version():
    """Return the major component of the installed click version as an int."""
    major, _, _ = click.__version__.partition('.')
    return int(major)
290
+
291
+
292
# Shell commands users can run to reload their rc files.
# NOTE(review): not referenced anywhere in this file chunk — presumably used
# by shell-completion helpers elsewhere; confirm before removing.
_RELOAD_ZSH_CMD = 'source ~/.zshrc'
_RELOAD_BASH_CMD = 'source ~/.bashrc'
294
+
295
+
296
def _add_click_options(options: List[click.Option]):
    """A decorator factory that applies a list of click option decorators.

    The list is applied in reverse so ``--help`` renders the options in the
    order they appear in ``options``.
    """

    def _decorate(func):
        wrapped = func
        for opt in options[::-1]:
            wrapped = opt(wrapped)
        return wrapped

    return _decorate
305
+
306
+
307
+ def _parse_override_params(
308
+ gpus: Optional[str] = None,
309
+ cpus: Optional[str] = None,
310
+ memory: Optional[str] = None,
311
+ image_id: Optional[str] = None,
312
+ disk_size: Optional[int] = None,
313
+ ) -> Dict[str, Any]:
314
+ """Parses the override parameters into a dictionary."""
315
+ override_params: Dict[str, Any] = {}
316
+ if gpus is not None:
317
+ if gpus.lower() == 'none':
318
+ override_params['accelerators'] = None
319
+ else:
320
+ override_params['accelerators'] = gpus
321
+ if cpus is not None:
322
+ if cpus.lower() == 'none':
323
+ override_params['cpus'] = None
324
+ else:
325
+ override_params['cpus'] = cpus
326
+ if memory is not None:
327
+ if memory.lower() == 'none':
328
+ override_params['memory'] = None
329
+ else:
330
+ override_params['memory'] = memory
331
+ if image_id is not None:
332
+ if image_id.lower() == 'none':
333
+ override_params['image_id'] = None
334
+ else:
335
+ override_params['image_id'] = image_id
336
+ if disk_size is not None:
337
+ override_params['disk_size'] = disk_size
338
+ return override_params
339
+
340
+
341
def _launch_with_confirm(
    task: konduktor.Task,
    *,
    dryrun: bool,
    detach_run: bool,
    no_confirm: bool,
):
    """Launch a cluster with a Task.

    Prompts for confirmation (aborting on decline) unless ``no_confirm`` is
    set, then delegates to ``konduktor.launch`` and returns its result.
    """

    confirm_shown = False
    if not no_confirm:
        # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
        # it exists but is STOPPED.
        prompt = (
            f'Launching a new job {colorama.Style.BRIGHT}'
            f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        # NOTE(review): `prompt` is always a non-empty string here, so this
        # guard always fires; kept as-is to preserve behavior.
        if prompt is not None:
            confirm_shown = True
            # abort=True raises click.Abort when the user declines.
            click.confirm(prompt, default=True, abort=True, show_default=True)

    if not confirm_shown:
        click.secho(f'Running task {task.name}...', fg='yellow')
    return konduktor.launch(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
    )
370
+
371
+
372
def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Checks if entrypoint is a readable YAML file.

    Args:
        entrypoint: Path to a YAML file.

    Returns:
        (is_yaml, config): whether the entrypoint parsed as YAML, and the
        first YAML document (``{}`` for an empty file, ``None`` on failure).
    """
    is_yaml = True
    config: Optional[List[Dict[str, Any]]] = None
    result = None
    shell_splits = shlex.split(entrypoint)
    # A lone word ending in 'yaml'/'.yml' looks like a file path; used below
    # to decide whether failures deserve a warning prompt.
    yaml_file_provided = len(shell_splits) == 1 and (
        shell_splits[0].endswith('yaml') or shell_splits[0].endswith('.yml')
    )
    invalid_reason = ''
    try:
        with open(entrypoint, 'r', encoding='utf-8') as f:
            try:
                config = list(yaml.safe_load_all(f))
                if config:
                    # Only the first document of a multi-doc file is used.
                    result = config[0]
                else:
                    result = {}
                if isinstance(result, str):
                    # A bare string parsed as YAML is really a command, e.g.
                    # 'konduktor exec cluster ./my_script.sh'
                    is_yaml = False
            except yaml.YAMLError as e:
                if yaml_file_provided:
                    logger.debug(e)
                    detailed_error = f'\nYAML Error: {e}\n'
                    invalid_reason = (
                        'contains an invalid configuration. '
                        'Please check syntax.\n'
                        f'{detailed_error}'
                    )
                is_yaml = False

    except OSError:
        # open() failed: diagnose why so the prompt below can explain it.
        if yaml_file_provided:
            entry_point_path = os.path.expanduser(entrypoint)
            if not os.path.exists(entry_point_path):
                invalid_reason = (
                    'does not exist. Please check if the path' ' is correct.'
                )
            elif not os.path.isfile(entry_point_path):
                invalid_reason = (
                    'is not a file. Please check if the path' ' is correct.'
                )
            else:
                invalid_reason = (
                    'yaml.safe_load() failed. Please check if the' ' path is correct.'
                )
        is_yaml = False
    if not is_yaml:
        if yaml_file_provided:
            # Looked like a YAML path but could not be parsed/read; let the
            # user bail out (abort=True raises click.Abort on decline).
            click.confirm(
                f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n'
                'It will be treated as a command to be run remotely. Continue?',
                abort=True,
            )
    return is_yaml, result
432
+
433
+
434
+ def _pop_and_ignore_fields_in_override_params(
435
+ params: Dict[str, Any], field_to_ignore: List[str]
436
+ ) -> None:
437
+ """Pops and ignores fields in override params.
438
+
439
+ Args:
440
+ params: Override params.
441
+ field_to_ignore: Fields to ignore.
442
+
443
+ Returns:
444
+ Override params with fields ignored.
445
+ """
446
+ if field_to_ignore is not None:
447
+ for field in field_to_ignore:
448
+ field_value = params.pop(field, None)
449
+ if field_value is not None:
450
+ click.secho(
451
+ f'Override param {field}={field_value} is ignored.', fg='yellow'
452
+ )
453
+
454
+
455
class _NaturalOrderGroup(click.Group):
    """Lists commands in the order defined in this script.

    Reference: https://github.com/pallets/click/issues/513
    """

    def list_commands(self, ctx):
        """Return command names in definition (registration) order."""
        # Python dicts preserve insertion order, so the order in which
        # @cli.command() definitions run is the display order.
        return self.commands.keys()

    # The previous `invoke` override was a pure pass-through to
    # super().invoke(ctx) and has been removed as dead code.
466
+
467
+
468
class _DocumentedCodeCommand(click.Command):
    """Corrects help strings for documented commands such that --help displays
    properly and code blocks are rendered in the official web documentation.
    """

    def get_help(self, ctx):
        # Swap the sphinx directive for click's '\b' no-rewrap marker so
        # terminal --help keeps the example block formatting.
        # NOTE(review): this mutates ctx.command.help in place, so the
        # substitution persists for later calls — confirm that is intended.
        help_str = ctx.command.help
        ctx.command.help = help_str.replace('.. code-block:: bash\n', '\b')
        return super().get_help(ctx)
477
+
478
+
479
@click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
@click.version_option(konduktor.__version__, '--version', '-v', prog_name='konduktor')
@click.version_option(
    konduktor.__commit__,
    '--commit',
    '-c',
    prog_name='konduktor',
    message='%(prog)s, commit %(version)s',
    help='Show the commit hash and exit',
)
def cli():
    # Intentionally empty: the group only hosts subcommands registered via
    # @cli.command(). No docstring on purpose — click would surface it as
    # `konduktor --help` text.
    pass
491
+
492
+
493
@cli.command()
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all clusters, including those not owned by the ' 'current user.',
)
# pylint: disable=redefined-builtin
def status(all_users: bool):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Shows list of all the jobs

    Args:
        all_users (bool): whether to show all jobs
        regardless of the user in this namespace
    """
    # Resolve the active kube context/namespace so we list jobs from the
    # cluster the user is actually pointed at.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    # 'All' is a display label when listing every user's jobs.
    user = common_utils.user_and_hostname_hash() if not all_users else 'All'
    click.secho(f'User: {user}', fg='green', bold=True)
    click.secho('Jobs', fg='cyan', bold=True)
    jobset_utils.show_status_table(namespace, all_users=all_users)
517
+
518
+
519
@cli.command()
@click.option(
    '--status',
    is_flag=True,
    default=False,
    help=(
        'If specified, do not show logs but exit with a status code for the '
        "job's status: 0 for succeeded, or 1 for all other statuses."
    ),
)
@click.option(
    '--follow/--no-follow',
    is_flag=True,
    default=True,
    help=(
        'Follow the logs of a job. '
        'If --no-follow is specified, print the log so far and exit. '
        '[default: --follow]'
    ),
)
@click.option(
    '--tail',
    default=1000,
    type=int,
    help=(
        'The number of lines to display from the end of the log file. '
        'Default is 1000.'
    ),
)
@click.argument('job_id', type=str, nargs=1)
# TODO(zhwu): support logs by job name
def logs(
    status: bool,
    job_id: str,
    follow: bool,
    tail: int,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tail the log of a job."""
    if status:
        # Fix: message previously ended with a stray ')'.
        raise click.UsageError('`--status` is being deprecated')

    # Check if the job exists
    if not job_id:
        raise click.UsageError('Please provide a job ID.')

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Verify the job exists before attempting to tail logs
    # TODO(asaiacai): unify the 404 logic under jobset_utils
    try:
        jobset_utils.get_jobset(namespace, job_id)
    except jobset_utils.JobNotFoundError:
        # `from None` suppresses the internal traceback context; the user
        # only needs the actionable message.
        raise click.UsageError(
            f"Job '{job_id}' not found in namespace "
            f"'{namespace}'. Check your jobs with "
            f'{colorama.Style.BRIGHT}`konduktor status`'
            f'{colorama.Style.RESET_ALL}.'
        ) from None

    click.secho(
        'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
        fg='yellow',
    )
    loki_utils.tail_loki_logs_ws(job_id, follow=follow, num_logs=tail)
585
+
586
+
587
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
def launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    yes: bool,
):
    """Launch a task.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    # --env entries win over --env-file on key collisions.
    env = _merge_env_vars(env_file, env)

    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
    )

    # Echo the resolved resources as a one-row table before confirming.
    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    table_kwargs = {
        'hrules': prettytable.FRAME,
        'vrules': prettytable.NONE,
        'border': True,
    }
    headers = ['CPUs', 'Mem (GB)', 'GPUs']
    table = log_utils.create_table(headers, **table_kwargs)
    assert task.resources is not None
    table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    print(table)

    # `yes` skips the interactive confirmation inside _launch_with_confirm.
    job_name = _launch_with_confirm(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
        no_confirm=yes,
    )
    # Print follow-up commands (logs/status/down) for the submitted job.
    click.secho(
        ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, job_name),
        fg='green',
        bold=True,
    )
688
+
689
+
690
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def down(
    jobs: List[str],
    all: Optional[bool],  # pylint: disable=redefined-builtin
    yes: bool,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tear down job(s).

    JOB is the name of the job to tear down. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Tearing down a job will delete all associated containers (all billing
    stops), and any data on the containers disks will be lost. Accelerators
    (e.g., GPUs) that are part of the job will be deleted too.


    Examples:

    .. code-block:: bash

      # Tear down a specific job.
      konduktor down cluster_name
      \b
      # Tear down multiple clusters.
      konduktor down jobs
      \b
      # Tear down all existing clusters.
      konduktor down -a

    """

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    if all:
        # --all overrides any JOB arguments: target every jobset we can list.
        # NOTE(review): these asserts vanish under `python -O`; consider
        # raising click.UsageError for user-facing validation instead.
        jobs_specs = jobset_utils.list_jobset(namespace)
        assert jobs_specs is not None, f'No ' f'jobs found in namespace {namespace}'
        assert len(jobs_specs) > 0, f'No ' f'jobs found in namespace {namespace}'
        jobs = [job['metadata']['name'] for job in jobs_specs['items']]
    if not yes:
        # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
        # it exists but is STOPPED.
        prompt = (
            f'Tearing down job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        # NOTE(review): `prompt` is always truthy here; guard kept as-is.
        if prompt is not None:
            click.confirm(prompt, default=True, abort=True, show_default=True)

    # rich progress bar over the deletions; empty `jobs` is a silent no-op.
    for job in track(jobs, description='Tearing down job(s)...'):
        jobset_utils.delete_jobset(namespace, job)
756
+
757
+
758
@cli.command(cls=_DocumentedCodeCommand)
@click.argument('clouds', required=True, type=str, nargs=-1)
def check(clouds: Tuple[str]):
    """Check which clouds are available to use for storage

    This checks storage credentials for a cloud supported by konduktor. If a
    cloud is detected to be inaccessible, the reason and correction steps will
    be shown.

    If CLOUDS are specified, checks credentials for only those clouds.

    The enabled clouds are cached and form the "search space" to be considered
    for each task.

    Examples:

    .. code-block:: bash

      # Check credentials for all supported clouds.
      konduktor check
      # Check only specific clouds - GCP.
      konduktor check gcp
    """
    # An empty CLOUDS tuple is falsy, so this passes None ("check all").
    konduktor_check.check(clouds=clouds or None)
783
+
784
+
785
def main():
    """Console-script entry point (wired up via the wheel's entry_points)."""
    return cli()
787
+
788
+
789
if __name__ == '__main__':
    # Support direct execution (`python cli.py`) in addition to the
    # installed `konduktor` console script.
    main()