konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/cli.py ADDED
@@ -0,0 +1,1945 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """The 'konduktor' command line tool.
14
+
15
+ Example usage:
16
+
17
+ # See available commands.
18
+ >> konduktor
19
+
20
+ # Run a task, described in a yaml file.
21
+ >> konduktor launch task.yaml
22
+
23
+ # Show the list of scheduled jobs
24
+ >> konduktor status
25
+
26
+ # Tear down a specific job.
27
+ >> konduktor down cluster_name
28
+
29
+ # Tear down all scheduled jobs
30
+ >> konduktor down -a
31
+
32
+ NOTE: the order of command definitions in this file corresponds to how they are
33
+ listed in "konduktor --help". Take care to put logically connected commands close to
34
+ each other.
35
+ """
36
+
37
+ import difflib
38
+ import fnmatch
39
+ import os
40
+ import pathlib
41
+ import shlex
42
+ from base64 import b64encode
43
+ from typing import Any, Dict, List, Optional, Tuple
44
+
45
+ import click
46
+ import colorama
47
+ import dotenv
48
+ import prettytable
49
+ import yaml # type: ignore
50
+ from rich.progress import track
51
+
52
+ import konduktor
53
+ from konduktor import check as konduktor_check
54
+ from konduktor import logging
55
+ from konduktor.backends import constants as backend_constants
56
+ from konduktor.backends import deployment_utils, jobset_utils
57
+ from konduktor.utils import (
58
+ base64_utils,
59
+ common_utils,
60
+ kubernetes_utils,
61
+ log_utils,
62
+ ux_utils,
63
+ validator,
64
+ )
65
+
66
# Shared Click context settings: every subcommand accepts both -h and --help.
_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])

# Module-level logger from konduktor's logging wrapper (not stdlib logging).
logger = logging.get_logger(__name__)
69
+
70
+
71
def _parse_env_var(env_var: str) -> Tuple[str, str]:
    """Parse a ``--env`` CLI value into a ``(KEY, VALUE)`` pair.

    Two accepted forms:
      * ``KEY=VALUE`` -- split on the first ``=``; VALUE may itself
        contain further ``=`` characters.
      * ``KEY`` -- the value is read from the local environment.

    Raises:
        click.UsageError: if the bare ``KEY`` form is used but the variable
            is not set in the local environment.
    """
    if '=' not in env_var:
        value = os.environ.get(env_var)
        if value is None:
            raise click.UsageError(f'{env_var} is not set in local environment.')
        return (env_var, value)
    # At this point '=' is guaranteed to be present, so split(..., 1) always
    # yields exactly two parts; the original length-2 check was unreachable
    # dead code and has been removed.
    key, value = env_var.split('=', 1)
    return key, value
84
+
85
+
86
def _merge_env_vars(
    env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Fold ``--env`` pairs into the ``--env-file`` dict; ``--env`` wins.

    Note: ``env_dict`` is updated in place when provided.
    """
    # No dotenv values at all: the CLI pairs are the whole result.
    if not env_dict:
        return env_list
    # dict.update accepts an iterable of (key, value) pairs directly.
    env_dict.update(env_list)
    return list(env_dict.items())
95
+
96
+
97
def _make_task_with_overrides(
    entrypoint: Tuple[str, ...],
    *,
    entrypoint_name: str = 'konduktor.Task',
    name: Optional[str] = None,
    workdir: Optional[str] = None,
    cloud: Optional[str] = None,
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    instance_type: Optional[str] = None,
    num_nodes: Optional[int] = None,
    max_restarts: Optional[int] = None,
    completions: Optional[int] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
    env: Optional[List[Tuple[str, str]]] = None,
    field_to_ignore: Optional[List[str]] = None,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    ports: Optional[int] = None,
    probe: Optional[str] = None,
) -> konduktor.Task:
    """Creates a task from an entrypoint with overrides.

    Builds a konduktor.Task from a YAML entrypoint, then layers CLI-flag
    overrides (resources, serving config, restarts/completions, num_nodes,
    name, workdir) on top of the YAML values.

    Returns:
        konduktor.Task
    """
    # The CLI passes the entrypoint as a tuple of shell words; rejoin them
    # into a single path string before probing whether it is a YAML file.
    entrypoint = ' '.join(entrypoint)
    is_yaml, _ = _check_yaml(entrypoint)
    entrypoint: Optional[str]
    if is_yaml:
        # Treat entrypoint as a yaml.
        click.secho(f'{entrypoint_name} from YAML spec: ', fg='yellow', nl=False)
        click.secho(entrypoint, bold=True)
    else:
        # Only YAML task specs are supported; anything else is rejected.
        if entrypoint is not None and len(entrypoint) == 0:
            raise ValueError(
                'no entrypoint specified, run with \n' '`konduktor launch task.yaml'
            )
        raise ValueError(f'{entrypoint} is not a valid YAML spec,')

    # Resource-level overrides (gpus/cpus/memory/image/disk).
    override_params = _parse_override_params(
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        image_id=image_id,
        disk_size=disk_size,
    )

    # Serving-level overrides; only applied below when the task defines
    # a serving section.
    serving_override_params = _parse_serving_override_params(
        num_nodes=num_nodes,
        min_replicas=min_replicas,
        max_replicas=max_replicas,
        ports=ports,
        probe=probe,
    )

    if field_to_ignore is not None:
        _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore)

    assert entrypoint is not None
    task_configs = common_utils.read_yaml_all(entrypoint)
    # NOTE(review): multi-document YAML is explicitly unsupported here.
    assert len(task_configs) == 1, 'Only single tasks are supported'
    task = konduktor.Task.from_yaml_config(task_configs[0], env)
    # Override.
    if workdir is not None:
        task.workdir = workdir

    # perform overrides from CLI
    if override_params:
        task.set_resources_override(override_params)
    if task.serving:
        task.set_serving_override(serving_override_params)

    if max_restarts is not None:
        assert task.resources is not None
        task.resources.job_config['max_restarts'] = max_restarts
    if completions is not None:
        assert task.resources is not None
        task.resources.job_config['completions'] = completions
    if num_nodes is not None:
        task.num_nodes = num_nodes
    if name is not None:
        task.name = name
    return task
183
+
184
+
185
# Click options shared by every task-accepting command. They are applied via
# _add_click_options, so the order of this list is the --help display order.
_TASK_OPTIONS = [
    click.option(
        '--workdir',
        required=False,
        type=click.Path(exists=True, file_okay=False),
        help=(
            'If specified, sync this dir to the remote working directory, '
            'where the task will be invoked. '
            'Overrides the "workdir" config in the YAML if both are supplied.'
        ),
    ),
    click.option(
        '--cloud',
        required=False,
        type=str,
        help=(
            'The cloud to use. If specified, overrides the "resources.cloud" '
            'config. Passing "none" resets the config. [defunct] currently '
            'only supports a single cloud'
        ),
    ),
    click.option(
        '--num-nodes',
        required=False,
        type=int,
        help=(
            'Number of nodes to execute the task on. '
            'Overrides the "num_nodes" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--max-restarts',
        required=False,
        type=int,
        help=(
            'Maximum number of jobset restarts allowed. Overrides YAML.'
            'Overrides the "max_restarts" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--completions',
        required=False,
        type=int,
        help=(
            'Number of successful completions required. Overrides YAML.'
            'Overrides the "completions" config in the YAML if both are '
            'supplied.'
        ),
    ),
    click.option(
        '--cpus',
        default=None,
        type=str,
        required=False,
        help=(
            'Number of vCPUs each instance must have (e.g., '
            '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
            'This is used to automatically select the instance type.'
        ),
    ),
    click.option(
        '--memory',
        default=None,
        type=str,
        required=False,
        help=(
            'Amount of memory each instance must have in GB (e.g., '
            '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'
        ),
    ),
    click.option(
        '--disk-size',
        default=None,
        type=int,
        required=False,
        help=('OS disk size in GBs.'),
    ),
    click.option(
        '--image-id',
        required=False,
        default=None,
        help=(
            'Custom image id for launching the instances. '
            'Passing "none" resets the config.'
        ),
    ),
    # type=dotenv.dotenv_values makes click parse the file into a dict
    # before the command callback runs.
    click.option(
        '--env-file',
        required=False,
        type=dotenv.dotenv_values,
        help="""\
        Path to a dotenv file with environment variables to set on the remote
        node.

        If any values from ``--env-file`` conflict with values set by
        ``--env``, the ``--env`` value will be preferred.""",
    ),
    # type=_parse_env_var converts each occurrence into a (KEY, VALUE) tuple.
    click.option(
        '--env',
        required=False,
        type=_parse_env_var,
        multiple=True,
        help="""\
        Environment variable to set on the remote node.
        It can be specified multiple times.
        Examples:

        \b
        1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.

        2. ``--env MY_ENV2=$HOME``: set ``$MY_ENV2`` on the cluster to be the
        same value of ``$HOME`` in the local environment where the CLI command
        is run.

        3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
        same value of ``$MY_ENV3`` in the local environment.""",
    ),
]
# Same as _TASK_OPTIONS but with a leading --name/-n option.
_TASK_OPTIONS_WITH_NAME = [
    click.option(
        '--name',
        '-n',
        required=False,
        type=str,
        help=(
            'Task name. Overrides the "name" '
            'config in the YAML if both are supplied.'
        ),
    ),
] + _TASK_OPTIONS
# GPU selection, kept separate so commands can opt in independently.
_EXTRA_RESOURCES_OPTIONS = [
    click.option(
        '--gpus',
        required=False,
        type=str,
        help=(
            'Type and number of GPUs to use. Example values: '
            '"V100:8", "V100" (short for a count of 1)'
            'If a new cluster is being launched by this command, this is the '
            'resources to provision. If an existing cluster is being reused, this'
            " is seen as the task demand, which must fit the cluster's total "
            'resources and is used for scheduling the task. '
            'Overrides the "accelerators" '
            'config in the YAML if both are supplied. '
            'Passing "none" resets the config.'
        ),
    ),
]
# Serving-only flags (replica counts, port, health-check probe path).
_EXTRA_SERVING_OPTIONS = [
    click.option(
        '--min-replicas',
        required=False,
        type=int,
        help=(
            'Minimum number of replicas to run for the service. '
            'Overrides the "min_replicas" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--max-replicas',
        required=False,
        type=int,
        help=(
            'Maximum number of replicas to allow for the service. '
            'Overrides the "max_replicas" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--ports',
        required=False,
        type=int,
        help=(
            'The container port on which your service will listen for HTTP '
            'traffic. Overrides the "ports" field in the YAML if both '
            'are supplied.'
        ),
    ),
    click.option(
        '--probe',
        required=False,
        type=str,
        help=(
            'The HTTP path to use for health checks (liveness, readiness, and '
            'startup probes). Overrides the "probe" field in the YAML '
            'if both are supplied. The service should respond with HTTP 200 on '
            'this path when healthy.'
        ),
    ),
]
378
+
379
+
380
def _get_click_major_version():
    """Return click's major version number as an int (e.g. 8 for '8.1.7')."""
    major, _, _ = click.__version__.partition('.')
    return int(major)
382
+
383
+
384
# Shell commands suggested to users for picking up rc-file changes
# (e.g. after installing shell completion).
_RELOAD_ZSH_CMD = 'source ~/.zshrc'
_RELOAD_BASH_CMD = 'source ~/.bashrc'
386
+
387
+
388
def _add_click_options(options: List[click.Option]):
    """A decorator for adding a list of click option decorators.

    Options are applied in reverse so that they appear in list order
    in the generated ``--help`` output.
    """

    def _apply(func):
        for opt in options[::-1]:
            func = opt(func)
        return func

    return _apply
397
+
398
+
399
def _parse_override_params(
    gpus: Optional[str] = None,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    image_id: Optional[str] = None,
    disk_size: Optional[int] = None,
) -> Dict[str, Any]:
    """Parses the override parameters into a dictionary.

    A (case-insensitive) literal ``'none'`` explicitly resets a field to
    None, while an omitted flag leaves the YAML value untouched.
    """
    override_params: Dict[str, Any] = {}
    # gpus/cpus/memory share the same none-reset rule; insertion order here
    # matches the original (accelerators, cpus, memory, image_id, disk_size).
    for key, raw in (('accelerators', gpus), ('cpus', cpus), ('memory', memory)):
        if raw is None:
            continue
        override_params[key] = None if raw.lower() == 'none' else raw
    if image_id is not None:
        if image_id.lower() == 'none':
            override_params['image_id'] = None
        else:
            # Validate Docker image before adding to override params
            validator.validate_and_warn_image(image_id, 'task')
            override_params['image_id'] = image_id
    if disk_size is not None:
        override_params['disk_size'] = disk_size
    return override_params
433
+
434
+
435
def _parse_serving_override_params(
    num_nodes: Optional[int] = None,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    ports: Optional[int] = None,
    probe: Optional[str] = None,
) -> Dict[str, Any]:
    """Parses the relevant serving override parameters into a dictionary."""
    candidates: Dict[str, Any] = {
        'num_nodes': num_nodes,
        'min_replicas': min_replicas,
        'max_replicas': max_replicas,
        'ports': ports,
        'probe': probe,
    }
    # Keep only the flags the user actually supplied on the CLI.
    return {key: value for key, value in candidates.items() if value is not None}
456
+
457
+
458
def _launch_with_confirm(
    task: konduktor.Task,
    *,
    dryrun: bool,
    detach_run: bool,
    no_confirm: bool,
    serving: bool,
):
    """Confirm with the user (unless suppressed) and submit the task."""

    prompted = False
    if not no_confirm:
        if serving:
            prompt = (
                f'Launching a new deployment {colorama.Style.BRIGHT}'
                f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
                'Proceed?'
            )
        else:
            prompt = (
                f'Launching a new job {colorama.Style.BRIGHT}'
                f'{colorama.Fore.GREEN}{task.name}{colorama.Style.RESET_ALL}. '
                'Proceed?'
            )
        prompted = True
        # abort=True raises click.Abort (exits the CLI) if the user declines.
        click.confirm(prompt, default=True, abort=True, show_default=True)

    # Only announce silently-started launches; a shown prompt already did.
    if not prompted:
        if serving:
            click.secho(f'Creating deployment {task.name}...', fg='yellow')
        else:
            click.secho(f'Running task {task.name}...', fg='yellow')
    return konduktor.launch(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
    )
498
+
499
+
500
def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Checks if entrypoint is a readable YAML file.

    Args:
        entrypoint: Path to a YAML file.

    Returns:
        (is_yaml, config) where config is the first YAML document (or None
        when the input could not be parsed as YAML).
    """
    is_yaml = True
    config: Optional[List[Dict[str, Any]]] = None
    result = None
    shell_splits = shlex.split(entrypoint)
    # Heuristic: a single token ending in .yaml/.yml "looks like" a YAML path;
    # used below to decide whether to warn the user about a bad path.
    yaml_file_provided = len(shell_splits) == 1 and (
        shell_splits[0].endswith('yaml') or shell_splits[0].endswith('.yml')
    )
    invalid_reason = ''
    try:
        with open(entrypoint, 'r', encoding='utf-8') as f:
            try:
                config = list(yaml.safe_load_all(f))
                if config:
                    result = config[0]
                else:
                    # Empty file parses as zero documents; treat as empty spec.
                    result = {}
                if isinstance(result, str):
                    # 'konduktor exec cluster ./my_script.sh'
                    is_yaml = False
            except yaml.YAMLError as e:
                if yaml_file_provided:
                    logger.debug(e)
                    detailed_error = f'\nYAML Error: {e}\n'
                    invalid_reason = (
                        'contains an invalid configuration. '
                        'Please check syntax.\n'
                        f'{detailed_error}'
                    )
                is_yaml = False

    except OSError:
        # open() failed: distinguish missing path / directory / other errors
        # to produce a helpful message (only shown for yaml-looking inputs).
        if yaml_file_provided:
            entry_point_path = os.path.expanduser(entrypoint)
            if not os.path.exists(entry_point_path):
                invalid_reason = (
                    'does not exist. Please check if the path' ' is correct.'
                )
            elif not os.path.isfile(entry_point_path):
                invalid_reason = (
                    'is not a file. Please check if the path' ' is correct.'
                )
            else:
                invalid_reason = (
                    'yaml.safe_load() failed. Please check if the' ' path is correct.'
                )
        is_yaml = False
    if not is_yaml:
        if yaml_file_provided:
            # Interactive guard: the input looked like YAML but is not usable;
            # aborts (raises click.Abort) unless the user confirms.
            click.confirm(
                f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n'
                'It will be treated as a command to be run remotely. Continue?',
                abort=True,
            )
    return is_yaml, result
560
+
561
+
562
def _pop_and_ignore_fields_in_override_params(
    params: Dict[str, Any], field_to_ignore: List[str]
) -> None:
    """Pops and ignores fields in override params.

    Args:
        params: Override params (mutated in place).
        field_to_ignore: Fields to ignore.

    Returns:
        None; ``params`` is modified in place.
    """
    if field_to_ignore is None:
        return
    for field in field_to_ignore:
        field_value = params.pop(field, None)
        # Only warn when the user actually supplied a value for the field.
        if field_value is None:
            continue
        click.secho(
            f'Override param {field}={field_value} is ignored.', fg='yellow'
        )
581
+
582
+
583
class _NaturalOrderGroup(click.Group):
    """Lists commands in the order defined in this script.

    Reference: https://github.com/pallets/click/issues/513
    """

    def list_commands(self, ctx):
        # self.commands is a dict; relying on insertion order means --help
        # shows commands in the order they were registered in this file,
        # instead of click's default alphabetical sort.
        return self.commands.keys()

    def invoke(self, ctx):
        # Pure passthrough today; kept as an explicit extension point.
        return super().invoke(ctx)
594
+
595
+
596
class _DocumentedCodeCommand(click.Command):
    """Corrects help strings for documented commands such that --help displays
    properly and code blocks are rendered in the official web documentation.
    """

    def get_help(self, ctx):
        """Return help text with Sphinx code-block directives stripped.

        The ``\\b`` replacement tells click to preserve the following
        paragraph's line breaks when rendering ``--help``.
        """
        help_str = ctx.command.help
        # Guard against commands declared without a help string/docstring:
        # the original called .replace() unconditionally and would raise
        # AttributeError on None.
        if help_str is not None:
            ctx.command.help = help_str.replace('.. code-block:: bash\n', '\b')
        return super().get_help(ctx)
605
+
606
+
607
# Root Click group. _NaturalOrderGroup lists subcommands in definition order;
# the two version_option decorators add `--version/-v` and `--commit/-c`.
# NOTE: no docstring on purpose -- click would surface it as help text.
@click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
@click.version_option(konduktor.__version__, '--version', '-v', prog_name='konduktor')
@click.version_option(
    konduktor.__commit__,
    '--commit',
    '-c',
    prog_name='konduktor',
    message='%(prog)s, commit %(version)s',
    help='Show the commit hash and exit',
)
def cli():
    pass
619
+
620
+
621
@cli.command()
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all jobs, including those not owned by the current user.',
)
@click.option(
    '--limit',
    '-l',
    default=None,
    type=int,
    help='Maximum number of jobs to display (e.g., --limit 100)',
)
@click.option(
    '--after',
    default=None,
    type=str,
    help=(
        'Show jobs created after this timestamp '
        '(e.g., --after "08/06/25 03:54PM", --after "08/06/25", --after "03:54PM")'
    ),
)
@click.option(
    '--before',
    default=None,
    type=str,
    help=(
        'Show jobs created before this timestamp '
        '(e.g., --before "08/06/25 03:54PM", --before "08/06/25", --before "03:54PM")'
    ),
)
# pylint: disable=redefined-builtin
def status(
    all_users: bool, limit: Optional[int], after: Optional[str], before: Optional[str]
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Shows list of all the jobs with optional filtering and pagination.

    \b
    Examples:
      konduktor status --limit 10
      konduktor status --before "08/06/25 03:53PM"
      konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"

    \b
    Notes:
      • When using --before or --after timestamps, "08/06/25"
        is equivalent to "08/06/25 00:00".
      • "03:53PM" is equivalent to "03:53:00PM".
      • Timestamps shown in "konduktor status" are truncated
        and are in the local timezone.
        Example: "03:53:55PM" → "03:53PM" — would show up in
        --after "03:53PM" but not in --before "03:53PM".
    """
    # Namespace is resolved from the active kubeconfig context.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    # 'All' is a display label used when user filtering is disabled.
    user = common_utils.user_and_hostname_hash() if not all_users else 'All'
    click.secho(f'User: {user}', fg='green', bold=True)
    click.secho('Jobs', fg='cyan', bold=True)
    jobset_utils.show_status_table(
        namespace, all_users=all_users, limit=limit, after=after, before=before
    )
686
+
687
+
688
@cli.command()
@click.option(
    '--status',
    is_flag=True,
    default=False,
    help=(
        'If specified, do not show logs but exit with a status code for the '
        "job's status: 0 for succeeded, or 1 for all other statuses."
    ),
)
@click.option(
    '--follow/--no-follow',
    is_flag=True,
    default=True,
    help=(
        'Follow the logs of a job. '
        'If --no-follow is specified, print the log so far and exit. '
        '[default: --follow]'
    ),
)
@click.option(
    '--num-lines',
    '--num_lines',
    '-n',
    default=-1,
    type=int,
    help=(
        'The number of lines to display from the end of the log file. '
        'Default is -1 (no limit).'
    ),
)
@click.option(
    '--node-rank',
    '--node_rank',
    '-N',
    default=0,
    type=int,
    help='The node rank to tail logs from.',
)
@click.option(
    '--start-offset',
    '--start_offset',
    type=str,
    required=False,
    default='1h',
    help=(
        'Choose how much time from now to look back in logs. '
        'Examples: 30s, 5m, 2h, 1d. Default is 1h.'
        'Note: currently only applies when streaming (default --follow). '
        'With --no-follow, all available logs are returned.'
    ),
)
@click.argument('job_id', type=str, nargs=1)
# TODO(zhwu): support logs by job name
def logs(
    status: bool,
    job_id: str,
    follow: bool,
    num_lines: int,
    node_rank: int,
    start_offset: str,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Retrieve/tail the log of a job."""
    if status:
        raise click.UsageError('`--status` is being deprecated')

    # Check if the job exists
    if not job_id:
        raise click.UsageError('Please provide a job ID.')

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Verify the job exists before attempting to tail logs
    # TODO(asaiacai): unify the 404 logic under jobset_utils
    try:
        _ = jobset_utils.get_jobset(namespace, job_id)
    except jobset_utils.JobNotFoundError:
        message = (
            f"Job '{job_id}' not found in namespace '{namespace}'. "
            f'This may be due to a typo, `konduktor down`, or garbage collected. '
            f'Check your jobs with '
            f'{colorama.Style.BRIGHT}`konduktor status`'
            f'{colorama.Style.RESET_ALL}.'
        )

        # Try to find near string matches to help with typos.
        try:
            job_specs = jobset_utils.list_jobset(namespace)
            job_names = [
                item['metadata']['name'] for item in (job_specs or {}).get('items', [])
            ]
            close_matches = difflib.get_close_matches(
                job_id, job_names, n=3, cutoff=0.4
            )
        except Exception:
            # Best-effort suggestion lookup; never let it mask the warning.
            close_matches = []

        if close_matches:
            suggestions = ', '.join(
                f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{name}{colorama.Style.NORMAL}'
                for name in close_matches
            )
            message += f'{colorama.Fore.YELLOW} Did you mean: {suggestions}?'

        click.secho(message, fg='yellow')

    # NOTE(review): control flow deliberately falls through to tail_logs even
    # when the jobset was not found above — presumably because logs may still
    # be retrievable (e.g. from Loki) after the jobset is garbage collected.
    # TODO: confirm this is intended rather than a missing early return.
    log_utils.tail_logs(
        job_id,
        worker_id=node_rank,
        follow=follow,
        num_logs=num_lines,
        start_offset=start_offset,
    )
803
+
804
+
805
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
@click.option(
    '--skip-image-check',
    '-s',
    is_flag=True,
    default=False,
    help='Skip Docker image validation checks for faster startup.',
)
def launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    max_restarts: Optional[int],
    completions: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    yes: bool,
    skip_image_check: bool,
):
    """Launch a task.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    # --env-file entries form the base; explicit --env pairs override them.
    env = _merge_env_vars(env_file, env)

    # Downstream image validation is toggled via this process-level env var.
    if skip_image_check:
        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'

    # Build the Task from the entrypoint (YAML file or bash command) plus
    # the CLI overrides.  Serving-only knobs are explicitly disabled here;
    # `konduktor serve launch` is the serving entry point.
    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        max_restarts=max_restarts,
        completions=completions,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
        # serving stuff
        min_replicas=None,
        max_replicas=None,
        ports=None,
        probe=None,
    )

    # Summarize the requested resources for the user before confirmation.
    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    table_kwargs = {
        'hrules': prettytable.FRAME,
        'vrules': prettytable.NONE,
        'border': True,
    }
    headers = ['CPUs', 'Mem (GB)', 'GPUs']
    table = log_utils.create_table(headers, **table_kwargs)
    assert task.resources is not None
    table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    print(table)

    # Serving specs are not valid for batch jobs; after this check
    # `task.serving` is always falsy below.
    if task.serving:
        raise click.UsageError(
            'Serving information detected. Use '
            '`konduktor serve launch` instead for serving.'
        )
    try:
        _launch_with_confirm(
            task,
            dryrun=dryrun,
            detach_run=detach_run,
            no_confirm=yes,
            serving=bool(task.serving),
        )
    except KeyboardInterrupt:
        # Ctrl-C while streaming logs detaches from the job rather than
        # killing it; the job keeps running in the cluster.
        click.secho(
            f'Detaching... manage your job {task.name} with the following commands:',
            fg='yellow',
            bold=True,
        )

    # Printed on both the normal and the detached (Ctrl-C) path.
    click.secho(
        ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, task.name),
        fg='green',
        bold=True,
    )
940
+
941
+
942
def _find_matching_jobs(
    jobs: List[str],
    jobs_response: Dict[str, Any],
    namespace: str,
    all_users: Optional[bool],
):
    """
    Find all jobs matching against the user specified pattern.
    In use in `konduktor down` and `konduktor stop`

    Note(asaiacai): `jobs_response` should be the list of
    all jobsets in this namespace, not necessarily belonging
    to this user.

    Args:
        jobs: Job names and/or fnmatch-style wildcard patterns from the CLI.
        jobs_response: Raw jobset list response; ``['items']`` holds the specs.
        namespace: Kubernetes namespace being queried (used in messages).
        all_users: When truthy, ``jobs`` is ignored and selection is done by
            ownership label instead (see NOTE below).

    Returns:
        List of concrete job names (deduplicated, order preserved).

    Raises:
        click.ClickException: If nothing matches, or neither patterns nor a
            selection flag were provided.
    """

    jobs_specs = [job for job in jobs_response['items']]

    if all_users:
        # NOTE(review): despite the parameter name, this branch selects only
        # jobs whose USER_LABEL equals the *current* user (i.e. "all of my
        # jobs") -- confirm intended semantics vs. the --all / --all-users
        # flags of the calling commands.
        assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
        assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
        jobs = [
            job['metadata']['name']
            for job in jobs_specs
            if job['metadata']['labels'][backend_constants.USER_LABEL]
            == common_utils.get_cleaned_username()
        ]
        logger.debug(
            f'Jobs found for user {colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
            f'{common_utils.get_cleaned_username()}{colorama.Style.RESET_ALL}: {jobs}'
        )
    elif jobs:
        # Get all available jobs to match against patterns
        if len(jobs_specs) == 0:
            raise click.ClickException(f'No jobs found in namespace {namespace}')

        # Map job name -> owning user, so matches owned by someone else can
        # be flagged below (they are still included in the result).
        all_job_names = {
            job['metadata']['name']: job['metadata']['labels'][
                backend_constants.USER_LABEL
            ]
            for job in jobs_specs
        }
        matched_jobs = []

        for job_pattern in jobs:
            # Use fnmatch for both wildcard and exact pattern matching
            pattern_matches = fnmatch.filter(all_job_names, job_pattern)
            if not pattern_matches:
                click.secho(
                    f'Warning: No jobs found matching pattern "{job_pattern}"',
                    fg='yellow',
                    err=True,
                )
            for matched_name in pattern_matches:
                # Warn (but do not skip) when acting on another user's job.
                if all_job_names[matched_name] != common_utils.get_cleaned_username():
                    warning_label = (
                        f'{colorama.Style.BRIGHT}{colorama.Fore.RED}Warning'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    job_name = (
                        f'{colorama.Style.BRIGHT}{colorama.Fore.WHITE}{matched_name}'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    launched_user = (
                        f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
                        f'{all_job_names[matched_name]}{colorama.Style.RESET_ALL}'
                    )
                    current_user = (
                        f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}'
                        f'{common_utils.get_cleaned_username()}'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    logger.info(
                        f'{warning_label}: job {job_name} was launched by '
                        f'{launched_user}, while the current user is {current_user}',
                    )

            matched_jobs.extend(pattern_matches)

        # Remove duplicates while preserving order
        seen = set()
        jobs = []
        for job in matched_jobs:
            if job not in seen:
                seen.add(job)
                jobs.append(job)

        if not jobs:
            raise click.ClickException(
                f'No matching jobs found check status with '
                f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
            )
    else:
        # Neither a user-scope flag nor explicit names/patterns were given.
        raise click.ClickException(
            'No jobs specified. Use --all to specify '
            'all jobs belonging to a user '
            'or specify job names/patterns.'
        )
    return jobs
1040
+
1041
+
1042
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for teardown',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def down(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Tear down job(s).

    JOB is the name of the job to tear down. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Tearing down a job will delete all associated containers (all billing
    stops), and any data on the containers disks will be lost. Accelerators
    (e.g., GPUs) that are part of the job will be deleted too.

    Wildcard patterns are supported using * characters.
    Examples: "test-*" matches all jobs starting with "test-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Tear down a specific job.
      konduktor down cluster_name
      \b
      # Tear down multiple jobs.
      konduktor down job1 job2
      \b
      # Tear down all jobs matching a pattern.
      konduktor down "test-*"
      \b
      # Tear down all of this users jobs.
      konduktor down -a
      konduktor down --all

      # Tear down all jobs across all users
      konduktor down --all --all-users

    """

    # NOTE(review): the parsed --all flag is never consulted below; only
    # --all-users is forwarded to _find_matching_jobs.  Confirm whether
    # `konduktor down -a` without job names is expected to work.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users)

    if not yes:
        # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
        # it exists but is STOPPED.
        prompt = (
            f'Tearing down job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        if prompt is not None:
            click.confirm(prompt, default=True, abort=True, show_default=True)

    # Delete each matched jobset; `track` renders a progress bar.
    for job in track(filtered_jobs, description='Tearing down job(s)...'):
        jobset_utils.delete_jobset(namespace, job)
1125
+
1126
+
1127
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option('--all', '-a', default=None, is_flag=True, help='Suspend all jobs.')
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for suspension',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def stop(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    """Suspend job(s) (manual/user-initiated).

    JOB is the name of the job to suspend. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Suspending a job will pause execution and mark the job as SUSPENDED (by user).
    The job can be resumed later with `konduktor start`.

    If a job is suspended by the system (e.g., due to queueing),
    it will show as SUSPENDED (by system).

    Wildcard patterns are supported using * characters.
    Examples: "my_job-*" matches all jobs starting with "my_job-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Suspend a specific job.
      konduktor stop my_job
      \b
      # Suspend multiple jobs.
      konduktor stop my_job1 my_job2
      \b
      # Suspend all jobs matching a pattern.
      konduktor stop "my_job-*"
      \b
      # Suspend all of this users jobs.
      konduktor stop -a
      konduktor stop --all

      # Suspend all jobs across all users
      konduktor stop --all --all-users

    """

    # NOTE(review): the parsed --all flag is never consulted below; only
    # --all-users is forwarded to _find_matching_jobs.  Confirm whether
    # `konduktor stop -a` without job names is expected to work.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users)

    if not yes:
        # Prompt for confirmation
        prompt = (
            f'Suspending job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        if prompt is not None:
            click.confirm(prompt, default=True, abort=True, show_default=True)

    # Suspend each matched jobset; `track` renders a progress bar.
    for job in track(filtered_jobs, description='Suspending job(s)...'):
        jobset_utils.stop_jobset(namespace, job)

    # Show the user how to resume / inspect the suspended jobs.
    click.secho(
        ux_utils.command_hint_messages(
            ux_utils.CommandHintType.JOB_STOP, filtered_jobs
        ),
        fg='green',
        bold=True,
    )
1218
+
1219
+
1220
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'jobs',
    nargs=-1,
    required=False,
)
@click.option(
    '--all', '-a', default=None, is_flag=True, help='Resume all suspended jobs.'
)
@click.option(
    '--all-users',
    '--all_users',
    default=False,
    is_flag=True,
    help='Include other users for resumption',
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def start(
    jobs: List[str],
    all: Optional[bool],
    all_users: Optional[bool],
    yes: bool,
):
    """Resume suspended job(s) (manual/user-initiated).

    JOB is the name of the job to resume. If both
    JOB and ``--all`` are supplied, the latter takes precedence.

    Resuming a job will restart execution from where it was suspended.
    Only suspended jobs can be resumed.

    This command works for both manually suspended jobs (SUSPENDED by user)
    and system-suspended jobs (SUSPENDED by system).

    Wildcard patterns are supported using * characters.
    Examples: "my_job-*" matches all jobs starting with "my_job-",
    "*-gpu" matches all jobs ending with "-gpu".

    Examples:

    .. code-block:: bash

      # Resume a specific job.
      konduktor start my_job
      \b
      # Resume multiple jobs.
      konduktor start my_job1 my_job2
      \b
      # Resume all jobs matching a pattern.
      konduktor start "my_job-*"
      \b
      # Resume all of this users suspended jobs.
      konduktor start -a
      konduktor start --all

      # Resume all suspended jobs across all users
      konduktor start --all --all-users

    """

    # NOTE(review): the parsed --all-users flag is never consulted below --
    # confirm whether `konduktor start --all-users` is expected to widen the
    # selection beyond the current namespace listing.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    jobs_response = jobset_utils.list_jobset(namespace)
    assert jobs_response
    jobs_specs = [job for job in jobs_response['items']]

    if all:
        # Only get suspended jobs when using --all.
        # BUGFIX: `replicatedJobsStatus` may be present but None or an empty
        # list (e.g. on a freshly created jobset); `.get(..., [{}])` only
        # covers a *missing* key, so indexing [0] could raise IndexError.
        # `or [{}]` treats empty/None the same as missing: "not suspended".
        suspended_jobs = [
            job['metadata']['name']
            for job in jobs_specs
            if (job.get('status', {}).get('replicatedJobsStatus') or [{}])[0].get(
                'suspended', False
            )
        ]
        if not suspended_jobs:
            raise click.ClickException(
                f'No suspended jobs found in namespace {namespace}'
            )
        jobs = suspended_jobs
    elif jobs:
        # Get all available jobs to match against patterns
        if len(jobs_specs) == 0:
            raise click.ClickException(f'No jobs found in namespace {namespace}')

        all_job_names = [job['metadata']['name'] for job in jobs_specs]
        matched_jobs = []

        for job_pattern in jobs:
            # Use fnmatch for both wildcard and exact pattern matching
            pattern_matches = fnmatch.filter(all_job_names, job_pattern)
            if not pattern_matches:
                click.secho(
                    f'Warning: No jobs found matching pattern "{job_pattern}"',
                    fg='yellow',
                    err=True,
                )
            matched_jobs.extend(pattern_matches)

        # Remove duplicates while preserving order
        seen = set()
        jobs = []
        for job in matched_jobs:
            if job not in seen:
                seen.add(job)
                jobs.append(job)

        if not jobs:
            raise click.ClickException(
                f'No matching jobs found check status with '
                f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
            )
    else:
        # Neither --all nor explicit names/patterns were given.
        raise click.ClickException(
            'No jobs specified. Use --all to resume '
            'all suspended jobs or specify job names/patterns.'
        )

    if not yes:
        # Prompt for confirmation
        prompt = (
            f'Resuming job(s) {colorama.Style.BRIGHT} '
            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
            'Proceed?'
        )
        if prompt is not None:
            click.confirm(prompt, default=True, abort=True, show_default=True)

    # Resume each selected jobset; `track` renders a progress bar.
    for job in track(jobs, description='Resuming job(s)...'):
        jobset_utils.start_jobset(namespace, job)
1357
+
1358
+
1359
@cli.command(cls=_DocumentedCodeCommand)
@click.argument(
    'clouds',
    required=True,
    type=str,
    nargs=-1,
)
def check(clouds: Tuple[str]):
    """Check which clouds are available to use for storage

    This checks storage credentials for a cloud supported by konduktor. If a
    cloud is detected to be inaccessible, the reason and correction steps will
    be shown.

    If CLOUDS are specified, checks credentials for only those clouds.

    The enabled clouds are cached and form the "search space" to be considered
    for each task.

    Examples:

    .. code-block:: bash

      # Check only specific clouds - gs, s3.
      konduktor check gs
      konduktor check s3
    """
    # An empty tuple means "no specific clouds requested"; forward None so
    # the checker considers every supported cloud.
    konduktor_check.check(clouds=clouds or None)
1388
+
1389
+
1390
class KeyValueType(click.ParamType):
    """Click parameter type for KEY=VALUE pairs, split on the first '='."""

    name = 'key=value'

    def convert(self, value, param, ctx):
        # partition() yields an empty separator when no '=' is present.
        key, sep, val = value.partition('=')
        if not sep:
            self.fail(f'{value!r} is not a valid key=value pair', param, ctx)
        return key, val
1398
+
1399
+
1400
# Shared click options for `konduktor secret create`, attached via
# `_add_click_options`.  Exactly one of --inline / --from-file /
# --from-directory is expected; that constraint is validated inside the
# command body, not by click.
_SECRET_CREATE_OPTIONS = [
    click.option(
        '--inline',
        type=KeyValueType(),
        help='Key=value pair to store as an env secret (only valid with --kind env).',
    ),
    click.option(
        '--from-file',
        '--from_file',
        type=click.Path(dir_okay=False),
        help='Path to a single file to store as a secret.',
    ),
    click.option(
        '--from-directory',
        '--from_directory',
        type=click.Path(file_okay=False),
        help='Path to a directory to store as a multi-file secret.',
    ),
    click.option(
        '--kind',
        default='default',
        type=click.Choice(['default', 'env', 'git-ssh']),
        help='Type of secret being created. More kinds coming soon.',
    ),
]
1425
+
1426
+
1427
@cli.group(cls=_NaturalOrderGroup)
def secret():
    """Manage secrets used in Konduktor.

    USAGE: konduktor secret COMMAND

    \b
    Use one of the following COMMANDS:
    create [FLAGS] [NAME]
    delete [NAME]
    list [FLAGS]

    \b
    Examples:
    konduktor secret create --kind git-ssh --from-file=~/.ssh/id_rsa my-ssh-name
    konduktor secret create --kind env --inline FOO=bar my-env-name
    konduktor secret delete my-ssh-name
    konduktor secret list

    \b
    For details on COMMAND ARGS:
    konduktor secret create -h
    konduktor secret list -h
    """
1451
+
1452
+
1453
@_add_click_options(_SECRET_CREATE_OPTIONS)
@secret.command()
@click.argument('name', required=True)
def create(kind, from_file, from_directory, inline, name):
    """Create a new secret.

    Exactly one source must be provided: --inline (with --kind env),
    --from-file, or --from-directory.  The stored Kubernetes secret name is
    NAME plus a per-user hash suffix; NAME itself is kept in a basename
    label for later lookup by `secret delete` / `secret list`.
    """

    # Secret names become Kubernetes object names, so validate them first.
    if not kubernetes_utils.is_k8s_resource_name_valid(name):
        raise click.BadParameter(
            f'Invalid secret name: {name}. '
            f'Name must consist of lower case alphanumeric characters or -, '
            f'and must start and end with alphanumeric characters.',
        )

    basename = name
    # Suffix with the user hash so different users can reuse the same NAME.
    secret_name = f'{basename}-{common_utils.get_user_hash()}'

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Expand ~ in user-supplied paths before any existence checks.
    from_file = os.path.expanduser(from_file) if from_file else None
    from_directory = os.path.expanduser(from_directory) if from_directory else None

    # One flag per possible source; exactly one must be set.
    sources = [bool(from_file), bool(from_directory), bool(inline)]

    if sources.count(True) > 1:
        raise click.UsageError(
            'Only one of --from-file, --from-directory, or --inline can be used.\n'
            'Examples:\n'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name\n{colorama.Style.RESET_ALL}'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    if sources.count(True) == 0:
        raise click.UsageError(
            'You must specify one of --from-file, --from-directory, or --inline.\n'
            'Examples:\n'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name\n{colorama.Style.RESET_ALL}'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    if from_file and not os.path.isfile(from_file):
        raise click.BadParameter(
            f'--from-file {from_file} does not exist or is not a file'
        )
    if from_directory and not os.path.isdir(from_directory):
        raise click.BadParameter(
            f'--from-directory {from_directory} does not exist or is not a directory'
        )

    # Kind-specific source constraints.
    if kind == 'git-ssh' and not from_file:
        raise click.UsageError(
            '--kind git-ssh requires --from-file (not --from-directory or --inline). \n'
            'Example:\n'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind git-ssh '
            f'--from-file=~/.ssh/id_rsa my-ssh-name{colorama.Style.RESET_ALL}'
        )
    if kind == 'env' and not inline:
        raise click.UsageError(
            '--kind env requires --inline (not --from-file or --from-directory). \n'
            'Example:\n'
            f' {colorama.Style.BRIGHT}konduktor secret create --kind env '
            f'--inline FOO=bar my-env-name{colorama.Style.RESET_ALL}'
        )

    # Build the secret's data payload (values are base64-encoded strings).
    data = {}
    if from_directory:
        click.echo(f'Creating secret from directory: {from_directory}')
        # Use ABSOLUTE directory path so the top-level folder name is preserved
        base_dir_abs = os.path.abspath(os.path.expanduser(from_directory))
        if not os.path.isdir(base_dir_abs):
            raise click.BadParameter(
                f"--from-directory {from_directory} doesn't exist or is not a directory"
            )
        # Ensure there is at least one file inside
        if not any(p.is_file() for p in pathlib.Path(base_dir_abs).rglob('*')):
            raise click.BadParameter(f'--from-directory {from_directory} is empty.')

        # Zip + base64 the WHOLE directory (this preserves the inner structure)
        archive_b64 = base64_utils.zip_base64encode([base_dir_abs])

        # Store as a single key; pod will unzip to the expanded path
        data = {'payload.zip': archive_b64}
    elif from_file:
        click.echo(f'Creating secret from file: {from_file}')
        key = os.path.basename(from_file)
        # git-ssh keys use a fixed key name the pod-side mount expects.
        if kind == 'git-ssh':
            key = 'gitkey'
        try:
            with open(from_file, 'rb') as f:
                data[key] = b64encode(f.read()).decode()
        except OSError as e:
            raise click.ClickException(f'Failed to read {kind} file {from_file}: {e}')
    else:
        click.echo('Creating secret from inline key=value pair')
        key, value = inline
        data = {key: b64encode(value.encode()).decode()}

    # Labels drive ownership filtering and basename lookup in other commands.
    secret_metadata = {
        'name': secret_name,
        'labels': {
            'parent': 'konduktor',
            backend_constants.SECRET_OWNER_LABEL: common_utils.get_user_hash(),
            backend_constants.SECRET_BASENAME_LABEL: basename,
            # NOTE(review): --kind defaults to 'default', so `kind or None`
            # can never actually be None here -- confirm intent.
            backend_constants.SECRET_KIND_LABEL: kind or None,
        },
    }

    # Limit --kind git-ssh secret to 1 max per user
    # Overwrites if user trying to create more than 1
    if kind == 'git-ssh':
        user_hash = common_utils.get_user_hash()
        label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
        existing = kubernetes_utils.list_secrets(
            namespace, context, label_filter=label_selector
        )
        for s in existing:
            labels = s.metadata.labels or {}
            if labels.get(backend_constants.SECRET_KIND_LABEL) == 'git-ssh':
                old_name = s.metadata.name
                click.echo(f'Found existing git-ssh secret: {old_name}, deleting it.')
                kubernetes_utils.delete_secret(
                    name=old_name, namespace=namespace, context=context
                )
                break

    ok, err = kubernetes_utils.set_secret(
        secret_name=secret_name,
        namespace=namespace,
        context=context,
        data=data,
        secret_metadata=secret_metadata,
    )
    if not ok:
        raise click.ClickException(f'Failed to create secret: {err}')
    click.secho(f'Secret {basename} created in namespace {namespace}.', fg='green')
1592
+
1593
+
1594
@secret.command()
@click.argument('name', required=True)
def delete(name):
    """Delete a secret by name."""

    # Resolve the namespace for the active kubeconfig context and restrict
    # the listing to secrets owned by the current user.
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    owner_selector = (
        f'{backend_constants.SECRET_OWNER_LABEL}={common_utils.get_user_hash()}'
    )
    owned_secrets = kubernetes_utils.list_secrets(
        namespace, context, label_filter=owner_selector
    )

    # The CLI-facing name lives in the basename label; the real k8s object
    # name carries a per-user hash suffix.
    matches = []
    for candidate in owned_secrets:
        labels = candidate.metadata.labels
        if labels and labels.get(backend_constants.SECRET_BASENAME_LABEL) == name:
            matches.append(candidate)

    if not matches:
        raise click.ClickException(
            f'No secret named "{name}" owned by you found in namespace {namespace}.'
        )
    if len(matches) > 1:
        raise click.ClickException(f'Multiple secrets with basename "{name}" found.')

    ok, err = kubernetes_utils.delete_secret(
        matches[0].metadata.name, namespace, context
    )
    if not ok:
        raise click.ClickException(f'Failed to delete secret: {err}')
    click.secho(f'Secret {name} deleted from namespace {namespace}.', fg='yellow')
1628
+
1629
+
1630
@secret.command(name='list')
@click.option(
    '--all-users',
    '--all_users',
    '-u',
    is_flag=True,
    default=False,
    help='Show all secrets, including those not owned by the current user.',
)
def list_secrets(all_users: bool):
    """List secrets in the namespace.
    Defaults to only your secrets unless --all-users is set."""

    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # Either list everything, or filter by the current user's owner label.
    if all_users:
        secrets = kubernetes_utils.list_secrets(namespace, context)
    else:
        user_hash = common_utils.get_user_hash()
        username = common_utils.get_cleaned_username()
        selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
        secrets = kubernetes_utils.list_secrets(
            namespace, context, label_filter=selector
        )

    if not secrets:
        empty_msg = (
            f'No secrets found in {namespace}.'
            if all_users
            else f'No secrets found for {username} in {namespace}.'
        )
        click.secho(empty_msg, fg='yellow')
        return

    header = (
        f'All secrets in {namespace} namespace:\n'
        if all_users
        else f'Secrets in {namespace} namespace owned by you:\n'
    )
    click.secho(header, bold=True)

    # Display the CLI-facing basename; fall back to the raw object name.
    for item in secrets:
        labels = item.metadata.labels or {}
        basename = labels.get(
            backend_constants.SECRET_BASENAME_LABEL, item.metadata.name
        )
        kind = labels.get(backend_constants.SECRET_KIND_LABEL, '(none)')
        if all_users:
            owner = labels.get(backend_constants.SECRET_OWNER_LABEL, '(none)')
            click.echo(f'{basename:30} kind={kind:10} owner={owner}')
        else:
            click.echo(f'{basename:30} kind={kind:10}')
1678
+
1679
+
1680
@cli.group(cls=_NaturalOrderGroup)
def serve():
    """Manage deployment serving with Konduktor.

    USAGE: konduktor serve COMMAND

    \b
    Use one of the following COMMANDS:
    launch
    down
    status

    \b
    Examples:
    konduktor serve launch my-deployment
    konduktor serve down my-deployment
    konduktor serve status

    \b
    For details on COMMAND ARGS:
    konduktor serve launch -h
    konduktor serve down -h
    konduktor serve status -h
    """
    # Group container only; subcommands are attached via @serve.command.
    pass
1705
+
1706
+
1707
@serve.command(name='launch')
@click.argument(
    'entrypoint',
    required=False,
    type=str,
    nargs=-1,
)
@click.option(
    '--dryrun',
    default=False,
    is_flag=True,
    help='If True, do not actually run the job.',
)
@click.option(
    '--detach-run',
    '-d',
    default=False,
    is_flag=True,
    help=(
        'If True, as soon as a job is submitted, return from this call '
        'and do not stream execution logs.'
    ),
)
@_add_click_options(
    _TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS + _EXTRA_SERVING_OPTIONS
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    # Disabling quote check here, as there seems to be a bug in pylint,
    # which incorrectly recognizes the help string as a docstring.
    # pylint: disable=bad-docstring-quotes
    help='Skip confirmation prompt.',
)
@click.option(
    '--skip-image-check',
    '-s',
    is_flag=True,
    default=False,
    help='Skip Docker image validation checks for faster startup.',
)
def serve_launch(
    entrypoint: Tuple[str, ...],
    dryrun: bool,
    detach_run: bool,
    name: Optional[str],
    workdir: Optional[str],
    cloud: Optional[str],
    gpus: Optional[str],
    cpus: Optional[str],
    memory: Optional[str],
    num_nodes: Optional[int],
    max_restarts: Optional[int],
    completions: Optional[int],
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
    disk_size: Optional[int],
    min_replicas: Optional[int],
    max_replicas: Optional[int],
    ports: Optional[int],
    probe: Optional[str],
    yes: bool,
    skip_image_check: bool = False,
):
    """Launch a deployment to serve.

    If ENTRYPOINT points to a valid YAML file, it is read in as the task
    specification. Otherwise, it is interpreted as a bash command.
    """
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    # --env-file entries form the base; explicit --env pairs override them.
    env = _merge_env_vars(env_file, env)

    # Downstream image validation is toggled via this process-level env var.
    if skip_image_check:
        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'

    # Build the Task including the serving-specific knobs (replica bounds,
    # port, probe) -- the counterpart of `launch`, which disables them.
    task = _make_task_with_overrides(
        entrypoint=entrypoint,
        name=name,
        workdir=workdir,
        cloud=cloud,
        gpus=gpus,
        cpus=cpus,
        memory=memory,
        num_nodes=num_nodes,
        max_restarts=max_restarts,
        completions=completions,
        image_id=image_id,
        env=env,
        disk_size=disk_size,
        # serving stuff
        min_replicas=min_replicas,
        max_replicas=max_replicas,
        ports=ports,
        probe=probe,
    )

    # Summarize the requested resources for the user before confirmation.
    click.secho(
        f'Considered resources ({task.num_nodes} nodes):', fg='green', bold=True
    )
    table_kwargs = {
        'hrules': prettytable.FRAME,
        'vrules': prettytable.NONE,
        'border': True,
    }
    headers = ['CPUs', 'Mem (GB)', 'GPUs']
    table = log_utils.create_table(headers, **table_kwargs)
    assert task.resources is not None
    table.add_row(
        [task.resources.cpus, task.resources.memory, task.resources.accelerators]
    )
    print(table)

    # This entry point is for serving only; batch jobs go through `launch`.
    if not task.serving:
        raise click.UsageError(
            'No serving information detected. '
            'Use `konduktor launch` instead for workloads.'
        )

    job_name = _launch_with_confirm(
        task,
        dryrun=dryrun,
        detach_run=detach_run,
        no_confirm=yes,
        serving=bool(task.serving),
    )

    click.secho(f'Deployment Name: {job_name}', fg='green', bold=True)
1838
+
1839
+
1840
@serve.command(name='down')
@click.argument('names', nargs=-1, required=False)
@click.option(
    '--all', '-a', default=False, is_flag=True, help='Tear down all deployments.'
)
@click.option(
    '--yes',
    '-y',
    is_flag=True,
    default=False,
    required=False,
    help='Skip confirmation prompt.',
)
def serve_down(
    names: List[str],
    all: bool,
    yes: bool,
):
    """Tear down deployments (Deployment, Service, PodAutoscaler).

    Use --all or -a to tear down all deployments.

    Examples:

    \b
    konduktor serve down my-deployment
    konduktor serve down -a
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # All deployment names currently present in the namespace; the selection
    # below (--all or fnmatch patterns) is resolved against this list.
    all_models = deployment_utils.list_models(namespace)

    if all:
        names = all_models
        # An empty namespace is not an error for --all; teardown is a no-op.
        if not names:
            logger.warning(
                f'No deployments found in namespace '
                f'{namespace}, but continuing teardown.'
            )
    elif names:
        # Resolve each name/pattern with fnmatch, then dedupe and sort.
        matched = []
        for pattern in names:
            matched.extend(fnmatch.filter(all_models, pattern))
        names = sorted(set(matched))
        if not names:
            raise click.ClickException(
                f'No matching deployments found. Check with: '
                f'{colorama.Style.BRIGHT}konduktor serve '
                f'status{colorama.Style.RESET_ALL}'
            )
    else:
        raise click.ClickException(
            'No deployments specified. Use --all to tear down all deployments '
            'or pass names/patterns.'
        )

    if not yes:
        prompt = (
            f'Tearing down deployment(s) '
            f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}{names}'
            f'{colorama.Style.RESET_ALL}. '
            f'Proceed?'
        )
        click.confirm(prompt, default=True, abort=True, show_default=True)

    # Delete each deployment's serving specs; `track` renders a progress bar.
    for name in track(names, description='Tearing down deployment(s)...'):
        deployment_utils.delete_serving_specs(name, namespace)
1908
+
1909
+
1910
@serve.command(name='status')
@click.option(
    '--all-users',
    '-u',
    default=False,
    is_flag=True,
    required=False,
    help='Show all deployments, including those not owned by the ' 'current user.',
)
@click.option(
    '--direct',
    '-d',
    default=False,
    is_flag=True,
    required=False,
    help='Force display of direct IP endpoints instead of trainy.us endpoints.',
)
def serve_status(all_users: bool, direct: bool):
    """Show status of deployments launched via `konduktor serve launch`."""
    # Status is scoped to the namespace of the current kube context.
    ctx = kubernetes_utils.get_current_kube_config_context_name()
    ns = kubernetes_utils.get_kube_config_context_namespace(ctx)
    deployment_utils.show_status_table(ns, all_users=all_users, force_direct=direct)
1934
+
1935
+
1936
def main():
    """CLI entry point.

    Runs the Click group with ``standalone_mode=False`` so that an abort
    (Ctrl-C or a declined confirmation prompt) can be intercepted and
    reported as a detach instead of a stack trace.

    Returns:
        Whatever the invoked command returns, ``None`` on abort, or the
        exit code of a rendered ``ClickException``.
    """
    try:
        return cli(standalone_mode=False)
    except click.exceptions.Abort:
        # User aborted (Ctrl-C / answered "no" to an abort=True prompt).
        click.secho('Detaching...', fg='yellow', bold=True)
        return None
    except click.ClickException as e:
        # With standalone_mode=False, Click does NOT render ClickException/
        # UsageError itself (see Click docs on standalone_mode). Without
        # this handler, errors raised via `raise click.ClickException(...)`
        # elsewhere in this module would surface as raw tracebacks instead
        # of their intended one-line messages.
        e.show()
        return e.exit_code
1942
+
1943
+
1944
# Allow running this module directly (e.g. `python cli.py`) in addition to
# the installed console-script entry point.
if __name__ == '__main__':
    main()