konduktor-nightly 0.1.0.dev20251107104752__py3-none-any.whl → 0.1.0.dev20251215105431__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +2 -2
- konduktor/backends/pod_utils.py +14 -9
- konduktor/cli.py +287 -295
- konduktor/utils/log_utils.py +8 -5
- {konduktor_nightly-0.1.0.dev20251107104752.dist-info → konduktor_nightly-0.1.0.dev20251215105431.dist-info}/METADATA +2 -1
- {konduktor_nightly-0.1.0.dev20251107104752.dist-info → konduktor_nightly-0.1.0.dev20251215105431.dist-info}/RECORD +9 -9
- {konduktor_nightly-0.1.0.dev20251107104752.dist-info → konduktor_nightly-0.1.0.dev20251215105431.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20251107104752.dist-info → konduktor_nightly-0.1.0.dev20251215105431.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20251107104752.dist-info → konduktor_nightly-0.1.0.dev20251215105431.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
|
@@ -11,7 +11,7 @@ from konduktor.task import Task
|
|
|
11
11
|
__all__ = ['launch', 'Resources', 'Task', 'Serving']
|
|
12
12
|
|
|
13
13
|
# Replaced with the current commit when building the wheels.
|
|
14
|
-
_KONDUKTOR_COMMIT_SHA = '
|
|
14
|
+
_KONDUKTOR_COMMIT_SHA = '421390595e3a1b9f263e790323deae61d94da231'
|
|
15
15
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
|
16
16
|
|
|
17
17
|
|
|
@@ -45,5 +45,5 @@ def _get_git_commit():
|
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
__commit__ = _get_git_commit()
|
|
48
|
-
__version__ = '1.0.0.dev0.1.0.
|
|
48
|
+
__version__ = '1.0.0.dev0.1.0.dev20251215105431'
|
|
49
49
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
konduktor/backends/pod_utils.py
CHANGED
|
@@ -28,6 +28,8 @@ if typing.TYPE_CHECKING:
|
|
|
28
28
|
logger = logging.get_logger(__name__)
|
|
29
29
|
|
|
30
30
|
_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
|
|
31
|
+
# Use a large default (7 days) to mimic "infinite" runtime.
|
|
32
|
+
_DEFAULT_MAX_RUN_DURATION_SECONDS = 604800
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
@@ -471,18 +473,21 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
|
|
|
471
473
|
jobset_spec: The JobSet spec dictionary to modify
|
|
472
474
|
task: The task object containing resource information
|
|
473
475
|
"""
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
jobset_spec['jobset']['metadata']
|
|
480
|
-
|
|
476
|
+
assert task.resources is not None, 'Task resources are required'
|
|
477
|
+
labels = task.resources.labels or {}
|
|
478
|
+
|
|
479
|
+
# Add max run duration annotation, defaulting to a practically infinite value.
|
|
480
|
+
maxRunDurationSeconds = labels.get('maxRunDurationSeconds')
|
|
481
|
+
metadata = jobset_spec['jobset']['metadata']
|
|
482
|
+
metadata.setdefault('annotations', {})[_RUN_DURATION_ANNOTATION_KEY] = str(
|
|
483
|
+
maxRunDurationSeconds
|
|
484
|
+
if maxRunDurationSeconds is not None
|
|
485
|
+
else _DEFAULT_MAX_RUN_DURATION_SECONDS
|
|
481
486
|
)
|
|
482
487
|
|
|
483
488
|
# Inject resource labels into JobSet metadata.
|
|
484
|
-
if
|
|
485
|
-
jobset_spec['jobset']['metadata']['labels'].update(
|
|
489
|
+
if labels:
|
|
490
|
+
jobset_spec['jobset']['metadata']['labels'].update(labels)
|
|
486
491
|
|
|
487
492
|
|
|
488
493
|
def merge_pod_into_jobset_template(
|
konduktor/cli.py
CHANGED
|
@@ -34,6 +34,7 @@ listed in "konduktor --help". Take care to put logically connected commands clo
|
|
|
34
34
|
each other.
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
+
import difflib
|
|
37
38
|
import fnmatch
|
|
38
39
|
import os
|
|
39
40
|
import pathlib
|
|
@@ -273,22 +274,20 @@ _TASK_OPTIONS = [
|
|
|
273
274
|
'--env-file',
|
|
274
275
|
required=False,
|
|
275
276
|
type=dotenv.dotenv_values,
|
|
276
|
-
help=
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
277
|
+
help=(
|
|
278
|
+
'Path to a dotenv file with environment variables to set on the '
|
|
279
|
+
'remote node. If any values from ``--env-file`` conflict '
|
|
280
|
+
'with values set by ``--env``, the ``--env`` value will '
|
|
281
|
+
'be preferred.'
|
|
282
|
+
),
|
|
282
283
|
),
|
|
283
284
|
click.option(
|
|
284
285
|
'--env',
|
|
285
286
|
required=False,
|
|
286
287
|
type=_parse_env_var,
|
|
287
288
|
multiple=True,
|
|
288
|
-
help="""
|
|
289
|
-
Environment variable to set on the remote node.
|
|
290
|
-
It can be specified multiple times.
|
|
291
|
-
Examples:
|
|
289
|
+
help="""\\
|
|
290
|
+
Environment variable to set on the remote node. It can be specified multiple times:
|
|
292
291
|
|
|
293
292
|
\b
|
|
294
293
|
1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.
|
|
@@ -298,7 +297,7 @@ _TASK_OPTIONS = [
|
|
|
298
297
|
is run.
|
|
299
298
|
|
|
300
299
|
3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
|
|
301
|
-
same value of ``$MY_ENV3`` in the local environment.""",
|
|
300
|
+
same value of ``$MY_ENV3`` in the local environment.""", # noqa: E501,
|
|
302
301
|
),
|
|
303
302
|
]
|
|
304
303
|
_TASK_OPTIONS_WITH_NAME = [
|
|
@@ -320,10 +319,10 @@ _EXTRA_RESOURCES_OPTIONS = [
|
|
|
320
319
|
type=str,
|
|
321
320
|
help=(
|
|
322
321
|
'Type and number of GPUs to use. Example values: '
|
|
323
|
-
'"V100:8", "V100" (short for a count of 1)'
|
|
322
|
+
'"V100:8", "V100" (short for a count of 1) '
|
|
324
323
|
'If a new cluster is being launched by this command, this is the '
|
|
325
|
-
'resources to provision. If an existing cluster is being reused, this'
|
|
326
|
-
"
|
|
324
|
+
'resources to provision. If an existing cluster is being reused, this '
|
|
325
|
+
"is seen as the task demand, which must fit the cluster's total "
|
|
327
326
|
'resources and is used for scheduling the task. '
|
|
328
327
|
'Overrides the "accelerators" '
|
|
329
328
|
'config in the YAML if both are supplied. '
|
|
@@ -624,7 +623,7 @@ def cli():
|
|
|
624
623
|
default=False,
|
|
625
624
|
is_flag=True,
|
|
626
625
|
required=False,
|
|
627
|
-
help='Show all
|
|
626
|
+
help='Show all jobs, including those not owned by the current user.',
|
|
628
627
|
)
|
|
629
628
|
@click.option(
|
|
630
629
|
'--limit',
|
|
@@ -660,19 +659,19 @@ def status(
|
|
|
660
659
|
|
|
661
660
|
\b
|
|
662
661
|
Examples:
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
662
|
+
konduktor status --limit 10
|
|
663
|
+
konduktor status --before "08/06/25 03:53PM"
|
|
664
|
+
konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
|
|
666
665
|
|
|
667
666
|
\b
|
|
668
667
|
Notes:
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
Example: "03:53:55PM" → "03:53PM"
|
|
675
|
-
|
|
668
|
+
• When using --before or --after timestamps, "08/06/25" is
|
|
669
|
+
equivalent to "08/06/25 00:00".
|
|
670
|
+
• "03:53PM" is equivalent to "03:53:00PM".
|
|
671
|
+
• Timestamps shown in "konduktor status" are truncated and are in
|
|
672
|
+
the local timezone.
|
|
673
|
+
Example: "03:53:55PM" → "03:53PM" would show up in --after "03:53PM"
|
|
674
|
+
but not in --before "03:53PM".
|
|
676
675
|
"""
|
|
677
676
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
678
677
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
@@ -690,8 +689,8 @@ def status(
|
|
|
690
689
|
is_flag=True,
|
|
691
690
|
default=False,
|
|
692
691
|
help=(
|
|
693
|
-
'If specified, do not show logs but exit with a status code
|
|
694
|
-
"job's status: 0 for succeeded, or 1 for all other statuses."
|
|
692
|
+
'[DEPRECATED] If specified, do not show logs but exit with a status code '
|
|
693
|
+
"for the job's status: 0 for succeeded, or 1 for all other statuses."
|
|
695
694
|
),
|
|
696
695
|
)
|
|
697
696
|
@click.option(
|
|
@@ -701,12 +700,13 @@ def status(
|
|
|
701
700
|
help=(
|
|
702
701
|
'Follow the logs of a job. '
|
|
703
702
|
'If --no-follow is specified, print the log so far and exit. '
|
|
704
|
-
'
|
|
703
|
+
'(default: --follow)'
|
|
705
704
|
),
|
|
706
705
|
)
|
|
707
706
|
@click.option(
|
|
708
707
|
'--num-lines',
|
|
709
|
-
'--num_lines'
|
|
708
|
+
'--num_lines',
|
|
709
|
+
'-n',
|
|
710
710
|
default=-1,
|
|
711
711
|
type=int,
|
|
712
712
|
help=(
|
|
@@ -722,6 +722,19 @@ def status(
|
|
|
722
722
|
type=int,
|
|
723
723
|
help='The node rank to tail logs from.',
|
|
724
724
|
)
|
|
725
|
+
@click.option(
|
|
726
|
+
'--start-offset',
|
|
727
|
+
'--start_offset',
|
|
728
|
+
type=str,
|
|
729
|
+
required=False,
|
|
730
|
+
default='1h',
|
|
731
|
+
help=(
|
|
732
|
+
'Choose how much time from now to look back in logs. '
|
|
733
|
+
'Examples: 30s, 5m, 2h, 1d. Default is 1h. '
|
|
734
|
+
'Note: currently only applies when streaming (default --follow). '
|
|
735
|
+
'With --no-follow, all available logs are returned.'
|
|
736
|
+
),
|
|
737
|
+
)
|
|
725
738
|
@click.argument('job_id', type=str, nargs=1)
|
|
726
739
|
# TODO(zhwu): support logs by job name
|
|
727
740
|
def logs(
|
|
@@ -730,11 +743,12 @@ def logs(
|
|
|
730
743
|
follow: bool,
|
|
731
744
|
num_lines: int,
|
|
732
745
|
node_rank: int,
|
|
746
|
+
start_offset: str,
|
|
733
747
|
):
|
|
734
748
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
735
749
|
"""Retrieve/tail the log of a job."""
|
|
736
750
|
if status:
|
|
737
|
-
raise click.UsageError('`--status` is being deprecated
|
|
751
|
+
raise click.UsageError('`--status` is being deprecated')
|
|
738
752
|
|
|
739
753
|
# Check if the job exists
|
|
740
754
|
if not job_id:
|
|
@@ -748,20 +762,41 @@ def logs(
|
|
|
748
762
|
try:
|
|
749
763
|
_ = jobset_utils.get_jobset(namespace, job_id)
|
|
750
764
|
except jobset_utils.JobNotFoundError:
|
|
751
|
-
|
|
765
|
+
message = (
|
|
752
766
|
f"Job '{job_id}' not found in namespace '{namespace}'. "
|
|
753
|
-
f'
|
|
767
|
+
f'This may be due to a typo, `konduktor down`, or garbage collected. '
|
|
754
768
|
f'Check your jobs with '
|
|
755
769
|
f'{colorama.Style.BRIGHT}`konduktor status`'
|
|
756
|
-
f'{colorama.Style.RESET_ALL}.'
|
|
757
|
-
fg='yellow',
|
|
770
|
+
f'{colorama.Style.RESET_ALL}.'
|
|
758
771
|
)
|
|
759
772
|
|
|
773
|
+
# Try to find near string matches to help with typos.
|
|
774
|
+
try:
|
|
775
|
+
job_specs = jobset_utils.list_jobset(namespace)
|
|
776
|
+
job_names = [
|
|
777
|
+
item['metadata']['name'] for item in (job_specs or {}).get('items', [])
|
|
778
|
+
]
|
|
779
|
+
close_matches = difflib.get_close_matches(
|
|
780
|
+
job_id, job_names, n=3, cutoff=0.4
|
|
781
|
+
)
|
|
782
|
+
except Exception:
|
|
783
|
+
close_matches = []
|
|
784
|
+
|
|
785
|
+
if close_matches:
|
|
786
|
+
suggestions = ', '.join(
|
|
787
|
+
f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{name}{colorama.Style.NORMAL}'
|
|
788
|
+
for name in close_matches
|
|
789
|
+
)
|
|
790
|
+
message += f'{colorama.Fore.YELLOW} Did you mean: {suggestions}?'
|
|
791
|
+
|
|
792
|
+
click.secho(message, fg='yellow')
|
|
793
|
+
|
|
760
794
|
log_utils.tail_logs(
|
|
761
795
|
job_id,
|
|
762
796
|
worker_id=node_rank,
|
|
763
797
|
follow=follow,
|
|
764
798
|
num_logs=num_lines,
|
|
799
|
+
start_offset=start_offset,
|
|
765
800
|
)
|
|
766
801
|
|
|
767
802
|
|
|
@@ -829,8 +864,10 @@ def launch(
|
|
|
829
864
|
):
|
|
830
865
|
"""Launch a task.
|
|
831
866
|
|
|
832
|
-
|
|
833
|
-
|
|
867
|
+
\b
|
|
868
|
+
Notes:
|
|
869
|
+
• If ENTRYPOINT points to a valid YAML file, it is read in as the task
|
|
870
|
+
specification. Otherwise, it is interpreted as a bash command.
|
|
834
871
|
"""
|
|
835
872
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
836
873
|
env = _merge_env_vars(env_file, env)
|
|
@@ -902,96 +939,50 @@ def launch(
|
|
|
902
939
|
)
|
|
903
940
|
|
|
904
941
|
|
|
905
|
-
|
|
906
|
-
@click.argument(
|
|
907
|
-
'jobs',
|
|
908
|
-
nargs=-1,
|
|
909
|
-
required=False,
|
|
910
|
-
)
|
|
911
|
-
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
|
|
912
|
-
@click.option(
|
|
913
|
-
'--all-users',
|
|
914
|
-
'--all_users',
|
|
915
|
-
default=False,
|
|
916
|
-
is_flag=True,
|
|
917
|
-
help='Include other users for teardown',
|
|
918
|
-
)
|
|
919
|
-
@click.option(
|
|
920
|
-
'--yes',
|
|
921
|
-
'-y',
|
|
922
|
-
is_flag=True,
|
|
923
|
-
default=False,
|
|
924
|
-
required=False,
|
|
925
|
-
help='Skip confirmation prompt.',
|
|
926
|
-
)
|
|
927
|
-
def down(
|
|
942
|
+
def _find_matching_jobs(
|
|
928
943
|
jobs: List[str],
|
|
929
|
-
|
|
944
|
+
jobs_response: Dict[str, Any],
|
|
945
|
+
namespace: str,
|
|
930
946
|
all_users: Optional[bool],
|
|
931
|
-
|
|
947
|
+
all_flag: Optional[bool] = None,
|
|
932
948
|
):
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
JOB is the name of the job to tear down. If both
|
|
937
|
-
JOB and ``--all`` are supplied, the latter takes precedence.
|
|
938
|
-
|
|
939
|
-
Tearing down a job will delete all associated containers (all billing
|
|
940
|
-
stops), and any data on the containers disks will be lost. Accelerators
|
|
941
|
-
(e.g., GPUs) that are part of the job will be deleted too.
|
|
942
|
-
|
|
943
|
-
Wildcard patterns are supported using * characters.
|
|
944
|
-
Examples: "test-*" matches all jobs starting with "test-",
|
|
945
|
-
"*-gpu" matches all jobs ending with "-gpu".
|
|
946
|
-
|
|
947
|
-
Examples:
|
|
948
|
-
|
|
949
|
-
.. code-block:: bash
|
|
950
|
-
|
|
951
|
-
# Tear down a specific job.
|
|
952
|
-
konduktor down cluster_name
|
|
953
|
-
\b
|
|
954
|
-
# Tear down multiple jobs.
|
|
955
|
-
konduktor down job1 job2
|
|
956
|
-
\b
|
|
957
|
-
# Tear down all jobs matching a pattern.
|
|
958
|
-
konduktor down "test-*"
|
|
959
|
-
\b
|
|
960
|
-
# Tear down all of this users jobs.
|
|
961
|
-
konduktor down -a
|
|
962
|
-
konduktor down --all
|
|
963
|
-
|
|
964
|
-
# Tear down all jobs across all users
|
|
965
|
-
konduktor down --all --all-users
|
|
949
|
+
"""
|
|
950
|
+
Find all jobs matching against the user specified pattern.
|
|
951
|
+
In use in `konduktor down` and `konduktor stop`
|
|
966
952
|
|
|
953
|
+
Note(asaiacai): `jobs_response` should be the list of
|
|
954
|
+
all jobsets in this namespace, not necessarily belonging
|
|
955
|
+
to this user.
|
|
967
956
|
"""
|
|
968
957
|
|
|
969
|
-
|
|
970
|
-
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
971
|
-
jobs_response = jobset_utils.list_jobset(namespace)
|
|
972
|
-
assert jobs_response
|
|
973
|
-
jobs_specs = [
|
|
974
|
-
job
|
|
975
|
-
for job in jobs_response['items']
|
|
976
|
-
if (
|
|
977
|
-
job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
|
|
978
|
-
== common_utils.user_and_hostname_hash()
|
|
979
|
-
and not all_users
|
|
980
|
-
)
|
|
981
|
-
]
|
|
958
|
+
jobs_specs = [job for job in jobs_response['items']]
|
|
982
959
|
|
|
983
|
-
if
|
|
960
|
+
if all_flag:
|
|
984
961
|
assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
962
|
+
assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
|
|
963
|
+
if all_users:
|
|
964
|
+
# --all with --all-users = all jobs of all users
|
|
965
|
+
jobs = [job['metadata']['name'] for job in jobs_specs]
|
|
966
|
+
else:
|
|
967
|
+
# --all without --all-users = all jobs of current user
|
|
968
|
+
jobs = [
|
|
969
|
+
job['metadata']['name']
|
|
970
|
+
for job in jobs_specs
|
|
971
|
+
if job['metadata']['labels'][backend_constants.USER_LABEL]
|
|
972
|
+
== common_utils.get_cleaned_username()
|
|
973
|
+
]
|
|
974
|
+
return jobs
|
|
989
975
|
elif jobs:
|
|
990
976
|
# Get all available jobs to match against patterns
|
|
991
977
|
if len(jobs_specs) == 0:
|
|
992
978
|
raise click.ClickException(f'No jobs found in namespace {namespace}')
|
|
993
979
|
|
|
994
|
-
all_job_names =
|
|
980
|
+
all_job_names = {
|
|
981
|
+
job['metadata']['name']: job['metadata']['labels'][
|
|
982
|
+
backend_constants.USER_LABEL
|
|
983
|
+
]
|
|
984
|
+
for job in jobs_specs
|
|
985
|
+
}
|
|
995
986
|
matched_jobs = []
|
|
996
987
|
|
|
997
988
|
for job_pattern in jobs:
|
|
@@ -1003,6 +994,30 @@ def down(
|
|
|
1003
994
|
fg='yellow',
|
|
1004
995
|
err=True,
|
|
1005
996
|
)
|
|
997
|
+
for matched_name in pattern_matches:
|
|
998
|
+
if all_job_names[matched_name] != common_utils.get_cleaned_username():
|
|
999
|
+
warning_label = (
|
|
1000
|
+
f'{colorama.Style.BRIGHT}{colorama.Fore.RED}Warning'
|
|
1001
|
+
f'{colorama.Style.RESET_ALL}'
|
|
1002
|
+
)
|
|
1003
|
+
job_name = (
|
|
1004
|
+
f'{colorama.Style.BRIGHT}{colorama.Fore.WHITE}{matched_name}'
|
|
1005
|
+
f'{colorama.Style.RESET_ALL}'
|
|
1006
|
+
)
|
|
1007
|
+
launched_user = (
|
|
1008
|
+
f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
|
|
1009
|
+
f'{all_job_names[matched_name]}{colorama.Style.RESET_ALL}'
|
|
1010
|
+
)
|
|
1011
|
+
current_user = (
|
|
1012
|
+
f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}'
|
|
1013
|
+
f'{common_utils.get_cleaned_username()}'
|
|
1014
|
+
f'{colorama.Style.RESET_ALL}'
|
|
1015
|
+
)
|
|
1016
|
+
logger.info(
|
|
1017
|
+
f'{warning_label}: job {job_name} was launched by '
|
|
1018
|
+
f'{launched_user}, while the current user is {current_user}',
|
|
1019
|
+
)
|
|
1020
|
+
|
|
1006
1021
|
matched_jobs.extend(pattern_matches)
|
|
1007
1022
|
|
|
1008
1023
|
# Remove duplicates while preserving order
|
|
@@ -1020,22 +1035,92 @@ def down(
|
|
|
1020
1035
|
)
|
|
1021
1036
|
else:
|
|
1022
1037
|
raise click.ClickException(
|
|
1023
|
-
'No jobs specified. Use --all to
|
|
1024
|
-
'all jobs
|
|
1038
|
+
'No jobs specified. Use --all to specify '
|
|
1039
|
+
'all jobs belonging to a user '
|
|
1040
|
+
'or specify job names/patterns.'
|
|
1025
1041
|
)
|
|
1042
|
+
return jobs
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
@cli.command(cls=_DocumentedCodeCommand)
|
|
1046
|
+
@click.argument(
|
|
1047
|
+
'jobs',
|
|
1048
|
+
nargs=-1,
|
|
1049
|
+
required=False,
|
|
1050
|
+
)
|
|
1051
|
+
@click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
|
|
1052
|
+
@click.option(
|
|
1053
|
+
'--all-users',
|
|
1054
|
+
'--all_users',
|
|
1055
|
+
default=False,
|
|
1056
|
+
is_flag=True,
|
|
1057
|
+
help='Include other users for teardown',
|
|
1058
|
+
)
|
|
1059
|
+
@click.option(
|
|
1060
|
+
'--yes',
|
|
1061
|
+
'-y',
|
|
1062
|
+
is_flag=True,
|
|
1063
|
+
default=False,
|
|
1064
|
+
required=False,
|
|
1065
|
+
help='Skip confirmation prompt.',
|
|
1066
|
+
)
|
|
1067
|
+
def down(
|
|
1068
|
+
jobs: List[str],
|
|
1069
|
+
all: Optional[bool],
|
|
1070
|
+
all_users: Optional[bool],
|
|
1071
|
+
yes: bool,
|
|
1072
|
+
):
|
|
1073
|
+
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1074
|
+
"""Tear down job(s).
|
|
1075
|
+
|
|
1076
|
+
\b
|
|
1077
|
+
Examples:
|
|
1078
|
+
# Tear down a specific job.
|
|
1079
|
+
konduktor down my_job
|
|
1080
|
+
\b
|
|
1081
|
+
# Tear down multiple jobs.
|
|
1082
|
+
konduktor down my_job1 my_job2
|
|
1083
|
+
\b
|
|
1084
|
+
# Tear down all jobs matching a pattern.
|
|
1085
|
+
konduktor down "my_job-*"
|
|
1086
|
+
\b
|
|
1087
|
+
# Tear down all of this users jobs.
|
|
1088
|
+
konduktor down -a
|
|
1089
|
+
konduktor down --all
|
|
1090
|
+
\b
|
|
1091
|
+
# Tear down all jobs across all users
|
|
1092
|
+
konduktor down --all --all-users
|
|
1093
|
+
|
|
1094
|
+
\b
|
|
1095
|
+
Notes:
|
|
1096
|
+
• If both JOB and ``--all`` are supplied, the latter takes precedence.
|
|
1097
|
+
• Tearing down a job will delete all associated containers (all billing
|
|
1098
|
+
stops), and any data on the containers disks will be lost. Accelerators
|
|
1099
|
+
(e.g., GPUs) that are part of the job will be deleted too.
|
|
1100
|
+
• Wildcard patterns are supported using * characters.
|
|
1101
|
+
Ex: "test-*" matches all jobs starting with "test-",
|
|
1102
|
+
"*-gpu" matches all jobs ending with "-gpu".
|
|
1103
|
+
|
|
1104
|
+
"""
|
|
1105
|
+
|
|
1106
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1107
|
+
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
1108
|
+
jobs_response = jobset_utils.list_jobset(namespace)
|
|
1109
|
+
assert jobs_response
|
|
1110
|
+
filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
|
|
1026
1111
|
|
|
1027
1112
|
if not yes:
|
|
1028
1113
|
# Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
|
|
1029
1114
|
# it exists but is STOPPED.
|
|
1030
1115
|
prompt = (
|
|
1031
1116
|
f'Tearing down job(s) {colorama.Style.BRIGHT} '
|
|
1032
|
-
f'{colorama.Fore.GREEN}{
|
|
1117
|
+
f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
|
|
1033
1118
|
'Proceed?'
|
|
1034
1119
|
)
|
|
1035
1120
|
if prompt is not None:
|
|
1036
1121
|
click.confirm(prompt, default=True, abort=True, show_default=True)
|
|
1037
1122
|
|
|
1038
|
-
for job in track(
|
|
1123
|
+
for job in track(filtered_jobs, description='Tearing down job(s)...'):
|
|
1039
1124
|
jobset_utils.delete_jobset(namespace, job)
|
|
1040
1125
|
|
|
1041
1126
|
|
|
@@ -1069,112 +1154,59 @@ def stop(
|
|
|
1069
1154
|
):
|
|
1070
1155
|
"""Suspend job(s) (manual/user-initiated).
|
|
1071
1156
|
|
|
1072
|
-
|
|
1073
|
-
JOB and ``--all`` are supplied, the latter takes precedence.
|
|
1074
|
-
|
|
1075
|
-
Suspending a job will pause execution and mark the job as SUSPENDED (by user).
|
|
1076
|
-
The job can be resumed later with `konduktor start`.
|
|
1077
|
-
|
|
1078
|
-
If a job is suspended by the system (e.g., due to queueing),
|
|
1079
|
-
it will show as SUSPENDED (by system).
|
|
1080
|
-
|
|
1081
|
-
Wildcard patterns are supported using * characters.
|
|
1082
|
-
Examples: "my_job-*" matches all jobs starting with "my_job-",
|
|
1083
|
-
"*-gpu" matches all jobs ending with "-gpu".
|
|
1084
|
-
|
|
1157
|
+
\b
|
|
1085
1158
|
Examples:
|
|
1159
|
+
# Suspend a specific job.
|
|
1160
|
+
konduktor stop my_job
|
|
1161
|
+
\b
|
|
1162
|
+
# Suspend multiple jobs.
|
|
1163
|
+
konduktor stop my_job1 my_job2
|
|
1164
|
+
\b
|
|
1165
|
+
# Suspend all jobs matching a pattern.
|
|
1166
|
+
konduktor stop "my_job-*"
|
|
1167
|
+
\b
|
|
1168
|
+
# Suspend all of this users jobs.
|
|
1169
|
+
konduktor stop -a
|
|
1170
|
+
konduktor stop --all
|
|
1171
|
+
\b
|
|
1172
|
+
# Suspend all jobs across all users
|
|
1173
|
+
konduktor stop --all --all-users
|
|
1086
1174
|
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
\b
|
|
1098
|
-
# Suspend all of this users jobs.
|
|
1099
|
-
konduktor stop -a
|
|
1100
|
-
konduktor stop --all
|
|
1101
|
-
|
|
1102
|
-
# Suspend all jobs across all users
|
|
1103
|
-
konduktor stop --all --all-users
|
|
1104
|
-
|
|
1175
|
+
\b
|
|
1176
|
+
Notes:
|
|
1177
|
+
• If both JOB and ``--all`` are supplied, the latter takes precedence.
|
|
1178
|
+
• Suspending a job will pause execution and mark the job as SUSPENDED (by user).
|
|
1179
|
+
The job can be resumed later with `konduktor start`.
|
|
1180
|
+
• If a job is suspended by the system (e.g., due to queueing), it
|
|
1181
|
+
will show as SUSPENDED (by system).
|
|
1182
|
+
• Wildcard patterns are supported using * characters.
|
|
1183
|
+
Ex: "test-*" matches all jobs starting with "test-",
|
|
1184
|
+
"*-gpu" matches all jobs ending with "-gpu".
|
|
1105
1185
|
"""
|
|
1106
1186
|
|
|
1107
1187
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1108
1188
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
1109
1189
|
jobs_response = jobset_utils.list_jobset(namespace)
|
|
1110
1190
|
assert jobs_response
|
|
1111
|
-
|
|
1112
|
-
job
|
|
1113
|
-
for job in jobs_response['items']
|
|
1114
|
-
if (
|
|
1115
|
-
job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
|
|
1116
|
-
== common_utils.user_and_hostname_hash()
|
|
1117
|
-
and not all_users
|
|
1118
|
-
)
|
|
1119
|
-
]
|
|
1120
|
-
|
|
1121
|
-
if all:
|
|
1122
|
-
assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
|
|
1123
|
-
assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
|
|
1124
|
-
jobs = [job['metadata']['name'] for job in jobs_specs]
|
|
1125
|
-
elif jobs:
|
|
1126
|
-
# Get all available jobs to match against patterns
|
|
1127
|
-
if len(jobs_specs) == 0:
|
|
1128
|
-
raise click.ClickException(f'No jobs found in namespace {namespace}')
|
|
1129
|
-
|
|
1130
|
-
all_job_names = [job['metadata']['name'] for job in jobs_specs]
|
|
1131
|
-
matched_jobs = []
|
|
1132
|
-
|
|
1133
|
-
for job_pattern in jobs:
|
|
1134
|
-
# Use fnmatch for both wildcard and exact pattern matching
|
|
1135
|
-
pattern_matches = fnmatch.filter(all_job_names, job_pattern)
|
|
1136
|
-
if not pattern_matches:
|
|
1137
|
-
click.secho(
|
|
1138
|
-
f'Warning: No jobs found matching pattern "{job_pattern}"',
|
|
1139
|
-
fg='yellow',
|
|
1140
|
-
err=True,
|
|
1141
|
-
)
|
|
1142
|
-
matched_jobs.extend(pattern_matches)
|
|
1143
|
-
|
|
1144
|
-
# Remove duplicates while preserving order
|
|
1145
|
-
seen = set()
|
|
1146
|
-
jobs = []
|
|
1147
|
-
for job in matched_jobs:
|
|
1148
|
-
if job not in seen:
|
|
1149
|
-
seen.add(job)
|
|
1150
|
-
jobs.append(job)
|
|
1151
|
-
|
|
1152
|
-
if not jobs:
|
|
1153
|
-
raise click.ClickException(
|
|
1154
|
-
f'No matching jobs found check status with '
|
|
1155
|
-
f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
|
|
1156
|
-
)
|
|
1157
|
-
else:
|
|
1158
|
-
raise click.ClickException(
|
|
1159
|
-
'No jobs specified. Use --all to suspend '
|
|
1160
|
-
'all jobs or specify job names/patterns.'
|
|
1161
|
-
)
|
|
1191
|
+
filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
|
|
1162
1192
|
|
|
1163
1193
|
if not yes:
|
|
1164
1194
|
# Prompt for confirmation
|
|
1165
1195
|
prompt = (
|
|
1166
1196
|
f'Suspending job(s) {colorama.Style.BRIGHT} '
|
|
1167
|
-
f'{colorama.Fore.GREEN}{
|
|
1197
|
+
f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
|
|
1168
1198
|
'Proceed?'
|
|
1169
1199
|
)
|
|
1170
1200
|
if prompt is not None:
|
|
1171
1201
|
click.confirm(prompt, default=True, abort=True, show_default=True)
|
|
1172
1202
|
|
|
1173
|
-
for job in track(
|
|
1203
|
+
for job in track(filtered_jobs, description='Suspending job(s)...'):
|
|
1174
1204
|
jobset_utils.stop_jobset(namespace, job)
|
|
1175
1205
|
|
|
1176
1206
|
click.secho(
|
|
1177
|
-
ux_utils.command_hint_messages(
|
|
1207
|
+
ux_utils.command_hint_messages(
|
|
1208
|
+
ux_utils.CommandHintType.JOB_STOP, filtered_jobs
|
|
1209
|
+
),
|
|
1178
1210
|
fg='green',
|
|
1179
1211
|
bold=True,
|
|
1180
1212
|
)
|
|
@@ -1212,54 +1244,41 @@ def start(
|
|
|
1212
1244
|
):
|
|
1213
1245
|
"""Resume suspended job(s) (manual/user-initiated).
|
|
1214
1246
|
|
|
1215
|
-
|
|
1216
|
-
JOB and ``--all`` are supplied, the latter takes precedence.
|
|
1217
|
-
|
|
1218
|
-
Resuming a job will restart execution from where it was suspended.
|
|
1219
|
-
Only suspended jobs can be resumed.
|
|
1220
|
-
|
|
1221
|
-
This command works for both manually suspended jobs (SUSPENDED by user)
|
|
1222
|
-
and system-suspended jobs (SUSPENDED by system).
|
|
1223
|
-
|
|
1224
|
-
Wildcard patterns are supported using * characters.
|
|
1225
|
-
Examples: "my_job-*" matches all jobs starting with "my_job-",
|
|
1226
|
-
"*-gpu" matches all jobs ending with "-gpu".
|
|
1227
|
-
|
|
1247
|
+
\b
|
|
1228
1248
|
Examples:
|
|
1249
|
+
# Resume a specific job.
|
|
1250
|
+
konduktor start my_job
|
|
1251
|
+
\b
|
|
1252
|
+
# Resume multiple jobs.
|
|
1253
|
+
konduktor start my_job1 my_job2
|
|
1254
|
+
\b
|
|
1255
|
+
# Resume all jobs matching a pattern.
|
|
1256
|
+
konduktor start "my_job-*"
|
|
1257
|
+
\b
|
|
1258
|
+
# Resume all of this users suspended jobs.
|
|
1259
|
+
konduktor start -a
|
|
1260
|
+
konduktor start --all
|
|
1261
|
+
\b
|
|
1262
|
+
# Resume all suspended jobs across all users
|
|
1263
|
+
konduktor start --all --all-users
|
|
1229
1264
|
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
\b
|
|
1241
|
-
# Resume all of this users suspended jobs.
|
|
1242
|
-
konduktor start -a
|
|
1243
|
-
konduktor start --all
|
|
1244
|
-
|
|
1245
|
-
# Resume all suspended jobs across all users
|
|
1246
|
-
konduktor start --all --all-users
|
|
1247
|
-
|
|
1265
|
+
\b
|
|
1266
|
+
Notes:
|
|
1267
|
+
• If both JOB and ``--all`` are supplied, the latter takes precedence.
|
|
1268
|
+
• Resuming a job will restart execution from where it was suspended.
|
|
1269
|
+
Only suspended jobs can be resumed.
|
|
1270
|
+
• This command works for both manually suspended jobs (SUSPENDED by user)
|
|
1271
|
+
and system-suspended jobs (SUSPENDED by system).
|
|
1272
|
+
• Wildcard patterns are supported using * characters.
|
|
1273
|
+
Ex: "test-*" matches all jobs starting with "test-",
|
|
1274
|
+
"*-gpu" matches all jobs ending with "-gpu".
|
|
1248
1275
|
"""
|
|
1249
1276
|
|
|
1250
1277
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1251
1278
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
1252
1279
|
jobs_response = jobset_utils.list_jobset(namespace)
|
|
1253
1280
|
assert jobs_response
|
|
1254
|
-
jobs_specs = [
|
|
1255
|
-
job
|
|
1256
|
-
for job in jobs_response['items']
|
|
1257
|
-
if (
|
|
1258
|
-
job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
|
|
1259
|
-
== common_utils.user_and_hostname_hash()
|
|
1260
|
-
and not all_users
|
|
1261
|
-
)
|
|
1262
|
-
]
|
|
1281
|
+
jobs_specs = [job for job in jobs_response['items']]
|
|
1263
1282
|
|
|
1264
1283
|
if all:
|
|
1265
1284
|
# Only get suspended jobs when using --all
|
|
@@ -1335,24 +1354,22 @@ def start(
|
|
|
1335
1354
|
nargs=-1,
|
|
1336
1355
|
)
|
|
1337
1356
|
def check(clouds: Tuple[str]):
|
|
1338
|
-
"""Check which clouds are available to use for storage
|
|
1339
|
-
|
|
1340
|
-
This checks storage credentials for a cloud supported by konduktor. If a
|
|
1341
|
-
cloud is detected to be inaccessible, the reason and correction steps will
|
|
1342
|
-
be shown.
|
|
1343
|
-
|
|
1344
|
-
If CLOUDS are specified, checks credentials for only those clouds.
|
|
1345
|
-
|
|
1346
|
-
The enabled clouds are cached and form the "search space" to be considered
|
|
1347
|
-
for each task.
|
|
1357
|
+
"""Check which clouds are available to use for storage with Konduktor
|
|
1348
1358
|
|
|
1359
|
+
\b
|
|
1349
1360
|
Examples:
|
|
1361
|
+
# Check only specific clouds - gs, s3.
|
|
1362
|
+
konduktor check gs
|
|
1363
|
+
konduktor check s3
|
|
1350
1364
|
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1365
|
+
\b
|
|
1366
|
+
Notes:
|
|
1367
|
+
• This checks storage credentials for a cloud supported by konduktor.
|
|
1368
|
+
If a cloud is detected to be inaccessible, the reason and correction
|
|
1369
|
+
steps will be shown.
|
|
1370
|
+
• If CLOUDS are specified, checks credentials for only those clouds.
|
|
1371
|
+
• The enabled clouds are cached and form the "search space" to
|
|
1372
|
+
be considered for each task.
|
|
1356
1373
|
"""
|
|
1357
1374
|
clouds_arg = clouds if len(clouds) > 0 else None
|
|
1358
1375
|
konduktor_check.check(clouds=clouds_arg)
|
|
@@ -1401,23 +1418,12 @@ def secret():
|
|
|
1401
1418
|
|
|
1402
1419
|
USAGE: konduktor secret COMMAND
|
|
1403
1420
|
|
|
1404
|
-
\b
|
|
1405
|
-
Use one of the following COMMANDS:
|
|
1406
|
-
create [FLAGS] [NAME]
|
|
1407
|
-
delete [NAME]
|
|
1408
|
-
list [FLAGS]
|
|
1409
|
-
|
|
1410
1421
|
\b
|
|
1411
1422
|
Examples:
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
\b
|
|
1418
|
-
For details on COMMAND ARGS:
|
|
1419
|
-
konduktor secret create -h
|
|
1420
|
-
konduktor secret list -h
|
|
1423
|
+
konduktor secret create --kind git-ssh --from-file ~/.ssh/id_rsa my-ssh-name
|
|
1424
|
+
konduktor secret create --kind env --inline FOO=bar my-env-name
|
|
1425
|
+
konduktor secret delete my-ssh-name
|
|
1426
|
+
konduktor secret list
|
|
1421
1427
|
"""
|
|
1422
1428
|
|
|
1423
1429
|
|
|
@@ -1608,8 +1614,7 @@ def delete(name):
|
|
|
1608
1614
|
help='Show all secrets, including those not owned by the current user.',
|
|
1609
1615
|
)
|
|
1610
1616
|
def list_secrets(all_users: bool):
|
|
1611
|
-
"""List secrets in the namespace.
|
|
1612
|
-
Defaults to only your secrets unless --all-users is set."""
|
|
1617
|
+
"""List secrets in the namespace."""
|
|
1613
1618
|
|
|
1614
1619
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1615
1620
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
@@ -1654,23 +1659,11 @@ def serve():
|
|
|
1654
1659
|
|
|
1655
1660
|
USAGE: konduktor serve COMMAND
|
|
1656
1661
|
|
|
1657
|
-
\b
|
|
1658
|
-
Use one of the following COMMANDS:
|
|
1659
|
-
launch
|
|
1660
|
-
down
|
|
1661
|
-
status
|
|
1662
|
-
|
|
1663
1662
|
\b
|
|
1664
1663
|
Examples:
|
|
1665
1664
|
konduktor serve launch my-deployment
|
|
1666
1665
|
konduktor serve down my-deployment
|
|
1667
1666
|
konduktor serve status
|
|
1668
|
-
|
|
1669
|
-
\b
|
|
1670
|
-
For details on COMMAND ARGS:
|
|
1671
|
-
konduktor serve launch -h
|
|
1672
|
-
konduktor serve down -h
|
|
1673
|
-
konduktor serve status -h
|
|
1674
1667
|
"""
|
|
1675
1668
|
pass
|
|
1676
1669
|
|
|
@@ -1745,8 +1738,10 @@ def serve_launch(
|
|
|
1745
1738
|
):
|
|
1746
1739
|
"""Launch a deployment to serve.
|
|
1747
1740
|
|
|
1748
|
-
|
|
1749
|
-
|
|
1741
|
+
\b
|
|
1742
|
+
Notes:
|
|
1743
|
+
• If ENTRYPOINT points to a valid YAML file, it is read in as the task
|
|
1744
|
+
specification. Otherwise, it is interpreted as a bash command.
|
|
1750
1745
|
"""
|
|
1751
1746
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1752
1747
|
env = _merge_env_vars(env_file, env)
|
|
@@ -1828,13 +1823,10 @@ def serve_down(
|
|
|
1828
1823
|
):
|
|
1829
1824
|
"""Tear down deployments (Deployment, Service, PodAutoscaler).
|
|
1830
1825
|
|
|
1831
|
-
Use --all or -a to tear down all deployments.
|
|
1832
|
-
|
|
1833
|
-
Examples:
|
|
1834
|
-
|
|
1835
1826
|
\b
|
|
1836
|
-
|
|
1837
|
-
|
|
1827
|
+
Examples:
|
|
1828
|
+
konduktor serve down my-deployment
|
|
1829
|
+
konduktor serve down -a
|
|
1838
1830
|
"""
|
|
1839
1831
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1840
1832
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
@@ -1861,7 +1853,7 @@ def serve_down(
|
|
|
1861
1853
|
)
|
|
1862
1854
|
else:
|
|
1863
1855
|
raise click.ClickException(
|
|
1864
|
-
'No deployments specified. Use --all to tear down all
|
|
1856
|
+
'No deployments specified. Use --all to tear down all deployments '
|
|
1865
1857
|
'or pass names/patterns.'
|
|
1866
1858
|
)
|
|
1867
1859
|
|
konduktor/utils/log_utils.py
CHANGED
|
@@ -365,6 +365,7 @@ def tail_vicky_logs(
|
|
|
365
365
|
worker_id: int = 0,
|
|
366
366
|
num_logs: int = -1,
|
|
367
367
|
follow: bool = True,
|
|
368
|
+
start_offset: str = '1h',
|
|
368
369
|
):
|
|
369
370
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
370
371
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
@@ -380,15 +381,16 @@ def tail_vicky_logs(
|
|
|
380
381
|
assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
|
|
381
382
|
query = {'limit': num_logs}
|
|
382
383
|
if follow:
|
|
384
|
+
effective_offset = start_offset or '1h'
|
|
383
385
|
logger.info(
|
|
384
|
-
'Tailing logs from
|
|
386
|
+
f'Tailing logs from {effective_offset} ago. '
|
|
385
387
|
'If logs come up empty, there might be logs just earlier '
|
|
386
|
-
'than
|
|
388
|
+
'than that window, check Grafana or use:\n'
|
|
387
389
|
f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
|
|
388
390
|
f'`konduktor logs --no-follow {job_name}`'
|
|
389
391
|
f'{colorama.Style.RESET_ALL}'
|
|
390
392
|
)
|
|
391
|
-
query['start_offset'] =
|
|
393
|
+
query['start_offset'] = effective_offset
|
|
392
394
|
query['query'] = (
|
|
393
395
|
f'k8s.namespace.name: "{namespace}" AND '
|
|
394
396
|
f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
|
|
@@ -453,12 +455,13 @@ def tail_logs(
|
|
|
453
455
|
worker_id: int = 0,
|
|
454
456
|
num_logs: int = 1000,
|
|
455
457
|
follow: bool = True,
|
|
458
|
+
start_offset: str = '1h',
|
|
456
459
|
):
|
|
457
460
|
logs_backend = config.get_nested(('logs', 'backend'), None)
|
|
458
461
|
if logs_backend == LogBackend.VICTORIA:
|
|
459
|
-
tail_vicky_logs(job_name, worker_id, num_logs, follow)
|
|
462
|
+
tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
|
|
460
463
|
elif logs_backend == LogBackend.LOKI:
|
|
461
464
|
tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
|
|
462
465
|
else:
|
|
463
466
|
logger.info('Defaulting to VictoriaLogs')
|
|
464
|
-
tail_vicky_logs(job_name, worker_id, num_logs, follow)
|
|
467
|
+
tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: konduktor-nightly
|
|
3
|
-
Version: 0.1.0.
|
|
3
|
+
Version: 0.1.0.dev20251215105431
|
|
4
4
|
Summary: GPU Cluster Health Management
|
|
5
5
|
Author: Andrew Aikawa
|
|
6
6
|
Author-email: asai@berkeley.edu
|
|
@@ -29,6 +29,7 @@ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
|
|
|
29
29
|
Requires-Dist: psutil (>=7.0.0,<8.0.0)
|
|
30
30
|
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
31
31
|
Requires-Dist: rich (>=13.9.4,<14.0.0)
|
|
32
|
+
Requires-Dist: sniffio (>=1.3,<2.0)
|
|
32
33
|
Requires-Dist: websockets (>=15.0.1,<16.0.0)
|
|
33
34
|
Description-Content-Type: text/markdown
|
|
34
35
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
konduktor/__init__.py,sha256=
|
|
1
|
+
konduktor/__init__.py,sha256=A8k1HK8UyBfw1hk53UvZzpp6khOyWkEDT4EAypu6Osc,1574
|
|
2
2
|
konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
|
|
4
4
|
konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
|
|
@@ -11,9 +11,9 @@ konduktor/backends/deployment.py,sha256=d0a3F7dxDbnRKIt4ZO_kQ0_vet0pZvg4bWYzVZ8D
|
|
|
11
11
|
konduktor/backends/deployment_utils.py,sha256=9CmB9CYC_3wxIfIOmTSCN2hbURZ5MpEMTvPwYMUXBRM,49272
|
|
12
12
|
konduktor/backends/jobset.py,sha256=drt8Gc0iYQx18JWXBU6XfhUvC2xCKd8szSJ2JC4O20Q,8640
|
|
13
13
|
konduktor/backends/jobset_utils.py,sha256=g49NY8RFhL_NNd4c1adRLG_Bq3UTFtRURxcAzxnMEYw,26524
|
|
14
|
-
konduktor/backends/pod_utils.py,sha256=
|
|
14
|
+
konduktor/backends/pod_utils.py,sha256=WL6b9_yBqBHjX84hE57uMTYL-rbGo_Ugf2L7-_8NpDc,19422
|
|
15
15
|
konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
|
|
16
|
-
konduktor/cli.py,sha256=
|
|
16
|
+
konduktor/cli.py,sha256=ORnFQub6aSGeZytETn39Dafl1gjH-yihP2r5FnF3EeQ,59591
|
|
17
17
|
konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
|
|
18
18
|
konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
|
|
19
19
|
konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -93,15 +93,15 @@ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4x
|
|
|
93
93
|
konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
|
|
94
94
|
konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
|
|
95
95
|
konduktor/utils/kubernetes_utils.py,sha256=XleYxzG64hciZb-CjzBDjX8BOMhFATIIHZlXD2bqN0Q,27186
|
|
96
|
-
konduktor/utils/log_utils.py,sha256=
|
|
96
|
+
konduktor/utils/log_utils.py,sha256=EPDDNu7WxfbP0T5WBHRNA1Pplr7mIeqFIQf9JTOQlss,18340
|
|
97
97
|
konduktor/utils/loki_utils.py,sha256=eOGiD7dZNuwzmyXKiifyqz00EVh2nwcUPFSiPkac9y0,4050
|
|
98
98
|
konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
|
|
99
99
|
konduktor/utils/schemas.py,sha256=cr39nEAgjluhXoUYnvIwCwLBH8rLds37MBsF1uQv1rw,19067
|
|
100
100
|
konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
|
|
101
101
|
konduktor/utils/ux_utils.py,sha256=LSH4b5lckD157qDF4keThxtkGdxNrAfGKmH1ewhZkm4,8646
|
|
102
102
|
konduktor/utils/validator.py,sha256=UcLvZCk9Cpbbhw8r_ZJtTpMSTfY1NKqcyciKsPzRPZM,17222
|
|
103
|
-
konduktor_nightly-0.1.0.
|
|
104
|
-
konduktor_nightly-0.1.0.
|
|
105
|
-
konduktor_nightly-0.1.0.
|
|
106
|
-
konduktor_nightly-0.1.0.
|
|
107
|
-
konduktor_nightly-0.1.0.
|
|
103
|
+
konduktor_nightly-0.1.0.dev20251215105431.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
|
|
104
|
+
konduktor_nightly-0.1.0.dev20251215105431.dist-info/METADATA,sha256=WWbQsIiHdKINH1YFtDbNxi8ZW3K_G9P7FyasPy3IWdM,4283
|
|
105
|
+
konduktor_nightly-0.1.0.dev20251215105431.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
106
|
+
konduktor_nightly-0.1.0.dev20251215105431.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
|
|
107
|
+
konduktor_nightly-0.1.0.dev20251215105431.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|