konduktor-nightly 0.1.0.dev20251107104752__py3-none-any.whl → 0.1.0.dev20251215105431__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '5ceef9b8f579ac23f7a2bd863820aaa2341055e3'
14
+ _KONDUKTOR_COMMIT_SHA = '421390595e3a1b9f263e790323deae61d94da231'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20251107104752'
48
+ __version__ = '1.0.0.dev0.1.0.dev20251215105431'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -28,6 +28,8 @@ if typing.TYPE_CHECKING:
28
28
  logger = logging.get_logger(__name__)
29
29
 
30
30
  _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
31
+ # Use a large default (7 days) to mimic "infinite" runtime.
32
+ _DEFAULT_MAX_RUN_DURATION_SECONDS = 604800
31
33
 
32
34
 
33
35
  def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
@@ -471,18 +473,21 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
471
473
  jobset_spec: The JobSet spec dictionary to modify
472
474
  task: The task object containing resource information
473
475
  """
474
- # Add max run duration annotation
475
- assert task.resources is not None and task.resources.labels is not None
476
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
477
- if not maxRunDurationSeconds:
478
- raise ValueError('maxRunDurationSeconds is required')
479
- jobset_spec['jobset']['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = (
480
- str(maxRunDurationSeconds)
476
+ assert task.resources is not None, 'Task resources are required'
477
+ labels = task.resources.labels or {}
478
+
479
+ # Add max run duration annotation, defaulting to a practically infinite value.
480
+ maxRunDurationSeconds = labels.get('maxRunDurationSeconds')
481
+ metadata = jobset_spec['jobset']['metadata']
482
+ metadata.setdefault('annotations', {})[_RUN_DURATION_ANNOTATION_KEY] = str(
483
+ maxRunDurationSeconds
484
+ if maxRunDurationSeconds is not None
485
+ else _DEFAULT_MAX_RUN_DURATION_SECONDS
481
486
  )
482
487
 
483
488
  # Inject resource labels into JobSet metadata.
484
- if task.resources and task.resources.labels:
485
- jobset_spec['jobset']['metadata']['labels'].update(task.resources.labels)
489
+ if labels:
490
+ jobset_spec['jobset']['metadata']['labels'].update(labels)
486
491
 
487
492
 
488
493
  def merge_pod_into_jobset_template(
konduktor/cli.py CHANGED
@@ -34,6 +34,7 @@ listed in "konduktor --help". Take care to put logically connected commands clo
34
34
  each other.
35
35
  """
36
36
 
37
+ import difflib
37
38
  import fnmatch
38
39
  import os
39
40
  import pathlib
@@ -273,22 +274,20 @@ _TASK_OPTIONS = [
273
274
  '--env-file',
274
275
  required=False,
275
276
  type=dotenv.dotenv_values,
276
- help="""\
277
- Path to a dotenv file with environment variables to set on the remote
278
- node.
279
-
280
- If any values from ``--env-file`` conflict with values set by
281
- ``--env``, the ``--env`` value will be preferred.""",
277
+ help=(
278
+ 'Path to a dotenv file with environment variables to set on the '
279
+ 'remote node. If any values from ``--env-file`` conflict '
280
+ 'with values set by ``--env``, the ``--env`` value will '
281
+ 'be preferred.'
282
+ ),
282
283
  ),
283
284
  click.option(
284
285
  '--env',
285
286
  required=False,
286
287
  type=_parse_env_var,
287
288
  multiple=True,
288
- help="""\
289
- Environment variable to set on the remote node.
290
- It can be specified multiple times.
291
- Examples:
289
+ help="""\\
290
+ Environment variable to set on the remote node. It can be specified multiple times:
292
291
 
293
292
  \b
294
293
  1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.
@@ -298,7 +297,7 @@ _TASK_OPTIONS = [
298
297
  is run.
299
298
 
300
299
  3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
301
- same value of ``$MY_ENV3`` in the local environment.""",
300
+ same value of ``$MY_ENV3`` in the local environment.""", # noqa: E501,
302
301
  ),
303
302
  ]
304
303
  _TASK_OPTIONS_WITH_NAME = [
@@ -320,10 +319,10 @@ _EXTRA_RESOURCES_OPTIONS = [
320
319
  type=str,
321
320
  help=(
322
321
  'Type and number of GPUs to use. Example values: '
323
- '"V100:8", "V100" (short for a count of 1)'
322
+ '"V100:8", "V100" (short for a count of 1) '
324
323
  'If a new cluster is being launched by this command, this is the '
325
- 'resources to provision. If an existing cluster is being reused, this'
326
- " is seen as the task demand, which must fit the cluster's total "
324
+ 'resources to provision. If an existing cluster is being reused, this '
325
+ "is seen as the task demand, which must fit the cluster's total "
327
326
  'resources and is used for scheduling the task. '
328
327
  'Overrides the "accelerators" '
329
328
  'config in the YAML if both are supplied. '
@@ -624,7 +623,7 @@ def cli():
624
623
  default=False,
625
624
  is_flag=True,
626
625
  required=False,
627
- help='Show all clusters, including those not owned by the ' 'current user.',
626
+ help='Show all jobs, including those not owned by the current user.',
628
627
  )
629
628
  @click.option(
630
629
  '--limit',
@@ -660,19 +659,19 @@ def status(
660
659
 
661
660
  \b
662
661
  Examples:
663
- konduktor status --limit 10
664
- konduktor status --before "08/06/25 03:53PM"
665
- konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
662
+ konduktor status --limit 10
663
+ konduktor status --before "08/06/25 03:53PM"
664
+ konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
666
665
 
667
666
  \b
668
667
  Notes:
669
- • When using --before or --after timestamps, "08/06/25"
670
- is equivalent to "08/06/25 00:00".
671
- • "03:53PM" is equivalent to "03:53:00PM".
672
- • Timestamps shown in "konduktor status" are truncated
673
- and are in the local timezone.
674
- Example: "03:53:55PM" → "03:53PM" would show up in
675
- --after "03:53PM" but not in --before "03:53PM".
668
+ • When using --before or --after timestamps, "08/06/25" is
669
+ equivalent to "08/06/25 00:00".
670
+ • "03:53PM" is equivalent to "03:53:00PM".
671
+ • Timestamps shown in "konduktor status" are truncated and are in
672
+ the local timezone.
673
+ Example: "03:53:55PM" → "03:53PM" would show up in --after "03:53PM"
674
+ but not in --before "03:53PM".
676
675
  """
677
676
  context = kubernetes_utils.get_current_kube_config_context_name()
678
677
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -690,8 +689,8 @@ def status(
690
689
  is_flag=True,
691
690
  default=False,
692
691
  help=(
693
- 'If specified, do not show logs but exit with a status code for the '
694
- "job's status: 0 for succeeded, or 1 for all other statuses."
692
+ '[DEPRECATED] If specified, do not show logs but exit with a status code '
693
+ "for the job's status: 0 for succeeded, or 1 for all other statuses."
695
694
  ),
696
695
  )
697
696
  @click.option(
@@ -701,12 +700,13 @@ def status(
701
700
  help=(
702
701
  'Follow the logs of a job. '
703
702
  'If --no-follow is specified, print the log so far and exit. '
704
- '[default: --follow]'
703
+ '(default: --follow)'
705
704
  ),
706
705
  )
707
706
  @click.option(
708
707
  '--num-lines',
709
- '--num_lines' '-n',
708
+ '--num_lines',
709
+ '-n',
710
710
  default=-1,
711
711
  type=int,
712
712
  help=(
@@ -722,6 +722,19 @@ def status(
722
722
  type=int,
723
723
  help='The node rank to tail logs from.',
724
724
  )
725
+ @click.option(
726
+ '--start-offset',
727
+ '--start_offset',
728
+ type=str,
729
+ required=False,
730
+ default='1h',
731
+ help=(
732
+ 'Choose how much time from now to look back in logs. '
733
+ 'Examples: 30s, 5m, 2h, 1d. Default is 1h. '
734
+ 'Note: currently only applies when streaming (default --follow). '
735
+ 'With --no-follow, all available logs are returned.'
736
+ ),
737
+ )
725
738
  @click.argument('job_id', type=str, nargs=1)
726
739
  # TODO(zhwu): support logs by job name
727
740
  def logs(
@@ -730,11 +743,12 @@ def logs(
730
743
  follow: bool,
731
744
  num_lines: int,
732
745
  node_rank: int,
746
+ start_offset: str,
733
747
  ):
734
748
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
735
749
  """Retrieve/tail the log of a job."""
736
750
  if status:
737
- raise click.UsageError('`--status` is being deprecated)')
751
+ raise click.UsageError('`--status` is being deprecated')
738
752
 
739
753
  # Check if the job exists
740
754
  if not job_id:
@@ -748,20 +762,41 @@ def logs(
748
762
  try:
749
763
  _ = jobset_utils.get_jobset(namespace, job_id)
750
764
  except jobset_utils.JobNotFoundError:
751
- click.secho(
765
+ message = (
752
766
  f"Job '{job_id}' not found in namespace '{namespace}'. "
753
- f'Job may have been `konduktor down`. '
767
+ f'This may be due to a typo, `konduktor down`, or garbage collected. '
754
768
  f'Check your jobs with '
755
769
  f'{colorama.Style.BRIGHT}`konduktor status`'
756
- f'{colorama.Style.RESET_ALL}.',
757
- fg='yellow',
770
+ f'{colorama.Style.RESET_ALL}.'
758
771
  )
759
772
 
773
+ # Try to find near string matches to help with typos.
774
+ try:
775
+ job_specs = jobset_utils.list_jobset(namespace)
776
+ job_names = [
777
+ item['metadata']['name'] for item in (job_specs or {}).get('items', [])
778
+ ]
779
+ close_matches = difflib.get_close_matches(
780
+ job_id, job_names, n=3, cutoff=0.4
781
+ )
782
+ except Exception:
783
+ close_matches = []
784
+
785
+ if close_matches:
786
+ suggestions = ', '.join(
787
+ f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{name}{colorama.Style.NORMAL}'
788
+ for name in close_matches
789
+ )
790
+ message += f'{colorama.Fore.YELLOW} Did you mean: {suggestions}?'
791
+
792
+ click.secho(message, fg='yellow')
793
+
760
794
  log_utils.tail_logs(
761
795
  job_id,
762
796
  worker_id=node_rank,
763
797
  follow=follow,
764
798
  num_logs=num_lines,
799
+ start_offset=start_offset,
765
800
  )
766
801
 
767
802
 
@@ -829,8 +864,10 @@ def launch(
829
864
  ):
830
865
  """Launch a task.
831
866
 
832
- If ENTRYPOINT points to a valid YAML file, it is read in as the task
833
- specification. Otherwise, it is interpreted as a bash command.
867
+ \b
868
+ Notes:
869
+ • If ENTRYPOINT points to a valid YAML file, it is read in as the task
870
+ specification. Otherwise, it is interpreted as a bash command.
834
871
  """
835
872
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
836
873
  env = _merge_env_vars(env_file, env)
@@ -902,96 +939,50 @@ def launch(
902
939
  )
903
940
 
904
941
 
905
- @cli.command(cls=_DocumentedCodeCommand)
906
- @click.argument(
907
- 'jobs',
908
- nargs=-1,
909
- required=False,
910
- )
911
- @click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
912
- @click.option(
913
- '--all-users',
914
- '--all_users',
915
- default=False,
916
- is_flag=True,
917
- help='Include other users for teardown',
918
- )
919
- @click.option(
920
- '--yes',
921
- '-y',
922
- is_flag=True,
923
- default=False,
924
- required=False,
925
- help='Skip confirmation prompt.',
926
- )
927
- def down(
942
+ def _find_matching_jobs(
928
943
  jobs: List[str],
929
- all: Optional[bool],
944
+ jobs_response: Dict[str, Any],
945
+ namespace: str,
930
946
  all_users: Optional[bool],
931
- yes: bool,
947
+ all_flag: Optional[bool] = None,
932
948
  ):
933
- # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
934
- """Tear down job(s).
935
-
936
- JOB is the name of the job to tear down. If both
937
- JOB and ``--all`` are supplied, the latter takes precedence.
938
-
939
- Tearing down a job will delete all associated containers (all billing
940
- stops), and any data on the containers disks will be lost. Accelerators
941
- (e.g., GPUs) that are part of the job will be deleted too.
942
-
943
- Wildcard patterns are supported using * characters.
944
- Examples: "test-*" matches all jobs starting with "test-",
945
- "*-gpu" matches all jobs ending with "-gpu".
946
-
947
- Examples:
948
-
949
- .. code-block:: bash
950
-
951
- # Tear down a specific job.
952
- konduktor down cluster_name
953
- \b
954
- # Tear down multiple jobs.
955
- konduktor down job1 job2
956
- \b
957
- # Tear down all jobs matching a pattern.
958
- konduktor down "test-*"
959
- \b
960
- # Tear down all of this users jobs.
961
- konduktor down -a
962
- konduktor down --all
963
-
964
- # Tear down all jobs across all users
965
- konduktor down --all --all-users
949
+ """
950
+ Find all jobs matching against the user specified pattern.
951
+ In use in `konduktor down` and `konduktor stop`
966
952
 
953
+ Note(asaiacai): `jobs_response` should be the list of
954
+ all jobsets in this namespace, not necessarily belonging
955
+ to this user.
967
956
  """
968
957
 
969
- context = kubernetes_utils.get_current_kube_config_context_name()
970
- namespace = kubernetes_utils.get_kube_config_context_namespace(context)
971
- jobs_response = jobset_utils.list_jobset(namespace)
972
- assert jobs_response
973
- jobs_specs = [
974
- job
975
- for job in jobs_response['items']
976
- if (
977
- job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
978
- == common_utils.user_and_hostname_hash()
979
- and not all_users
980
- )
981
- ]
958
+ jobs_specs = [job for job in jobs_response['items']]
982
959
 
983
- if all:
960
+ if all_flag:
984
961
  assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
985
- if len(jobs_specs) == 0:
986
- click.secho(f'No jobs found in namespace {namespace}', fg='yellow')
987
- return
988
- jobs = [job['metadata']['name'] for job in jobs_specs]
962
+ assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
963
+ if all_users:
964
+ # --all with --all-users = all jobs of all users
965
+ jobs = [job['metadata']['name'] for job in jobs_specs]
966
+ else:
967
+ # --all without --all-users = all jobs of current user
968
+ jobs = [
969
+ job['metadata']['name']
970
+ for job in jobs_specs
971
+ if job['metadata']['labels'][backend_constants.USER_LABEL]
972
+ == common_utils.get_cleaned_username()
973
+ ]
974
+ return jobs
989
975
  elif jobs:
990
976
  # Get all available jobs to match against patterns
991
977
  if len(jobs_specs) == 0:
992
978
  raise click.ClickException(f'No jobs found in namespace {namespace}')
993
979
 
994
- all_job_names = [job['metadata']['name'] for job in jobs_specs]
980
+ all_job_names = {
981
+ job['metadata']['name']: job['metadata']['labels'][
982
+ backend_constants.USER_LABEL
983
+ ]
984
+ for job in jobs_specs
985
+ }
995
986
  matched_jobs = []
996
987
 
997
988
  for job_pattern in jobs:
@@ -1003,6 +994,30 @@ def down(
1003
994
  fg='yellow',
1004
995
  err=True,
1005
996
  )
997
+ for matched_name in pattern_matches:
998
+ if all_job_names[matched_name] != common_utils.get_cleaned_username():
999
+ warning_label = (
1000
+ f'{colorama.Style.BRIGHT}{colorama.Fore.RED}Warning'
1001
+ f'{colorama.Style.RESET_ALL}'
1002
+ )
1003
+ job_name = (
1004
+ f'{colorama.Style.BRIGHT}{colorama.Fore.WHITE}{matched_name}'
1005
+ f'{colorama.Style.RESET_ALL}'
1006
+ )
1007
+ launched_user = (
1008
+ f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
1009
+ f'{all_job_names[matched_name]}{colorama.Style.RESET_ALL}'
1010
+ )
1011
+ current_user = (
1012
+ f'{colorama.Style.BRIGHT}{colorama.Fore.GREEN}'
1013
+ f'{common_utils.get_cleaned_username()}'
1014
+ f'{colorama.Style.RESET_ALL}'
1015
+ )
1016
+ logger.info(
1017
+ f'{warning_label}: job {job_name} was launched by '
1018
+ f'{launched_user}, while the current user is {current_user}',
1019
+ )
1020
+
1006
1021
  matched_jobs.extend(pattern_matches)
1007
1022
 
1008
1023
  # Remove duplicates while preserving order
@@ -1020,22 +1035,92 @@ def down(
1020
1035
  )
1021
1036
  else:
1022
1037
  raise click.ClickException(
1023
- 'No jobs specified. Use --all to tear down '
1024
- 'all jobs or specify job names/patterns.'
1038
+ 'No jobs specified. Use --all to specify '
1039
+ 'all jobs belonging to a user '
1040
+ 'or specify job names/patterns.'
1025
1041
  )
1042
+ return jobs
1043
+
1044
+
1045
+ @cli.command(cls=_DocumentedCodeCommand)
1046
+ @click.argument(
1047
+ 'jobs',
1048
+ nargs=-1,
1049
+ required=False,
1050
+ )
1051
+ @click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
1052
+ @click.option(
1053
+ '--all-users',
1054
+ '--all_users',
1055
+ default=False,
1056
+ is_flag=True,
1057
+ help='Include other users for teardown',
1058
+ )
1059
+ @click.option(
1060
+ '--yes',
1061
+ '-y',
1062
+ is_flag=True,
1063
+ default=False,
1064
+ required=False,
1065
+ help='Skip confirmation prompt.',
1066
+ )
1067
+ def down(
1068
+ jobs: List[str],
1069
+ all: Optional[bool],
1070
+ all_users: Optional[bool],
1071
+ yes: bool,
1072
+ ):
1073
+ # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1074
+ """Tear down job(s).
1075
+
1076
+ \b
1077
+ Examples:
1078
+ # Tear down a specific job.
1079
+ konduktor down my_job
1080
+ \b
1081
+ # Tear down multiple jobs.
1082
+ konduktor down my_job1 my_job2
1083
+ \b
1084
+ # Tear down all jobs matching a pattern.
1085
+ konduktor down "my_job-*"
1086
+ \b
1087
+ # Tear down all of this users jobs.
1088
+ konduktor down -a
1089
+ konduktor down --all
1090
+ \b
1091
+ # Tear down all jobs across all users
1092
+ konduktor down --all --all-users
1093
+
1094
+ \b
1095
+ Notes:
1096
+ • If both JOB and ``--all`` are supplied, the latter takes precedence.
1097
+ • Tearing down a job will delete all associated containers (all billing
1098
+ stops), and any data on the containers disks will be lost. Accelerators
1099
+ (e.g., GPUs) that are part of the job will be deleted too.
1100
+ • Wildcard patterns are supported using * characters.
1101
+ Ex: "test-*" matches all jobs starting with "test-",
1102
+ "*-gpu" matches all jobs ending with "-gpu".
1103
+
1104
+ """
1105
+
1106
+ context = kubernetes_utils.get_current_kube_config_context_name()
1107
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1108
+ jobs_response = jobset_utils.list_jobset(namespace)
1109
+ assert jobs_response
1110
+ filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
1026
1111
 
1027
1112
  if not yes:
1028
1113
  # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
1029
1114
  # it exists but is STOPPED.
1030
1115
  prompt = (
1031
1116
  f'Tearing down job(s) {colorama.Style.BRIGHT} '
1032
- f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
1117
+ f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
1033
1118
  'Proceed?'
1034
1119
  )
1035
1120
  if prompt is not None:
1036
1121
  click.confirm(prompt, default=True, abort=True, show_default=True)
1037
1122
 
1038
- for job in track(jobs, description='Tearing down job(s)...'):
1123
+ for job in track(filtered_jobs, description='Tearing down job(s)...'):
1039
1124
  jobset_utils.delete_jobset(namespace, job)
1040
1125
 
1041
1126
 
@@ -1069,112 +1154,59 @@ def stop(
1069
1154
  ):
1070
1155
  """Suspend job(s) (manual/user-initiated).
1071
1156
 
1072
- JOB is the name of the job to suspend. If both
1073
- JOB and ``--all`` are supplied, the latter takes precedence.
1074
-
1075
- Suspending a job will pause execution and mark the job as SUSPENDED (by user).
1076
- The job can be resumed later with `konduktor start`.
1077
-
1078
- If a job is suspended by the system (e.g., due to queueing),
1079
- it will show as SUSPENDED (by system).
1080
-
1081
- Wildcard patterns are supported using * characters.
1082
- Examples: "my_job-*" matches all jobs starting with "my_job-",
1083
- "*-gpu" matches all jobs ending with "-gpu".
1084
-
1157
+ \b
1085
1158
  Examples:
1159
+ # Suspend a specific job.
1160
+ konduktor stop my_job
1161
+ \b
1162
+ # Suspend multiple jobs.
1163
+ konduktor stop my_job1 my_job2
1164
+ \b
1165
+ # Suspend all jobs matching a pattern.
1166
+ konduktor stop "my_job-*"
1167
+ \b
1168
+ # Suspend all of this users jobs.
1169
+ konduktor stop -a
1170
+ konduktor stop --all
1171
+ \b
1172
+ # Suspend all jobs across all users
1173
+ konduktor stop --all --all-users
1086
1174
 
1087
- .. code-block:: bash
1088
-
1089
- # Suspend a specific job.
1090
- konduktor stop my_job
1091
- \b
1092
- # Suspend multiple jobs.
1093
- konduktor stop my_job1 my_job2
1094
- \b
1095
- # Suspend all jobs matching a pattern.
1096
- konduktor stop "my_job-*"
1097
- \b
1098
- # Suspend all of this users jobs.
1099
- konduktor stop -a
1100
- konduktor stop --all
1101
-
1102
- # Suspend all jobs across all users
1103
- konduktor stop --all --all-users
1104
-
1175
+ \b
1176
+ Notes:
1177
+ If both JOB and ``--all`` are supplied, the latter takes precedence.
1178
+ Suspending a job will pause execution and mark the job as SUSPENDED (by user).
1179
+ The job can be resumed later with `konduktor start`.
1180
+ If a job is suspended by the system (e.g., due to queueing), it
1181
+ will show as SUSPENDED (by system).
1182
+ • Wildcard patterns are supported using * characters.
1183
+ Ex: "test-*" matches all jobs starting with "test-",
1184
+ "*-gpu" matches all jobs ending with "-gpu".
1105
1185
  """
1106
1186
 
1107
1187
  context = kubernetes_utils.get_current_kube_config_context_name()
1108
1188
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1109
1189
  jobs_response = jobset_utils.list_jobset(namespace)
1110
1190
  assert jobs_response
1111
- jobs_specs = [
1112
- job
1113
- for job in jobs_response['items']
1114
- if (
1115
- job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
1116
- == common_utils.user_and_hostname_hash()
1117
- and not all_users
1118
- )
1119
- ]
1120
-
1121
- if all:
1122
- assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
1123
- assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
1124
- jobs = [job['metadata']['name'] for job in jobs_specs]
1125
- elif jobs:
1126
- # Get all available jobs to match against patterns
1127
- if len(jobs_specs) == 0:
1128
- raise click.ClickException(f'No jobs found in namespace {namespace}')
1129
-
1130
- all_job_names = [job['metadata']['name'] for job in jobs_specs]
1131
- matched_jobs = []
1132
-
1133
- for job_pattern in jobs:
1134
- # Use fnmatch for both wildcard and exact pattern matching
1135
- pattern_matches = fnmatch.filter(all_job_names, job_pattern)
1136
- if not pattern_matches:
1137
- click.secho(
1138
- f'Warning: No jobs found matching pattern "{job_pattern}"',
1139
- fg='yellow',
1140
- err=True,
1141
- )
1142
- matched_jobs.extend(pattern_matches)
1143
-
1144
- # Remove duplicates while preserving order
1145
- seen = set()
1146
- jobs = []
1147
- for job in matched_jobs:
1148
- if job not in seen:
1149
- seen.add(job)
1150
- jobs.append(job)
1151
-
1152
- if not jobs:
1153
- raise click.ClickException(
1154
- f'No matching jobs found check status with '
1155
- f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
1156
- )
1157
- else:
1158
- raise click.ClickException(
1159
- 'No jobs specified. Use --all to suspend '
1160
- 'all jobs or specify job names/patterns.'
1161
- )
1191
+ filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
1162
1192
 
1163
1193
  if not yes:
1164
1194
  # Prompt for confirmation
1165
1195
  prompt = (
1166
1196
  f'Suspending job(s) {colorama.Style.BRIGHT} '
1167
- f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
1197
+ f'{colorama.Fore.GREEN}{filtered_jobs}{colorama.Style.RESET_ALL}. '
1168
1198
  'Proceed?'
1169
1199
  )
1170
1200
  if prompt is not None:
1171
1201
  click.confirm(prompt, default=True, abort=True, show_default=True)
1172
1202
 
1173
- for job in track(jobs, description='Suspending job(s)...'):
1203
+ for job in track(filtered_jobs, description='Suspending job(s)...'):
1174
1204
  jobset_utils.stop_jobset(namespace, job)
1175
1205
 
1176
1206
  click.secho(
1177
- ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB_STOP, jobs),
1207
+ ux_utils.command_hint_messages(
1208
+ ux_utils.CommandHintType.JOB_STOP, filtered_jobs
1209
+ ),
1178
1210
  fg='green',
1179
1211
  bold=True,
1180
1212
  )
@@ -1212,54 +1244,41 @@ def start(
1212
1244
  ):
1213
1245
  """Resume suspended job(s) (manual/user-initiated).
1214
1246
 
1215
- JOB is the name of the job to resume. If both
1216
- JOB and ``--all`` are supplied, the latter takes precedence.
1217
-
1218
- Resuming a job will restart execution from where it was suspended.
1219
- Only suspended jobs can be resumed.
1220
-
1221
- This command works for both manually suspended jobs (SUSPENDED by user)
1222
- and system-suspended jobs (SUSPENDED by system).
1223
-
1224
- Wildcard patterns are supported using * characters.
1225
- Examples: "my_job-*" matches all jobs starting with "my_job-",
1226
- "*-gpu" matches all jobs ending with "-gpu".
1227
-
1247
+ \b
1228
1248
  Examples:
1249
+ # Resume a specific job.
1250
+ konduktor start my_job
1251
+ \b
1252
+ # Resume multiple jobs.
1253
+ konduktor start my_job1 my_job2
1254
+ \b
1255
+ # Resume all jobs matching a pattern.
1256
+ konduktor start "my_job-*"
1257
+ \b
1258
+ # Resume all of this users suspended jobs.
1259
+ konduktor start -a
1260
+ konduktor start --all
1261
+ \b
1262
+ # Resume all suspended jobs across all users
1263
+ konduktor start --all --all-users
1229
1264
 
1230
- .. code-block:: bash
1231
-
1232
- # Resume a specific job.
1233
- konduktor start my_job
1234
- \b
1235
- # Resume multiple jobs.
1236
- konduktor start my_job1 my_job2
1237
- \b
1238
- # Resume all jobs matching a pattern.
1239
- konduktor start "my_job-*"
1240
- \b
1241
- # Resume all of this users suspended jobs.
1242
- konduktor start -a
1243
- konduktor start --all
1244
-
1245
- # Resume all suspended jobs across all users
1246
- konduktor start --all --all-users
1247
-
1265
+ \b
1266
+ Notes:
1267
+ If both JOB and ``--all`` are supplied, the latter takes precedence.
1268
+ Resuming a job will restart execution from where it was suspended.
1269
+ Only suspended jobs can be resumed.
1270
+ This command works for both manually suspended jobs (SUSPENDED by user)
1271
+ and system-suspended jobs (SUSPENDED by system).
1272
+ • Wildcard patterns are supported using * characters.
1273
+ Ex: "test-*" matches all jobs starting with "test-",
1274
+ "*-gpu" matches all jobs ending with "-gpu".
1248
1275
  """
1249
1276
 
1250
1277
  context = kubernetes_utils.get_current_kube_config_context_name()
1251
1278
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1252
1279
  jobs_response = jobset_utils.list_jobset(namespace)
1253
1280
  assert jobs_response
1254
- jobs_specs = [
1255
- job
1256
- for job in jobs_response['items']
1257
- if (
1258
- job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
1259
- == common_utils.user_and_hostname_hash()
1260
- and not all_users
1261
- )
1262
- ]
1281
+ jobs_specs = [job for job in jobs_response['items']]
1263
1282
 
1264
1283
  if all:
1265
1284
  # Only get suspended jobs when using --all
@@ -1335,24 +1354,22 @@ def start(
1335
1354
  nargs=-1,
1336
1355
  )
1337
1356
  def check(clouds: Tuple[str]):
1338
- """Check which clouds are available to use for storage
1339
-
1340
- This checks storage credentials for a cloud supported by konduktor. If a
1341
- cloud is detected to be inaccessible, the reason and correction steps will
1342
- be shown.
1343
-
1344
- If CLOUDS are specified, checks credentials for only those clouds.
1345
-
1346
- The enabled clouds are cached and form the "search space" to be considered
1347
- for each task.
1357
+ """Check which clouds are available to use for storage with Konduktor
1348
1358
 
1359
+ \b
1349
1360
  Examples:
1361
+ # Check only specific clouds - gs, s3.
1362
+ konduktor check gs
1363
+ konduktor check s3
1350
1364
 
1351
- .. code-block:: bash
1352
-
1353
- # Check only specific clouds - gs, s3.
1354
- konduktor check gs
1355
- konduktor check s3
1365
+ \b
1366
+ Notes:
1367
+ This checks storage credentials for a cloud supported by konduktor.
1368
+ If a cloud is detected to be inaccessible, the reason and correction
1369
+ steps will be shown.
1370
+ • If CLOUDS are specified, checks credentials for only those clouds.
1371
+ • The enabled clouds are cached and form the "search space" to
1372
+ be considered for each task.
1356
1373
  """
1357
1374
  clouds_arg = clouds if len(clouds) > 0 else None
1358
1375
  konduktor_check.check(clouds=clouds_arg)
@@ -1401,23 +1418,12 @@ def secret():
1401
1418
 
1402
1419
  USAGE: konduktor secret COMMAND
1403
1420
 
1404
- \b
1405
- Use one of the following COMMANDS:
1406
- create [FLAGS] [NAME]
1407
- delete [NAME]
1408
- list [FLAGS]
1409
-
1410
1421
  \b
1411
1422
  Examples:
1412
- konduktor secret create --kind git-ssh --from-file=~/.ssh/id_rsa my-ssh-name
1413
- konduktor secret create --kind env --inline FOO=bar my-env-name
1414
- konduktor delete my-ssh-name
1415
- konduktor secret list
1416
-
1417
- \b
1418
- For details on COMMAND ARGS:
1419
- konduktor secret create -h
1420
- konduktor secret list -h
1423
+ konduktor secret create --kind git-ssh --from-file ~/.ssh/id_rsa my-ssh-name
1424
+ konduktor secret create --kind env --inline FOO=bar my-env-name
1425
+ konduktor secret delete my-ssh-name
1426
+ konduktor secret list
1421
1427
  """
1422
1428
 
1423
1429
 
@@ -1608,8 +1614,7 @@ def delete(name):
1608
1614
  help='Show all secrets, including those not owned by the current user.',
1609
1615
  )
1610
1616
  def list_secrets(all_users: bool):
1611
- """List secrets in the namespace.
1612
- Defaults to only your secrets unless --all-users is set."""
1617
+ """List secrets in the namespace."""
1613
1618
 
1614
1619
  context = kubernetes_utils.get_current_kube_config_context_name()
1615
1620
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -1654,23 +1659,11 @@ def serve():
1654
1659
 
1655
1660
  USAGE: konduktor serve COMMAND
1656
1661
 
1657
- \b
1658
- Use one of the following COMMANDS:
1659
- launch
1660
- down
1661
- status
1662
-
1663
1662
  \b
1664
1663
  Examples:
1665
1664
  konduktor serve launch my-deployment
1666
1665
  konduktor serve down my-deployment
1667
1666
  konduktor serve status
1668
-
1669
- \b
1670
- For details on COMMAND ARGS:
1671
- konduktor serve launch -h
1672
- konduktor serve down -h
1673
- konduktor serve status -h
1674
1667
  """
1675
1668
  pass
1676
1669
 
@@ -1745,8 +1738,10 @@ def serve_launch(
1745
1738
  ):
1746
1739
  """Launch a deployment to serve.
1747
1740
 
1748
- If ENTRYPOINT points to a valid YAML file, it is read in as the task
1749
- specification. Otherwise, it is interpreted as a bash command.
1741
+ \b
1742
+ Notes:
1743
+ • If ENTRYPOINT points to a valid YAML file, it is read in as the task
1744
+ specification. Otherwise, it is interpreted as a bash command.
1750
1745
  """
1751
1746
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1752
1747
  env = _merge_env_vars(env_file, env)
@@ -1828,13 +1823,10 @@ def serve_down(
1828
1823
  ):
1829
1824
  """Tear down deployments (Deployment, Service, PodAutoscaler).
1830
1825
 
1831
- Use --all or -a to tear down all deployments.
1832
-
1833
- Examples:
1834
-
1835
1826
  \b
1836
- konduktor serve down my-deployment
1837
- konduktor serve down -a
1827
+ Examples:
1828
+ konduktor serve down my-deployment
1829
+ konduktor serve down -a
1838
1830
  """
1839
1831
  context = kubernetes_utils.get_current_kube_config_context_name()
1840
1832
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -1861,7 +1853,7 @@ def serve_down(
1861
1853
  )
1862
1854
  else:
1863
1855
  raise click.ClickException(
1864
- 'No deployments specified. Use --all to tear down all deplotments '
1856
+ 'No deployments specified. Use --all to tear down all deployments '
1865
1857
  'or pass names/patterns.'
1866
1858
  )
1867
1859
 
@@ -365,6 +365,7 @@ def tail_vicky_logs(
365
365
  worker_id: int = 0,
366
366
  num_logs: int = -1,
367
367
  follow: bool = True,
368
+ start_offset: str = '1h',
368
369
  ):
369
370
  context = kubernetes_utils.get_current_kube_config_context_name()
370
371
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -380,15 +381,16 @@ def tail_vicky_logs(
380
381
  assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
381
382
  query = {'limit': num_logs}
382
383
  if follow:
384
+ effective_offset = start_offset or '1h'
383
385
  logger.info(
384
- 'Tailing logs from 1 hour ago. '
386
+ f'Tailing logs from {effective_offset} ago. '
385
387
  'If logs come up empty, there might be logs just earlier '
386
- 'than the past hour, check Grafana or use:\n'
388
+ 'than that window, check Grafana or use:\n'
387
389
  f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
388
390
  f'`konduktor logs --no-follow {job_name}`'
389
391
  f'{colorama.Style.RESET_ALL}'
390
392
  )
391
- query['start_offset'] = '1h'
393
+ query['start_offset'] = effective_offset
392
394
  query['query'] = (
393
395
  f'k8s.namespace.name: "{namespace}" AND '
394
396
  f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
@@ -453,12 +455,13 @@ def tail_logs(
453
455
  worker_id: int = 0,
454
456
  num_logs: int = 1000,
455
457
  follow: bool = True,
458
+ start_offset: str = '1h',
456
459
  ):
457
460
  logs_backend = config.get_nested(('logs', 'backend'), None)
458
461
  if logs_backend == LogBackend.VICTORIA:
459
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
462
+ tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
460
463
  elif logs_backend == LogBackend.LOKI:
461
464
  tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
462
465
  else:
463
466
  logger.info('Defaulting to VictoriaLogs')
464
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
467
+ tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20251107104752
3
+ Version: 0.1.0.dev20251215105431
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -29,6 +29,7 @@ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
29
29
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
30
30
  Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
31
31
  Requires-Dist: rich (>=13.9.4,<14.0.0)
32
+ Requires-Dist: sniffio (>=1.3,<2.0)
32
33
  Requires-Dist: websockets (>=15.0.1,<16.0.0)
33
34
  Description-Content-Type: text/markdown
34
35
 
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=mHmTi0owXeaxTt6NwGboUKlwfKWw6xwzbdcUjq9-1DM,1574
1
+ konduktor/__init__.py,sha256=A8k1HK8UyBfw1hk53UvZzpp6khOyWkEDT4EAypu6Osc,1574
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -11,9 +11,9 @@ konduktor/backends/deployment.py,sha256=d0a3F7dxDbnRKIt4ZO_kQ0_vet0pZvg4bWYzVZ8D
11
11
  konduktor/backends/deployment_utils.py,sha256=9CmB9CYC_3wxIfIOmTSCN2hbURZ5MpEMTvPwYMUXBRM,49272
12
12
  konduktor/backends/jobset.py,sha256=drt8Gc0iYQx18JWXBU6XfhUvC2xCKd8szSJ2JC4O20Q,8640
13
13
  konduktor/backends/jobset_utils.py,sha256=g49NY8RFhL_NNd4c1adRLG_Bq3UTFtRURxcAzxnMEYw,26524
14
- konduktor/backends/pod_utils.py,sha256=kOi3cLbTI3abZFCNQswWrkrOiBBm3gW_9N4INjxeS-w,19276
14
+ konduktor/backends/pod_utils.py,sha256=WL6b9_yBqBHjX84hE57uMTYL-rbGo_Ugf2L7-_8NpDc,19422
15
15
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
16
- konduktor/cli.py,sha256=B3Pp3RCwkGj8r9YgH-TgC85XU4zcc3eema1kpcDTQ3I,58452
16
+ konduktor/cli.py,sha256=ORnFQub6aSGeZytETn39Dafl1gjH-yihP2r5FnF3EeQ,59591
17
17
  konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
18
18
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
19
19
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -93,15 +93,15 @@ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4x
93
93
  konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
94
94
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
95
95
  konduktor/utils/kubernetes_utils.py,sha256=XleYxzG64hciZb-CjzBDjX8BOMhFATIIHZlXD2bqN0Q,27186
96
- konduktor/utils/log_utils.py,sha256=VUyTtN819BJnSwm33-73-h8aaD51Y5Gawt6ek2kU1tk,18181
96
+ konduktor/utils/log_utils.py,sha256=EPDDNu7WxfbP0T5WBHRNA1Pplr7mIeqFIQf9JTOQlss,18340
97
97
  konduktor/utils/loki_utils.py,sha256=eOGiD7dZNuwzmyXKiifyqz00EVh2nwcUPFSiPkac9y0,4050
98
98
  konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
99
99
  konduktor/utils/schemas.py,sha256=cr39nEAgjluhXoUYnvIwCwLBH8rLds37MBsF1uQv1rw,19067
100
100
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
101
101
  konduktor/utils/ux_utils.py,sha256=LSH4b5lckD157qDF4keThxtkGdxNrAfGKmH1ewhZkm4,8646
102
102
  konduktor/utils/validator.py,sha256=UcLvZCk9Cpbbhw8r_ZJtTpMSTfY1NKqcyciKsPzRPZM,17222
103
- konduktor_nightly-0.1.0.dev20251107104752.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
104
- konduktor_nightly-0.1.0.dev20251107104752.dist-info/METADATA,sha256=EEA9KjVBKhzBk4hO1-mWEacCmBul0d5GqMbB_VUKWbQ,4247
105
- konduktor_nightly-0.1.0.dev20251107104752.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
106
- konduktor_nightly-0.1.0.dev20251107104752.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
107
- konduktor_nightly-0.1.0.dev20251107104752.dist-info/RECORD,,
103
+ konduktor_nightly-0.1.0.dev20251215105431.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
104
+ konduktor_nightly-0.1.0.dev20251215105431.dist-info/METADATA,sha256=WWbQsIiHdKINH1YFtDbNxi8ZW3K_G9P7FyasPy3IWdM,4283
105
+ konduktor_nightly-0.1.0.dev20251215105431.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
106
+ konduktor_nightly-0.1.0.dev20251215105431.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
107
+ konduktor_nightly-0.1.0.dev20251215105431.dist-info/RECORD,,