cmd-queue 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cmd-queue might be problematic. Click here for more details.

cmd_queue/slurm_queue.py CHANGED
@@ -264,10 +264,6 @@ class SlurmJob(base_queue.Job):
264
264
  return ' \\\n '.join(args)
265
265
 
266
266
  def _build_sbatch_args(self, jobname_to_varname=None):
267
- # job_name = 'todo'
268
- # output_fpath = '$HOME/.cache/slurm/logs/job-%j-%x.out'
269
- # command = "python -c 'import sys; sys.exit(1)'"
270
- # -c 2 -p priority --gres=gpu:1
271
267
  sbatch_args = ['sbatch']
272
268
  if self.name:
273
269
  sbatch_args.append(f'--job-name="{self.name}"')
@@ -306,7 +302,8 @@ class SlurmJob(base_queue.Job):
306
302
 
307
303
  for key, value in self._sbatch_kvargs.items():
308
304
  key = key.replace('_', '-')
309
- sbatch_args.append(f'--{key}="{value}"')
305
+ if value is not None:
306
+ sbatch_args.append(f'--{key}="{value}"')
310
307
 
311
308
  for key, flag in self._sbatch_flags.items():
312
309
  if flag:
@@ -371,6 +368,19 @@ class SlurmQueue(base_queue.Queue):
371
368
  CommandLine:
372
369
  xdoctest -m cmd_queue.slurm_queue SlurmQueue
373
370
 
371
+ Example:
372
+ >>> from cmd_queue.slurm_queue import * # NOQA
373
+ >>> self = SlurmQueue()
374
+ >>> job0 = self.submit('echo "hi from $SLURM_JOBID"')
375
+ >>> job1 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job0])
376
+ >>> job2 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job1])
377
+ >>> job3 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job1, job2])
378
+ >>> self.write()
379
+ >>> self.print_commands()
380
+ >>> # xdoctest: +REQUIRES(--run)
381
+ >>> if not self.is_available():
382
+ >>> self.run()
383
+
374
384
  Example:
375
385
  >>> from cmd_queue.slurm_queue import * # NOQA
376
386
  >>> self = SlurmQueue()
@@ -413,6 +423,11 @@ class SlurmQueue(base_queue.Queue):
413
423
  self.unused_kwargs = kwargs
414
424
  self.queue_id = name + '-' + stamp + '-' + ub.hash_data(uuid.uuid4())[0:8]
415
425
  self.dpath = ub.Path.appdir('cmd_queue/slurm') / self.queue_id
426
+ if 0:
427
+ # hack for submission on different systems, probably don't want to
428
+ # do this.
429
+ self.dpath = self.dpath.shrinkuser(home='$HOME')
430
+
416
431
  self.log_dpath = self.dpath / 'logs'
417
432
  self.fpath = self.dpath / (self.queue_id + '.sh')
418
433
  self.shell = shell
@@ -420,10 +435,43 @@ class SlurmQueue(base_queue.Queue):
420
435
  self.all_depends = None
421
436
  self._sbatch_kvargs = ub.udict(kwargs) & SLURM_SBATCH_KVARGS
422
437
  self._sbatch_flags = ub.udict(kwargs) & SLURM_SBATCH_FLAGS
438
+ self._include_monitor_metadata = True
439
+ self.jobid_fpath = None
423
440
 
424
441
  def __nice__(self):
425
442
  return self.queue_id
426
443
 
444
+ @classmethod
445
+ def _slurm_checks(cls):
446
+ status = {}
447
+ info = {}
448
+ info['squeue_fpath'] = ub.find_exe('squeue')
449
+ status['has_squeue'] = bool(info['squeue_fpath'])
450
+ status['slurmd_running'] = False
451
+ import psutil
452
+ for p in psutil.process_iter():
453
+ if p.name() == 'slurmd':
454
+ status['slurmd_running'] = True
455
+ info['slurmd_info'] = {
456
+ 'pid': p.pid,
457
+ 'name': p.name(),
458
+ 'status': p.status(),
459
+ 'create_time': p.create_time(),
460
+ }
461
+ break
462
+ status['squeue_working'] = (ub.cmd('squeue')['ret'] == 0)
463
+
464
+ sinfo = ub.cmd('sinfo --json')
465
+ status['sinfo_working'] = False
466
+ if sinfo['ret'] == 0:
467
+ status['sinfo_working'] = True
468
+ import json
469
+ sinfo_out = json.loads(sinfo['out'])
470
+ has_working_nodes = not all(
471
+ node['state'] == 'down'
472
+ for node in sinfo_out['nodes'])
473
+ status['has_working_nodes'] = has_working_nodes
474
+
427
475
  @classmethod
428
476
  def is_available(cls):
429
477
  """
@@ -436,15 +484,37 @@ class SlurmQueue(base_queue.Queue):
436
484
  squeue_working = (ub.cmd('squeue')['ret'] == 0)
437
485
  if squeue_working:
438
486
  # Check if nodes are available or down
439
- sinfo = ub.cmd('sinfo --json')
440
- if sinfo['ret'] == 0:
487
+ # note: the --json command is not available in
488
+ # slurm-wlm 19.05.5, but it is in slurm-wlm 21.08.5
489
+ sinfo_version_str = ub.cmd('sinfo --version').stdout.strip().split(' ')[1]
490
+ sinfo_major_version = int(sinfo_version_str.split('.')[0])
491
+ if sinfo_major_version < 21:
492
+ # Don't check in this case
493
+ return True
494
+ else:
441
495
  import json
442
- sinfo_out = json.loads(sinfo['out'])
443
- has_working_nodes = not all(
444
- node['state'] == 'down'
445
- for node in sinfo_out['nodes'])
446
- if has_working_nodes:
447
- return True
496
+ # sinfo --json changed between v22 and v23
497
+ # https://github.com/SchedMD/slurm/blob/slurm-23.02/RELEASE_NOTES#L230
498
+ if sinfo_major_version == 22:
499
+ sinfo = ub.cmd('sinfo --json')
500
+ else:
501
+ sinfo = ub.cmd('scontrol show nodes --json')
502
+ if sinfo['ret'] == 0:
503
+ sinfo_out = json.loads(sinfo['out'])
504
+ nodes = sinfo_out['nodes']
505
+ # FIXME: this might be an incorrect check on v22
506
+ # the v23 version seems different, but I don't have
507
+ # v22 setup anymore. Might not be worth supporting.
508
+ node_states = [node['state'] for node in nodes]
509
+ if sinfo_major_version == 22:
510
+ has_working_nodes = not all(
511
+ 'down' in str(state).lower() for state in node_states)
512
+ else:
513
+ has_working_nodes = not all(
514
+ 'DOWN' in state for state in node_states)
515
+ if has_working_nodes:
516
+ return True
517
+
448
518
  return False
449
519
 
450
520
  def submit(self, command, **kwargs):
@@ -486,6 +556,12 @@ class SlurmQueue(base_queue.Queue):
486
556
  self.header_commands.append(command)
487
557
 
488
558
  def order_jobs(self):
559
+ """
560
+ Get a topological sorting of the jobs in this DAG.
561
+
562
+ Returns:
563
+ List[SlurmJob]: ordered jobs
564
+ """
489
565
  import networkx as nx
490
566
  graph = self._dependency_graph()
491
567
  if 0:
@@ -497,6 +573,15 @@ class SlurmQueue(base_queue.Queue):
497
573
  return new_order
498
574
 
499
575
  def finalize_text(self, exclude_tags=None, **kwargs):
576
+ """
577
+ Serialize the state of the queue into a bash script.
578
+
579
+ Returns:
580
+ str
581
+ """
582
+ # generating the slurm bash script is straightforward because slurm
583
+ # will take care of the hard stuff (like scheduling) for us. we just need
584
+ # to effectively encode the DAG as a list of sbatch commands.
500
585
  exclude_tags = util_tags.Tags.coerce(exclude_tags)
501
586
  new_order = self.order_jobs()
502
587
  commands = []
@@ -517,6 +602,20 @@ class SlurmQueue(base_queue.Queue):
517
602
  jobname_to_varname[job.name] = varname
518
603
  commands.append(command)
519
604
  self.jobname_to_varname = jobname_to_varname
605
+
606
+ self._include_monitor_metadata = True
607
+ if self._include_monitor_metadata:
608
+ # Build a command to dump the job-ids for this queue to disk to
609
+ # allow us to track them in the monitor.
610
+ from cmd_queue.util import util_bash
611
+ json_fmt_parts = [
612
+ (job_varname, '%s', '$' + job_varname)
613
+ for job_varname in self.jobname_to_varname.values()
614
+ ]
615
+ self.jobid_fpath = self.fpath.augment(ext='.jobids.json')
616
+ command = util_bash.bash_json_dump(json_fmt_parts, self.jobid_fpath)
617
+ commands.append(command)
618
+
520
619
  text = '\n'.join(commands)
521
620
  return text
522
621
 
@@ -532,6 +631,25 @@ class SlurmQueue(base_queue.Queue):
532
631
  def monitor(self, refresh_rate=0.4):
533
632
  """
534
633
  Monitor progress until the jobs are done
634
+
635
+ CommandLine:
636
+ xdoctest -m cmd_queue.slurm_queue SlurmQueue.monitor --dev --run
637
+
638
+ Example:
639
+ >>> # xdoctest: +REQUIRES(--dev)
640
+ >>> from cmd_queue.slurm_queue import * # NOQA
641
+ >>> dpath = ub.Path.appdir('slurm_queue/tests/test-slurm-failed-monitor')
642
+ >>> queue = SlurmQueue()
643
+ >>> job0 = queue.submit(f'echo "here we go"', name='job0')
644
+ >>> job1 = queue.submit(f'echo "this job will pass, allowing dependencies to run" && true', depends=[job0])
645
+ >>> job2 = queue.submit(f'echo "this job will run and pass" && sleep 10 && true', depends=[job1])
646
+ >>> job3 = queue.submit(f'echo "this job will run and fail" && false', depends=[job1])
647
+ >>> job4 = queue.submit(f'echo "this job will fail, preventing dependencies from running" && false', depends=[job0])
648
+ >>> job5 = queue.submit(f'echo "this job will never run" && true', depends=[job4])
649
+ >>> job6 = queue.submit(f'echo "this job will also never run" && false', depends=[job4])
650
+ >>> queue.print_commands()
651
+ >>> # xdoctest: +REQUIRES(--run)
652
+ >>> queue.run()
535
653
  """
536
654
 
537
655
  import time
@@ -543,46 +661,154 @@ class SlurmQueue(base_queue.Queue):
543
661
 
544
662
  num_at_start = None
545
663
 
664
+ job_status_table = None
665
+ if self.jobid_fpath is not None:
666
+ class UnableToMonitor(Exception):
667
+ ...
668
+ try:
669
+ import json
670
+ if not self.jobid_fpath.exists():
671
+ raise UnableToMonitor
672
+ jobid_lut = json.loads(self.jobid_fpath.read_text())
673
+ job_status_table = [
674
+ {
675
+ 'job_varname': job_varname,
676
+ 'job_id': job_id,
677
+ 'status': 'unknown',
678
+ 'needs_update': True,
679
+ }
680
+ for job_varname, job_id in jobid_lut.items()
681
+ ]
682
+ except UnableToMonitor:
683
+ print('ERROR: Unable to monitors jobids')
684
+
685
+ def update_jobid_status():
686
+ import rich
687
+ for row in job_status_table:
688
+ if row['needs_update']:
689
+ job_id = row['job_id']
690
+ out = ub.cmd(f'scontrol show job "{job_id}"')
691
+ info = parse_scontrol_output(out.stdout)
692
+ row['JobState'] = info['JobState']
693
+ row['ExitCode'] = info.get('ExitCode', None)
694
+ # https://slurm.schedmd.com/job_state_codes.html
695
+ if info['JobState'].startswith('FAILED'):
696
+ row['status'] = 'failed'
697
+ rich.print(f'[red] Failed job: {info["JobName"]}')
698
+ if info["StdErr"] == info["StdOut"]:
699
+ rich.print(f'[red] * Logs: {info["StdErr"]}')
700
+ else:
701
+ rich.print(f'[red] StdErr: {info["StdErr"]}')
702
+ rich.print(f'[red] StdOut: {info["StdOut"]}')
703
+ row['needs_update'] = False
704
+ elif info['JobState'].startswith('CANCELLED'):
705
+ rich.print(f'[yellow] Skip job: {info["JobName"]}')
706
+ row['status'] = 'skipped'
707
+ row['needs_update'] = False
708
+ elif info['JobState'].startswith('COMPLETED'):
709
+ rich.print(f'[green] Completed job: {info["JobName"]}')
710
+ row['status'] = 'passed'
711
+ row['needs_update'] = False
712
+ elif info['JobState'].startswith('RUNNING'):
713
+ row['status'] = 'running'
714
+ elif info['JobState'].startswith('PENDING'):
715
+ row['status'] = 'pending'
716
+ else:
717
+ row['status'] = 'unknown'
718
+ # print(f'job_status_table = {ub.urepr(job_status_table, nl=1)}')
719
+
546
720
  def update_status_table():
547
721
  nonlocal num_at_start
548
- # https://rich.readthedocs.io/en/stable/live.html
549
- info = ub.cmd('squeue --format="%i %P %j %u %t %M %D %R"')
550
- stream = io.StringIO(info['out'])
551
- df = pd.read_csv(stream, sep=' ')
552
- jobid_history.update(df['JOBID'])
553
-
554
- num_running = (df['ST'] == 'R').sum()
555
- num_in_queue = len(df)
556
- total_monitored = len(jobid_history)
557
-
558
- HACK_KILL_BROKEN_JOBS = 1
559
- if HACK_KILL_BROKEN_JOBS:
560
- # For whatever reason using kill-on-invalid-dep
561
- # kills jobs too fast and not when they are in a dependency state not a
562
- # a never satisfied state. Killing these jobs here seems to fix
563
- # it.
564
- broken_jobs = df[df['NODELIST(REASON)'] == '(DependencyNeverSatisfied)']
565
- if len(broken_jobs):
566
- for name in broken_jobs['NAME']:
567
- ub.cmd(f'scancel --name="{name}"')
722
+
723
+ # TODO: move this block into into the version where job status
724
+ # table is not available, and reimplement it for the per-job style
725
+ # of query. The reason we have it out here now is because we need
726
+ # to implement the HACK_KILL_BROKEN_JOBS in the alternate case.
727
+ if True:
728
+ # https://rich.readthedocs.io/en/stable/live.html
729
+ info = ub.cmd('squeue --format="%i %P %j %u %t %M %D %R"')
730
+ stream = io.StringIO(info['out'])
731
+ df = pd.read_csv(stream, sep=' ')
732
+
733
+ # Only include job names that this queue created
734
+ job_names = [job.name for job in self.jobs]
735
+ df = df[df['NAME'].isin(job_names)]
736
+ jobid_history.update(df['JOBID'])
737
+
738
+ num_running = (df['ST'] == 'R').sum()
739
+ num_in_queue = len(df)
740
+ total_monitored = len(jobid_history)
741
+
742
+ HACK_KILL_BROKEN_JOBS = 1
743
+ if HACK_KILL_BROKEN_JOBS:
744
+ # For whatever reason using kill-on-invalid-dep
745
+ # kills jobs too fast and not when they are in a dependency state not a
746
+ # never-satisfied state. Killing these jobs here seems to fix
747
+ # it.
748
+ broken_jobs = df[df['NODELIST(REASON)'] == '(DependencyNeverSatisfied)']
749
+ if len(broken_jobs):
750
+ for name in broken_jobs['NAME']:
751
+ ub.cmd(f'scancel --name="{name}"')
568
752
 
569
753
  if num_at_start is None:
570
754
  num_at_start = len(df)
571
755
 
572
- table = Table(*['num_running', 'num_in_queue', 'total_monitored', 'num_at_start'],
756
+ if job_status_table is not None:
757
+ update_jobid_status()
758
+ state = ub.dict_hist([row['status'] for row in job_status_table])
759
+ state.setdefault('passed', 0)
760
+ state.setdefault('failed', 0)
761
+ state.setdefault('skipped', 0)
762
+ state.setdefault('pending', 0)
763
+ state.setdefault('unknown', 0)
764
+ state.setdefault('running', 0)
765
+ state['total'] = len(job_status_table)
766
+
767
+ state['other'] = state['total'] - (
768
+ state['passed'] + state['failed'] + state['skipped'] +
769
+ state['running'] + state['pending']
770
+ )
771
+ pass_color = ''
772
+ fail_color = ''
773
+ skip_color = ''
774
+ finished = (state['pending'] + state['unknown'] + state['running'] == 0)
775
+ if (state['failed'] > 0):
776
+ fail_color = '[red]'
777
+ if (state['skipped'] > 0):
778
+ skip_color = '[yellow]'
779
+ if finished:
780
+ pass_color = '[green]'
781
+
782
+ header = ['passed', 'failed', 'skipped', 'running', 'pending', 'other', 'total']
783
+ row_values = [
784
+ f"{pass_color}{state['passed']}",
785
+ f"{fail_color}{state['failed']}",
786
+ f"{skip_color}{state['skipped']}",
787
+ f"{state['running']}",
788
+ f"{state['pending']}",
789
+ f"{state['other']}",
790
+ f"{state['total']}",
791
+ ]
792
+ else:
793
+ # TODO: determine if slurm has accounting on, and if we can
794
+ # figure out how many jobs errored / passed
795
+ header = ['num_running', 'num_in_queue', 'total_monitored', 'num_at_start']
796
+ row_values = [
797
+ f'{num_running}',
798
+ f'{num_in_queue}',
799
+ f'{total_monitored}',
800
+ f'{num_at_start}',
801
+ ]
802
+ # row_values.append(str(state.get('FAIL', 0)))
803
+ # row_values.append(str(state.get('SKIPPED', 0)))
804
+ # row_values.append(str(state.get('PENDING', 0)))
805
+ finished = (num_in_queue == 0)
806
+
807
+ table = Table(*header,
573
808
  title='slurm-monitor')
574
809
 
575
- # TODO: determine if slurm has accounting on, and if we can
576
- # figure out how many jobs errored / passed
577
-
578
- table.add_row(
579
- f'{num_running}',
580
- f'{num_in_queue}',
581
- f'{total_monitored}',
582
- f'{num_at_start}',
583
- )
810
+ table.add_row(*row_values)
584
811
 
585
- finished = (num_in_queue == 0)
586
812
  return table, finished
587
813
 
588
814
  try:
@@ -622,6 +848,8 @@ class SlurmQueue(base_queue.Queue):
622
848
  style (str):
623
849
  can be 'colors', 'rich', or 'plain'
624
850
 
851
+ **kwargs: extra backend-specific args passed to finalize_text
852
+
625
853
  CommandLine:
626
854
  xdoctest -m cmd_queue.slurm_queue SlurmQueue.print_commands
627
855
 
@@ -640,6 +868,82 @@ class SlurmQueue(base_queue.Queue):
640
868
  rprint = print_commands
641
869
 
642
870
 
871
+ def parse_scontrol_output(output: str) -> dict:
872
+ """
873
+ Parses the output of `scontrol show job` into a dictionary.
874
+
875
+ Example:
876
+ from cmd_queue.slurm_queue import * # NOQA
877
+ # Example usage
878
+ output = ub.codeblock(
879
+ '''
880
+ JobId=307 JobName=J0002-SQ-2025 with a space 0218T165929-9a50513a
881
+ UserId=joncrall(1000) GroupId=joncrall(1000) MCS_label=N/A
882
+ Priority=1 Nice=0 Account=(null) QOS=(null)
883
+ JobState=COMPLETED Reason=None Dependency=(null)
884
+ Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
885
+ RunTime=00:00:10 TimeLimit=365-00:00:00 TimeMin=N/A
886
+ SubmitTime=2025-02-18T16:59:30 EligibleTime=2025-02-18T16:59:33
887
+ AccrueTime=Unknown
888
+ StartTime=2025-02-18T16:59:33 EndTime=2025-02-18T16:59:43 Deadline=N/A
889
+ SuspendTime=None SecsPreSuspend=0 LastSchedEval=2025-02-18T16:59:33 Scheduler=Backfill
890
+ Partition=priority AllocNode:Sid=localhost:215414
891
+ ReqNodeList=(null) ExcNodeList=(null)
892
+ NodeList=toothbrush
893
+ BatchHost=toothbrush
894
+ NumNodes=1 NumCPUs=2 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
895
+ ReqTRES=cpu=1,mem=120445M,node=1,billing=1
896
+ AllocTRES=cpu=2,node=1,billing=2
897
+ Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
898
+ MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
899
+ Features=(null) DelayBoot=00:00:00
900
+ OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
901
+ Command=(null)
902
+ WorkDir=/home/joncrall/code/cmd_queue
903
+ StdErr="cmd_queue/slurm/SQ-2025021 with a space 8T165929-9a50513a/logs/J0002-SQ-20250218T165929-9a50513a.sh"
904
+ StdIn=/dev/null
905
+ StdOut="slurm/SQ-20 with and = 250218T165929-9a50513a/logs/J0002-SQ-20250218T165929-9a50513a.sh"
906
+ Power=
907
+ ''')
908
+ parse_scontrol_output(output)
909
+ """
910
+ import re
911
+ # These keys should be the last key on a line. They are allowed to contain
912
+ # space and equal characters.
913
+ special_keys = [
914
+ 'JobName', 'WorkDir', 'StdErr', 'StdIn', 'StdOut', 'Command',
915
+ 'NodeList', 'BatchHost', 'Partition'
916
+ ]
917
+ patterns = '(' + '|'.join(f' {re.escape(k)}=' for k in special_keys) + ')'
918
+ pat = re.compile(patterns)
919
+
920
+ # Initialize dictionary to store parsed key-value pairs
921
+ parsed_data = {}
922
+
923
+ # Split the input into lines
924
+ for line in output.splitlines():
925
+ # First, check for special keys (those with spaces before the equal sign)
926
+ match = pat.search(line)
927
+ if match:
928
+ # Special case: Key is a special key with a space
929
+ startpos = match.start()
930
+ leading_part = line[:startpos]
931
+ special_part = line[startpos + 1:]
932
+ key, value = special_part.split('=', 1)
933
+ parsed_data[key] = value.strip()
934
+ line = leading_part
935
+
936
+ # Now, handle the general case: split by spaces and then by "="
937
+ line = line.strip()
938
+ if line:
939
+ parts = line.split(' ')
940
+ for part in parts:
941
+ key, value = part.split('=', 1)
942
+ parsed_data[key] = value
943
+
944
+ return parsed_data
945
+
946
+
643
947
  SLURM_NOTES = r"""
644
948
  This shows a few things you can do with slurm
645
949
 
@@ -729,4 +1033,9 @@ sbatch \
729
1033
  squeue
730
1034
 
731
1035
 
1036
+
1037
+ References:
1038
+ https://stackoverflow.com/questions/74164136/slurm-accessing-stdout-stderr-location-of-a-completed-job
1039
+
1040
+
732
1041
  """
cmd_queue/slurmify.py ADDED
@@ -0,0 +1,116 @@
1
+ r"""
2
+ Helper script to wrap a command with sbatch, but using a more srun like syntax.
3
+
4
+ .. code:: bash
5
+
6
+ python -m cmd_queue.slurmify \
7
+ --jobname="my_job" \
8
+ --depends=None \
9
+ --gpus=1 \
10
+ --mem=16GB \
11
+ --cpus_per_task=5 \
12
+ --ntasks=1 \
13
+ --ntasks-per-node=1 \
14
+ --partition=community \
15
+ -- \
16
+ python -c 'import sys; print("hello world"); sys.exit(0)'
17
+ """
18
+ #!/usr/bin/env python3
19
+ import scriptconfig as scfg
20
+ import ubelt as ub
21
+
22
+
23
+ class SlurmifyCLI(scfg.DataConfig):
24
+ __command__ = 'slurmify'
25
+
26
+ jobname = scfg.Value(None, help='for submit, this is the name of the new job')
27
+ depends = scfg.Value(None, help='comma separated jobnames to depend on')
28
+
29
+ command = scfg.Value(None, type=str, position=1, nargs='*', help=ub.paragraph(
30
+ '''
31
+ Specifies the bash command to queue.
32
+ Care must be taken when specifying this argument. If specifying as a
33
+ key/value pair argument, it is important to quote and escape the bash
34
+ command properly. A more convinient way to specify this command is as
35
+ a positional argument. End all of the options to this CLI with `--` and
36
+ then specify your full command.
37
+ '''))
38
+
39
+ gpus = scfg.Value(None, help='a comma separated list of the gpu numbers to spread across. tmux backend only.')
40
+ workers = scfg.Value(1, help='number of concurrent queues for the tmux backend.')
41
+
42
+ mem = scfg.Value(None, help='')
43
+ partition = scfg.Value(1, help='slurm partition')
44
+
45
+ ntasks = scfg.Value(None, help='')
46
+ ntasks_per_node = scfg.Value(None, help='')
47
+ cpus_per_task = scfg.Value(None, help='')
48
+
49
+ @classmethod
50
+ def main(cls, cmdline=1, **kwargs):
51
+ """
52
+ Example:
53
+ >>> # xdoctest: +SKIP
54
+ >>> from cmd_queue.slurmify import * # NOQA
55
+ >>> cmdline = 0
56
+ >>> kwargs = dict()
57
+ >>> cls = SlurmifyCLI
58
+ >>> cls.main(cmdline=cmdline, **kwargs)
59
+ """
60
+ import rich
61
+ from rich.markup import escape
62
+ config = cls.cli(cmdline=cmdline, data=kwargs, strict=True)
63
+ rich.print('config = ' + escape(ub.urepr(config, nl=1)))
64
+
65
+ # import json
66
+ # Run a new CLI queue
67
+ row = {'type': 'command', 'command': config['command']}
68
+ if config.jobname:
69
+ row['name'] = config.jobname
70
+ if config.depends:
71
+ row['depends'] = config.depends
72
+
73
+ import cmd_queue
74
+ queue = cmd_queue.Queue.create(
75
+ size=max(1, config['workers']),
76
+ backend='slurm',
77
+ name='slurmified',
78
+ gpus=config['gpus'],
79
+ mem=config['mem'],
80
+ partition=config['partition'],
81
+ ntasks=config['ntasks'],
82
+ ntasks_per_node=config['ntasks_per_node'],
83
+ )
84
+ try:
85
+ bash_command = row['command']
86
+ if isinstance(bash_command, list):
87
+ if len(bash_command) == 1:
88
+ # hack
89
+ import shlex
90
+ if shlex.quote(bash_command[0]) == bash_command[0]:
91
+ bash_command = bash_command[0]
92
+ else:
93
+ bash_command = shlex.quote(bash_command[0])
94
+ else:
95
+ import shlex
96
+ bash_command = ' '.join([shlex.quote(str(p)) for p in bash_command])
97
+ submitkw = ub.udict(row) & {'name', 'depends'}
98
+ queue.submit(bash_command, log=False, **submitkw)
99
+ except Exception:
100
+ print('row = {}'.format(ub.urepr(row, nl=1)))
101
+ raise
102
+ queue.print_commands()
103
+
104
+ # config.cli_queue_fpath.write_text(json.dumps(row))
105
+ # 'sbatch --job-name="test_job1" --output="$HOME/.cache/slurm/logs/job-%j-%x.out" --wrap=""
106
+
107
+ __cli__ = SlurmifyCLI
108
+
109
+ if __name__ == '__main__':
110
+ """
111
+
112
+ CommandLine:
113
+ python ~/code/cmd_queue/cmd_queue/slurmify.py
114
+ python -m cmd_queue.slurmify
115
+ """
116
+ __cli__.main()
cmd_queue/tmux_queue.py CHANGED
@@ -724,7 +724,7 @@ class TMUXMultiQueue(base_queue.Queue):
724
724
 
725
725
  CommandLine:
726
726
  xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:0
727
- xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:1 --interact
727
+ INTERACTIVE_TEST=1 xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:1
728
728
 
729
729
  Example:
730
730
  >>> # xdoctest: +REQUIRES(--interact)
@@ -855,7 +855,6 @@ class TMUXMultiQueue(base_queue.Queue):
855
855
  finished &= (state['status'] == 'done')
856
856
  if state['status'] == 'done':
857
857
  pass_color = '[green]'
858
-
859
858
  if (state['failed'] > 0):
860
859
  fail_color = '[red]'
861
860
  if (state['skipped'] > 0):
@@ -1074,5 +1073,19 @@ if 0:
1074
1073
  tmux send -t my_session_id1 "tmux select-pane -t 3" Enter
1075
1074
  tmux send -t my_session_id1 "echo pane3" Enter
1076
1075
 
1076
+ # https://stackoverflow.com/questions/54954177/how-to-write-a-tmux-script-so-that-it-automatically-split-windows-and-opens-a-se
1077
+ # https://tmuxcheatsheet.com/
1078
+ # https://gist.github.com/Starefossen/5955406
1079
+
1080
+ # List the bindings
1081
+ tmux list-keys
1082
+
1083
+ # Can arrange the splits in a session via a preset layout
1084
+ # Preset layouts are:
1085
+ # even-horizontal, even-vertical, main-horizontal, main-vertical, or tiled.
1086
+ tmux select-layout -t "${SESSION_NAME}" even-vertical
1087
+
1088
+ # switch to an existing session
1089
+ tmux switch -t "${SESSION_NAME}"
1077
1090
 
1078
1091
  """