cmd-queue 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cmd-queue might be problematic.
- cmd_queue/__init__.py +1 -1
- cmd_queue/serial_queue.py +55 -59
- cmd_queue/slurm_queue.py +353 -44
- cmd_queue/slurmify.py +116 -0
- cmd_queue/tmux_queue.py +15 -2
- cmd_queue/util/util_bash.py +52 -0
- cmd_queue/util/util_tmux.py +76 -0
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/METADATA +213 -175
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/RECORD +13 -11
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/WHEEL +1 -1
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/LICENSE +0 -0
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/entry_points.txt +0 -0
- {cmd_queue-0.2.0.dist-info → cmd_queue-0.2.2.dist-info}/top_level.txt +0 -0
cmd_queue/slurm_queue.py
CHANGED
@@ -264,10 +264,6 @@ class SlurmJob(base_queue.Job):
         return ' \\\n '.join(args)

     def _build_sbatch_args(self, jobname_to_varname=None):
-        # job_name = 'todo'
-        # output_fpath = '$HOME/.cache/slurm/logs/job-%j-%x.out'
-        # command = "python -c 'import sys; sys.exit(1)'"
-        # -c 2 -p priority --gres=gpu:1
         sbatch_args = ['sbatch']
         if self.name:
             sbatch_args.append(f'--job-name="{self.name}"')
@@ -306,7 +302,8 @@ class SlurmJob(base_queue.Job):

         for key, value in self._sbatch_kvargs.items():
             key = key.replace('_', '-')
-            sbatch_args.append(f'--{key}="{value}"')
+            if value is not None:
+                sbatch_args.append(f'--{key}="{value}"')

         for key, flag in self._sbatch_flags.items():
             if flag:
@@ -371,6 +368,19 @@ class SlurmQueue(base_queue.Queue):
     CommandLine:
         xdoctest -m cmd_queue.slurm_queue SlurmQueue

+    Example:
+        >>> from cmd_queue.slurm_queue import * # NOQA
+        >>> self = SlurmQueue()
+        >>> job0 = self.submit('echo "hi from $SLURM_JOBID"')
+        >>> job1 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job0])
+        >>> job2 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job1])
+        >>> job3 = self.submit('echo "hi from $SLURM_JOBID"', depends=[job1, job2])
+        >>> self.write()
+        >>> self.print_commands()
+        >>> # xdoctest: +REQUIRES(--run)
+        >>> if not self.is_available():
+        >>>     self.run()
+
     Example:
         >>> from cmd_queue.slurm_queue import * # NOQA
         >>> self = SlurmQueue()
@@ -413,6 +423,11 @@ class SlurmQueue(base_queue.Queue):
         self.unused_kwargs = kwargs
         self.queue_id = name + '-' + stamp + '-' + ub.hash_data(uuid.uuid4())[0:8]
         self.dpath = ub.Path.appdir('cmd_queue/slurm') / self.queue_id
+        if 0:
+            # hack for submission on different systems, probably dont want to
+            # do this.
+            self.dpath = self.dpath.shrinkuser(home='$HOME')
+
         self.log_dpath = self.dpath / 'logs'
         self.fpath = self.dpath / (self.queue_id + '.sh')
         self.shell = shell
@@ -420,10 +435,43 @@ class SlurmQueue(base_queue.Queue):
         self.all_depends = None
         self._sbatch_kvargs = ub.udict(kwargs) & SLURM_SBATCH_KVARGS
         self._sbatch_flags = ub.udict(kwargs) & SLURM_SBATCH_FLAGS
+        self._include_monitor_metadata = True
+        self.jobid_fpath = None

     def __nice__(self):
         return self.queue_id

+    @classmethod
+    def _slurm_checks(cls):
+        status = {}
+        info = {}
+        info['squeue_fpath'] = ub.find_exe('squeue')
+        status['has_squeue'] = bool(info['squeue_fpath'])
+        status['slurmd_running'] = False
+        import psutil
+        for p in psutil.process_iter():
+            if p.name() == 'slurmd':
+                status['slurmd_running'] = True
+                info['slurmd_info'] = {
+                    'pid': p.pid,
+                    'name': p.name(),
+                    'status': p.status(),
+                    'create_time': p.create_time(),
+                }
+                break
+        status['squeue_working'] = (ub.cmd('squeue')['ret'] == 0)
+
+        sinfo = ub.cmd('sinfo --json')
+        status['sinfo_working'] = False
+        if sinfo['ret'] == 0:
+            status['sinfo_working'] = True
+            import json
+            sinfo_out = json.loads(sinfo['out'])
+            has_working_nodes = not all(
+                node['state'] == 'down'
+                for node in sinfo_out['nodes'])
+            status['has_working_nodes'] = has_working_nodes
+
     @classmethod
     def is_available(cls):
         """
@@ -436,15 +484,37 @@ class SlurmQueue(base_queue.Queue):
         squeue_working = (ub.cmd('squeue')['ret'] == 0)
         if squeue_working:
             # Check if nodes are available or down
-
-
+            # note: the --json command is not available in
+            # slurm-wlm 19.05.5, but it is in slurm-wlm 21.08.5
+            sinfo_version_str = ub.cmd('sinfo --version').stdout.strip().split(' ')[1]
+            sinfo_major_version = int(sinfo_version_str.split('.')[0])
+            if sinfo_major_version < 21:
+                # Dont check in this case
+                return True
+            else:
                 import json
-
-
-
-
-
-
+                # sinfo --json changed between v22 and v23
+                # https://github.com/SchedMD/slurm/blob/slurm-23.02/RELEASE_NOTES#L230
+                if sinfo_major_version == 22:
+                    sinfo = ub.cmd('sinfo --json')
+                else:
+                    sinfo = ub.cmd('scontrol show nodes --json')
+                if sinfo['ret'] == 0:
+                    sinfo_out = json.loads(sinfo['out'])
+                    nodes = sinfo_out['nodes']
+                    # FIXME: this might be an incorrect check on v22
+                    # the v23 version seems different, but I don't have
+                    # v22 setup anymore. Might not be worth supporting.
+                    node_states = [node['state'] for node in nodes]
+                    if sinfo_major_version == 22:
+                        has_working_nodes = not all(
+                            'down' in str(state).lower() for state in node_states)
+                    else:
+                        has_working_nodes = not all(
+                            'DOWN' in state for state in node_states)
+                    if has_working_nodes:
+                        return True
+
         return False

     def submit(self, command, **kwargs):
@@ -486,6 +556,12 @@ class SlurmQueue(base_queue.Queue):
             self.header_commands.append(command)

     def order_jobs(self):
+        """
+        Get a topological sorting of the jobs in this DAG.
+
+        Returns:
+            List[SlurmJob]: ordered jobs
+        """
         import networkx as nx
         graph = self._dependency_graph()
         if 0:
@@ -497,6 +573,15 @@ class SlurmQueue(base_queue.Queue):
         return new_order

     def finalize_text(self, exclude_tags=None, **kwargs):
+        """
+        Serialize the state of the queue into a bash script.
+
+        Returns:
+            str
+        """
+        # generating the slurm bash script is straightforward because slurm
+        # will take of the hard stuff (like scheduling) for us. we just need
+        # to effectively encode the DAG as a list of sbatch commands.
         exclude_tags = util_tags.Tags.coerce(exclude_tags)
         new_order = self.order_jobs()
         commands = []
@@ -517,6 +602,20 @@ class SlurmQueue(base_queue.Queue):
                 jobname_to_varname[job.name] = varname
             commands.append(command)
         self.jobname_to_varname = jobname_to_varname
+
+        self._include_monitor_metadata = True
+        if self._include_monitor_metadata:
+            # Build a command to dump the job-ids for this queue to disk to
+            # allow us to track them in the monitor.
+            from cmd_queue.util import util_bash
+            json_fmt_parts = [
+                (job_varname, '%s', '$' + job_varname)
+                for job_varname in self.jobname_to_varname.values()
+            ]
+            self.jobid_fpath = self.fpath.augment(ext='.jobids.json')
+            command = util_bash.bash_json_dump(json_fmt_parts, self.jobid_fpath)
+            commands.append(command)
+
         text = '\n'.join(commands)
         return text

@@ -532,6 +631,25 @@ class SlurmQueue(base_queue.Queue):
     def monitor(self, refresh_rate=0.4):
         """
         Monitor progress until the jobs are done
+
+        CommandLine:
+            xdoctest -m cmd_queue.slurm_queue SlurmQueue.monitor --dev --run
+
+        Example:
+            >>> # xdoctest: +REQUIRES(--dev)
+            >>> from cmd_queue.slurm_queue import * # NOQA
+            >>> dpath = ub.Path.appdir('slurm_queue/tests/test-slurm-failed-monitor')
+            >>> queue = SlurmQueue()
+            >>> job0 = queue.submit(f'echo "here we go"', name='job0')
+            >>> job1 = queue.submit(f'echo "this job will pass, allowing dependencies to run" && true', depends=[job0])
+            >>> job2 = queue.submit(f'echo "this job will run and pass" && sleep 10 && true', depends=[job1])
+            >>> job3 = queue.submit(f'echo "this job will run and fail" && false', depends=[job1])
+            >>> job4 = queue.submit(f'echo "this job will fail, preventing dependencies from running" && false', depends=[job0])
+            >>> job5 = queue.submit(f'echo "this job will never run" && true', depends=[job4])
+            >>> job6 = queue.submit(f'echo "this job will also never run" && false', depends=[job4])
+            >>> queue.print_commands()
+            >>> # xdoctest: +REQUIRES(--run)
+            >>> queue.run()
         """

         import time
@@ -543,46 +661,154 @@ class SlurmQueue(base_queue.Queue):

         num_at_start = None

+        job_status_table = None
+        if self.jobid_fpath is not None:
+            class UnableToMonitor(Exception):
+                ...
+            try:
+                import json
+                if not self.jobid_fpath.exists():
+                    raise UnableToMonitor
+                jobid_lut = json.loads(self.jobid_fpath.read_text())
+                job_status_table = [
+                    {
+                        'job_varname': job_varname,
+                        'job_id': job_id,
+                        'status': 'unknown',
+                        'needs_update': True,
+                    }
+                    for job_varname, job_id in jobid_lut.items()
+                ]
+            except UnableToMonitor:
+                print('ERROR: Unable to monitors jobids')
+
+        def update_jobid_status():
+            import rich
+            for row in job_status_table:
+                if row['needs_update']:
+                    job_id = row['job_id']
+                    out = ub.cmd(f'scontrol show job "{job_id}"')
+                    info = parse_scontrol_output(out.stdout)
+                    row['JobState'] = info['JobState']
+                    row['ExitCode'] = info.get('ExitCode', None)
+                    # https://slurm.schedmd.com/job_state_codes.html
+                    if info['JobState'].startswith('FAILED'):
+                        row['status'] = 'failed'
+                        rich.print(f'[red] Failed job: {info["JobName"]}')
+                        if info["StdErr"] == info["StdOut"]:
+                            rich.print(f'[red] * Logs: {info["StdErr"]}')
+                        else:
+                            rich.print(f'[red] StdErr: {info["StdErr"]}')
+                            rich.print(f'[red] StdOut: {info["StdOut"]}')
+                        row['needs_update'] = False
+                    elif info['JobState'].startswith('CANCELLED'):
+                        rich.print(f'[yellow] Skip job: {info["JobName"]}')
+                        row['status'] = 'skipped'
+                        row['needs_update'] = False
+                    elif info['JobState'].startswith('COMPLETED'):
+                        rich.print(f'[green] Completed job: {info["JobName"]}')
+                        row['status'] = 'passed'
+                        row['needs_update'] = False
+                    elif info['JobState'].startswith('RUNNING'):
+                        row['status'] = 'running'
+                    elif info['JobState'].startswith('PENDING'):
+                        row['status'] = 'pending'
+                    else:
+                        row['status'] = 'unknown'
+            # print(f'job_status_table = {ub.urepr(job_status_table, nl=1)}')
+
         def update_status_table():
             nonlocal num_at_start
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # TODO: move this block into into the version where job status
+            # table is not available, and reimplement it for the per-job style
+            # of query. The reason we have it out here now is because we need
+            # to implement the HACK_KILL_BROKEN_JOBS in the alternate case.
+            if True:
+                # https://rich.readthedocs.io/en/stable/live.html
+                info = ub.cmd('squeue --format="%i %P %j %u %t %M %D %R"')
+                stream = io.StringIO(info['out'])
+                df = pd.read_csv(stream, sep=' ')
+
+                # Only include job names that this queue created
+                job_names = [job.name for job in self.jobs]
+                df = df[df['NAME'].isin(job_names)]
+                jobid_history.update(df['JOBID'])
+
+                num_running = (df['ST'] == 'R').sum()
+                num_in_queue = len(df)
+                total_monitored = len(jobid_history)
+
+                HACK_KILL_BROKEN_JOBS = 1
+                if HACK_KILL_BROKEN_JOBS:
+                    # For whatever reason using kill-on-invalid-dep
+                    # kills jobs too fast and not when they are in a dependency state not a
+                    # a never satisfied state. Killing these jobs here seems to fix
+                    # it.
+                    broken_jobs = df[df['NODELIST(REASON)'] == '(DependencyNeverSatisfied)']
+                    if len(broken_jobs):
+                        for name in broken_jobs['NAME']:
+                            ub.cmd(f'scancel --name="{name}"')

             if num_at_start is None:
                 num_at_start = len(df)

-
+            if job_status_table is not None:
+                update_jobid_status()
+                state = ub.dict_hist([row['status'] for row in job_status_table])
+                state.setdefault('passed', 0)
+                state.setdefault('failed', 0)
+                state.setdefault('skipped', 0)
+                state.setdefault('pending', 0)
+                state.setdefault('unknown', 0)
+                state.setdefault('running', 0)
+                state['total'] = len(job_status_table)
+
+                state['other'] = state['total'] - (
+                    state['passed'] + state['failed'] + state['skipped'] +
+                    state['running'] + state['pending']
+                )
+                pass_color = ''
+                fail_color = ''
+                skip_color = ''
+                finished = (state['pending'] + state['unknown'] + state['running'] == 0)
+                if (state['failed'] > 0):
+                    fail_color = '[red]'
+                if (state['skipped'] > 0):
+                    skip_color = '[yellow]'
+                if finished:
+                    pass_color = '[green]'
+
+                header = ['passed', 'failed', 'skipped', 'running', 'pending', 'other', 'total']
+                row_values = [
+                    f"{pass_color}{state['passed']}",
+                    f"{fail_color}{state['failed']}",
+                    f"{skip_color}{state['skipped']}",
+                    f"{state['running']}",
+                    f"{state['pending']}",
+                    f"{state['other']}",
+                    f"{state['total']}",
+                ]
+            else:
+                # TODO: determine if slurm has accounting on, and if we can
+                # figure out how many jobs errored / passed
+                header = ['num_running', 'num_in_queue', 'total_monitored', 'num_at_start']
+                row_values = [
+                    f'{num_running}',
+                    f'{num_in_queue}',
+                    f'{total_monitored}',
+                    f'{num_at_start}',
+                ]
+                # row_values.append(str(state.get('FAIL', 0)))
+                # row_values.append(str(state.get('SKIPPED', 0)))
+                # row_values.append(str(state.get('PENDING', 0)))
+                finished = (num_in_queue == 0)
+
+            table = Table(*header,
                           title='slurm-monitor')

-
-            # figure out how many jobs errored / passed
-
-            table.add_row(
-                f'{num_running}',
-                f'{num_in_queue}',
-                f'{total_monitored}',
-                f'{num_at_start}',
-            )
+            table.add_row(*row_values)

-            finished = (num_in_queue == 0)
             return table, finished

         try:
@@ -622,6 +848,8 @@ class SlurmQueue(base_queue.Queue):
            style (str):
                can be 'colors', 'rich', or 'plain'

+           **kwargs: extra backend-specific args passed to finalize_text
+
        CommandLine:
            xdoctest -m cmd_queue.slurm_queue SlurmQueue.print_commands

@@ -640,6 +868,82 @@ class SlurmQueue(base_queue.Queue):
     rprint = print_commands


+def parse_scontrol_output(output: str) -> dict:
+    """
+    Parses the output of `scontrol show job` into a dictionary.
+
+    Example:
+        from cmd_queue.slurm_queue import * # NOQA
+        # Example usage
+        output = ub.codeblock(
+            '''
+            JobId=307 JobName=J0002-SQ-2025 with a space 0218T165929-9a50513a
+            UserId=joncrall(1000) GroupId=joncrall(1000) MCS_label=N/A
+            Priority=1 Nice=0 Account=(null) QOS=(null)
+            JobState=COMPLETED Reason=None Dependency=(null)
+            Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
+            RunTime=00:00:10 TimeLimit=365-00:00:00 TimeMin=N/A
+            SubmitTime=2025-02-18T16:59:30 EligibleTime=2025-02-18T16:59:33
+            AccrueTime=Unknown
+            StartTime=2025-02-18T16:59:33 EndTime=2025-02-18T16:59:43 Deadline=N/A
+            SuspendTime=None SecsPreSuspend=0 LastSchedEval=2025-02-18T16:59:33 Scheduler=Backfill
+            Partition=priority AllocNode:Sid=localhost:215414
+            ReqNodeList=(null) ExcNodeList=(null)
+            NodeList=toothbrush
+            BatchHost=toothbrush
+            NumNodes=1 NumCPUs=2 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
+            ReqTRES=cpu=1,mem=120445M,node=1,billing=1
+            AllocTRES=cpu=2,node=1,billing=2
+            Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
+            MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
+            Features=(null) DelayBoot=00:00:00
+            OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
+            Command=(null)
+            WorkDir=/home/joncrall/code/cmd_queue
+            StdErr="cmd_queue/slurm/SQ-2025021 with a space 8T165929-9a50513a/logs/J0002-SQ-20250218T165929-9a50513a.sh"
+            StdIn=/dev/null
+            StdOut="slurm/SQ-20 with and = 250218T165929-9a50513a/logs/J0002-SQ-20250218T165929-9a50513a.sh"
+            Power=
+            ''')
+        parse_scontrol_output(output)
+    """
+    import re
+    # These keys should be the last key on a line. They are allowed to contain
+    # space and equal characters.
+    special_keys = [
+        'JobName', 'WorkDir', 'StdErr', 'StdIn', 'StdOut', 'Command',
+        'NodeList', 'BatchHost', 'Partition'
+    ]
+    patterns = '(' + '|'.join(f' {re.escape(k)}=' for k in special_keys) + ')'
+    pat = re.compile(patterns)
+
+    # Initialize dictionary to store parsed key-value pairs
+    parsed_data = {}
+
+    # Split the input into lines
+    for line in output.splitlines():
+        # First, check for special keys (those with spaces before the equal sign)
+        match = pat.search(line)
+        if match:
+            # Special case: Key is a special key with a space
+            startpos = match.start()
+            leading_part = line[:startpos]
+            special_part = line[startpos + 1:]
+            key, value = special_part.split('=', 1)
+            parsed_data[key] = value.strip()
+            line = leading_part
+
+        # Now, handle the general case: split by spaces and then by "="
+        line = line.strip()
+        if line:
+            parts = line.split(' ')
+            for part in parts:
+                key, value = part.split('=', 1)
+                parsed_data[key] = value
+
+    return parsed_data
+
+
 SLURM_NOTES = r"""
 This shows a few things you can do with slurm

@@ -729,4 +1033,9 @@ sbatch \
 squeue

+
+References:
+    https://stackoverflow.com/questions/74164136/slurm-accessing-stdout-stderr-location-of-a-completed-job
+
+
 """
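Taken together, the slurm_queue.py changes wire per-job tracking into the monitor: finalize_text now appends a bash snippet (via the new util_bash.bash_json_dump helper) that writes a mapping from each job's variable name to its SLURM job id into a <queue>.jobids.json file, and monitor() reads that file back, polls each id with `scontrol show job`, and classifies the result with the new parse_scontrol_output function. A minimal standalone sketch of that read-back loop, assuming a jobids file shaped like {"<job varname>": "<slurm job id>", ...} (the helper function name below is illustrative, not part of the package):

    # Sketch only: poll the job ids recorded by SlurmQueue.finalize_text.
    import json
    import ubelt as ub
    from cmd_queue.slurm_queue import parse_scontrol_output

    def poll_queue_jobids(jobid_fpath):
        """Return {job_varname: JobState} for every job recorded in the jobids file."""
        # The queue script dumps a {varname: slurm_job_id} mapping next to the .sh file.
        jobid_lut = json.loads(ub.Path(jobid_fpath).read_text())
        states = {}
        for varname, job_id in jobid_lut.items():
            # Query slurm for the current state of each submitted job.
            out = ub.cmd(f'scontrol show job "{job_id}"')
            info = parse_scontrol_output(out.stdout)
            states[varname] = info.get('JobState', 'UNKNOWN')
        return states
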
cmd_queue/slurmify.py
ADDED
@@ -0,0 +1,116 @@
+r"""
+Helper script to wrap a command with sbatch, but using a more srun like syntax.
+
+.. code:: bash
+
+    python -m cmd_queue.slurmify \
+        --jobname="my_job" \
+        --depends=None \
+        --gpus=1 \
+        --mem=16GB \
+        --cpus_per_task=5 \
+        --ntasks=1 \
+        --ntasks-per-node=1 \
+        --partition=community \
+        -- \
+        python -c 'import sys; print("hello world"); sys.exit(0)'
+"""
+#!/usr/bin/env python3
+import scriptconfig as scfg
+import ubelt as ub
+
+
+class SlurmifyCLI(scfg.DataConfig):
+    __command__ = 'slurmify'
+
+    jobname = scfg.Value(None, help='for submit, this is the name of the new job')
+    depends = scfg.Value(None, help='comma separated jobnames to depend on')
+
+    command = scfg.Value(None, type=str, position=1, nargs='*', help=ub.paragraph(
+        '''
+        Specifies the bash command to queue.
+        Care must be taken when specifying this argument. If specifying as a
+        key/value pair argument, it is important to quote and escape the bash
+        command properly. A more convinient way to specify this command is as
+        a positional argument. End all of the options to this CLI with `--` and
+        then specify your full command.
+        '''))
+
+    gpus = scfg.Value(None, help='a comma separated list of the gpu numbers to spread across. tmux backend only.')
+    workers = scfg.Value(1, help='number of concurrent queues for the tmux backend.')
+
+    mem = scfg.Value(None, help='')
+    partition = scfg.Value(1, help='slurm partition')
+
+    ntasks = scfg.Value(None, help='')
+    ntasks_per_node = scfg.Value(None, help='')
+    cpus_per_task = scfg.Value(None, help='')
+
+    @classmethod
+    def main(cls, cmdline=1, **kwargs):
+        """
+        Example:
+            >>> # xdoctest: +SKIP
+            >>> from cmd_queue.slurmify import * # NOQA
+            >>> cmdline = 0
+            >>> kwargs = dict()
+            >>> cls = SlurmifyCLI
+            >>> cls.main(cmdline=cmdline, **kwargs)
+        """
+        import rich
+        from rich.markup import escape
+        config = cls.cli(cmdline=cmdline, data=kwargs, strict=True)
+        rich.print('config = ' + escape(ub.urepr(config, nl=1)))
+
+        # import json
+        # Run a new CLI queue
+        row = {'type': 'command', 'command': config['command']}
+        if config.jobname:
+            row['name'] = config.jobname
+        if config.depends:
+            row['depends'] = config.depends
+
+        import cmd_queue
+        queue = cmd_queue.Queue.create(
+            size=max(1, config['workers']),
+            backend='slurm',
+            name='slurmified',
+            gpus=config['gpus'],
+            mem=config['mem'],
+            partition=config['partition'],
+            ntasks=config['ntasks'],
+            ntasks_per_node=config['ntasks_per_node'],
+        )
+        try:
+            bash_command = row['command']
+            if isinstance(bash_command, list):
+                if len(bash_command) == 1:
+                    # hack
+                    import shlex
+                    if shlex.quote(bash_command[0]) == bash_command[0]:
+                        bash_command = bash_command[0]
+                    else:
+                        bash_command = shlex.quote(bash_command[0])
+                else:
+                    import shlex
+                    bash_command = ' '.join([shlex.quote(str(p)) for p in bash_command])
+            submitkw = ub.udict(row) & {'name', 'depends'}
+            queue.submit(bash_command, log=False, **submitkw)
+        except Exception:
+            print('row = {}'.format(ub.urepr(row, nl=1)))
+            raise
+        queue.print_commands()
+
+        # config.cli_queue_fpath.write_text(json.dumps(row))
+        # 'sbatch --job-name="test_job1" --output="$HOME/.cache/slurm/logs/job-%j-%x.out" --wrap=""
+
+__cli__ = SlurmifyCLI
+
+if __name__ == '__main__':
+    """
+
+    CommandLine:
+        python ~/code/cmd_queue/cmd_queue/slurmify.py
+        python -m cmd_queue.slurmify
+    """
+    __cli__.main()
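The new slurmify entry point hands everything after `--` to a slurm-backed queue as a single job. Because scriptconfig delivers the positional command as a list of argv tokens, SlurmifyCLI.main shlex-quotes each token before joining them into one bash string. A small sketch of that quoting step in isolation (the helper name is illustrative, not part of the package):

    import shlex

    def join_positional_command(command):
        # Mirrors the quoting logic in SlurmifyCLI.main: a lone token that is
        # already shell-safe passes through unchanged; everything else is
        # quoted token-by-token and joined into a single bash string.
        if not isinstance(command, list):
            return command
        if len(command) == 1:
            tok = command[0]
            return tok if shlex.quote(tok) == tok else shlex.quote(tok)
        return ' '.join(shlex.quote(str(p)) for p in command)

    # ['python', '-c', 'import sys; print("hello world")'] becomes
    # python -c 'import sys; print("hello world")'
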
cmd_queue/tmux_queue.py
CHANGED
@@ -724,7 +724,7 @@ class TMUXMultiQueue(base_queue.Queue):

         CommandLine:
             xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:0
-            xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:1
+            INTERACTIVE_TEST=1 xdoctest -m cmd_queue.tmux_queue TMUXMultiQueue.monitor:1

         Example:
             >>> # xdoctest: +REQUIRES(--interact)
@@ -855,7 +855,6 @@ class TMUXMultiQueue(base_queue.Queue):
             finished &= (state['status'] == 'done')
             if state['status'] == 'done':
                 pass_color = '[green]'
-
             if (state['failed'] > 0):
                 fail_color = '[red]'
             if (state['skipped'] > 0):
@@ -1074,5 +1073,19 @@ if 0:
     tmux send -t my_session_id1 "tmux select-pane -t 3" Enter
     tmux send -t my_session_id1 "echo pane3" Enter

+    # https://stackoverflow.com/questions/54954177/how-to-write-a-tmux-script-so-that-it-automatically-split-windows-and-opens-a-se
+    # https://tmuxcheatsheet.com/
+    # https://gist.github.com/Starefossen/5955406
+
+    # List the bindings
+    tmux list-keys
+
+    # Can arange the splits in a session via a preset layout
+    # Preset layouts are:
+    # even-horizontal, even-vertical, main-horizontal, main-vertical, or tiled.
+    tmux select-layout -t "${SESSION_NAME}" even-vertical
+
+    # switch to an existing session
+    tmux switch -t "${SESSION_NAME}"

 """