rbbt-util 5.28.12 → 5.29.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
+ options = SOPT.setup <<EOF
9
+
10
+ Queue a job in Marenostrum
11
+
12
+ $ rbbt mnl [options]
13
+
14
+ -h--help Print this help
15
+ -d--done Done jobs only
16
+ -e--error Error jobs only
17
+ -a--aborted SLURM aboted jobs
18
+ -r--running Running jobs only
19
+ -q--queued Queued jobs only
20
+ -j--job* Job ids
21
+ -s--search* Regular expression
22
+ -t--tail* Show the last lines of the STDERR
23
+ -SBP--sbatch_parameters show sbatch parameters
24
+ EOF
25
+
26
+ if options[:help]
27
+ if defined? rbbt_usage
28
+ rbbt_usage
29
+ else
30
+ puts SOPT.doc
31
+ end
32
+ exit 0
33
+ end
34
+
35
+ Log.severity = 4
36
+ done, error, running, queued, aborted, jobid, search, tail = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail
37
+
38
+ workdir = File.expand_path('~/rbbt-slurm')
39
+ Path.setup(workdir)
40
+
41
+ running_jobs = begin
42
+ squeue_txt = CMD.cmd('squeue').read
43
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
44
+ rescue
45
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
46
+ squeue_txt = nil
47
+ $norunningjobs = true
48
+ []
49
+ end
50
+
51
+ if squeue_txt
52
+ job_nodes = {}
53
+ squeue_txt.split("\n").each do |line|
54
+ parts = line.strip.split(/\s+/)
55
+ job_nodes[parts.first] = parts.last.split(",")
56
+ end
57
+ else
58
+ job_nodes = nil
59
+ end
60
+
61
+ count = 0
62
+ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
63
+ dir = File.dirname(fcmd)
64
+
65
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
66
+ cmd = m[1]
67
+ else
68
+ cmd = nil
69
+ end
70
+
71
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
72
+ exe = m[1]
73
+ else
74
+ exe = nil
75
+ end
76
+
77
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
78
+ container_home = m[1]
79
+ else
80
+ container_home = nil
81
+ end
82
+
83
+
84
+ if File.exists?(fid = File.join(dir, 'job.id'))
85
+ id = Open.read(fid).chomp
86
+ else
87
+ id = nil
88
+ end
89
+
90
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
91
+ exit_status = Open.read(fstatus).to_i
92
+ else
93
+ exit_status = nil
94
+ end
95
+
96
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
97
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
98
+ elsif job_nodes[id]
99
+ nodes = job_nodes[id]
100
+ else
101
+ nodes = []
102
+ end
103
+
104
+ if File.exists?(File.join(dir, 'std.out'))
105
+ outt = File.mtime File.join(dir, 'std.out')
106
+ errt = File.mtime File.join(dir, 'std.err')
107
+ time_diff = Time.now - [outt, errt].max
108
+ end
109
+
110
+ fdep = File.join(dir, 'dependencies.list')
111
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
112
+
113
+ fcadep = File.join(dir, 'canfail_dependencies.list')
114
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
115
+
116
+ if done || error || aborted || running || queued || jobid || search
117
+ select = false
118
+ select = true if done && exit_status == 0
119
+ select = true if error && exit_status && exit_status != 0
120
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
121
+ select = true if queued && deps && (running_jobs & deps).any?
122
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
123
+ select = true if jobid && jobid.split(",").include?(id)
124
+ select = true if search && cmd.match(/#{search}/)
125
+ next unless select
126
+ end
127
+
128
+
129
+ puts Log.color :blue, dir
130
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
131
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
132
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
133
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
134
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
135
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
136
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
137
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
138
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
139
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
140
+
141
+ if options[:sbatch_parameters]
142
+ puts Log.color(:magenta, "SBATCH parameters: ")
143
+ puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
144
+ end
145
+
146
+ if tail && File.exists?(File.join(dir, 'std.err'))
147
+ if exit_status && exit_status != 0
148
+ puts Log.color(:magenta, "First error or exception found: ")
149
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
150
+ elsif exit_status
151
+ puts Log.color(:magenta, "Completed jobs: ")
152
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
153
+ else
154
+ puts Log.color(:magenta, "Log tail: ")
155
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
156
+ end
157
+ end
158
+
159
+ count += 1
160
+
161
+ end
162
+
163
+ puts
164
+ puts "Found #{count} jobs"
165
+
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/util/simpleopt'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/workflow/usage'
6
+ require 'rbbt/hpc'
7
+ require 'rbbt/hpc/orchestrate'
8
+ require 'time'
9
+
10
+ $slurm_options = SOPT.get <<EOF
11
+ -dr--dry_run Print only the template
12
+ -cj--clean_job Clean job
13
+ --drbbt* Use development version of rbbt
14
+ -sing--singularity Use Singularity
15
+ -ug--user_group* Use alternative user group for group project directory
16
+ -c--contain* Contain in directory (using Singularity)
17
+ -s--sync* Contain in directory and sync jobs
18
+ -e--exclusive Make exclusive use of the node
19
+ -hm--highmem Make use of highmem cores
20
+ -wc--wipe_container* Wipe the jobs from the contain directory
21
+ -CS--contain_and_sync Contain and sync to default locations
22
+ -ci--copy_image When using a container directory, copy image there
23
+ -t--tail Tail the logs
24
+ -q--queue* Queue
25
+ -t--task_cpus* Tasks
26
+ -W--workflows* Additional workflows
27
+ -tm--time* Time
28
+ -OR--orchestration_rules* Orchestration rules
29
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
+ EOF
31
+
32
+ class Step
33
+ def run(*args)
34
+ if done?
35
+ self.load
36
+ else
37
+ begin
38
+ Log.debug "Issuing SLURM job for #{self.path}"
39
+ HPC::SLURM.orchestrate_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
40
+ rescue HPC::SBATCH
41
+ end
42
+ end
43
+ end
44
+ end
45
+
46
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
47
+ load Rbbt.share.rbbt_commands.workflow.task.find
@@ -9,8 +9,9 @@ require 'time'
9
9
  $slurm_options = SOPT.get <<EOF
10
10
  -dr--dry_run Print only the template
11
11
  -cj--clean_job Clean job
12
- --drbbt Use development version of rbbt
12
+ --drbbt* Use development version of rbbt
13
13
  -sing--singularity Use Singularity
14
+ -ug--user_group* Use alternative user group for group project directory
14
15
  -c--contain* Contain in directory (using Singularity)
15
16
  -s--sync* Contain in directory and sync jobs
16
17
  -e--exclusive Make exclusive use of the node
@@ -21,8 +22,9 @@ $slurm_options = SOPT.get <<EOF
21
22
  -t--tail Tail the logs
22
23
  -q--queue* Queue
23
24
  -t--task_cpus* Tasks
25
+ -W--workflows* Additional workflows
24
26
  -tm--time* Time
25
- -S--server* SLURM login node
27
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
26
28
  EOF
27
29
 
28
30
  class Step
@@ -30,9 +32,14 @@ class Step
30
32
  if done?
31
33
  self.load
32
34
  else
33
- Marenostrum::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
35
+ begin
36
+ Log.debug "Issuing SLURM job for #{self.path}"
37
+ HPC::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
38
+ rescue HPC::SBATCH
39
+ end
34
40
  end
35
41
  end
36
42
  end
37
43
 
44
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
38
45
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
20
20
  puts
21
21
  if workflow.nil?
22
22
  puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
23
- exit -1
23
+ exit! -1
24
24
  end
25
25
 
26
26
  if task.nil?
@@ -206,7 +206,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
206
206
  EOF
207
207
 
208
208
  workflow = ARGV.shift
209
- usage and exit -1 if workflow.nil?
209
+ usage and exit! -1 if workflow.nil?
210
210
 
211
211
  task = ARGV.shift
212
212
 
@@ -232,7 +232,8 @@ else
232
232
  remote_workflows = {}
233
233
  end
234
234
 
235
- Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
235
+ #Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
236
+ Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
236
237
 
237
238
  workflow = Workflow.require_workflow workflow
238
239
 
@@ -486,7 +487,7 @@ rescue ParameterException
486
487
  puts
487
488
  report_options saved_job_options
488
489
  puts
489
- exit -1
490
+ exit! -1
490
491
  end
491
492
 
492
493
  if options.delete(:list_job_files)
@@ -538,7 +539,7 @@ when Step
538
539
  io.abort if io.respond_to? :abort
539
540
  io.join if io.respond_to? :join
540
541
  ensure
541
- exit -1
542
+ exit! -1
542
543
  end
543
544
  rescue Exception
544
545
  Log.exception $!
@@ -547,9 +548,11 @@ when Step
547
548
  io.abort if io.respond_to? :abort
548
549
  io.join if io.respond_to? :join
549
550
  ensure
550
- exit -1
551
+ exit! -1
551
552
  end
552
553
  end
554
+ elsif detach
555
+ exit! 0
553
556
  else
554
557
  res.join
555
558
  out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
@@ -386,9 +386,9 @@ class TestWorkflow < Test::Unit::TestCase
386
386
  job.run
387
387
  Misc.with_env "RBBT_UPDATE", 'true' do
388
388
  assert job.checks.select{|d| d.task_name.to_s == "t1" }.any?
389
- job = TestWF.job(:t3)
390
- job.step(:t1).clean
391
- assert job.checks.select{|d| d.task_name.to_s == "t1" }.empty?
389
+ #job = TestWF.job(:t3)
390
+ #job.step(:t1).clean
391
+ #assert job.checks.select{|d| d.task_name.to_s == "t1" }.empty?
392
392
  job = TestWF.job(:t3).recursive_clean
393
393
  job.run
394
394
  assert job.checks.select{|d| d.task_name.to_s == "t1" }.any?
@@ -427,7 +427,7 @@ class TestWorkflow < Test::Unit::TestCase
427
427
  TmpFile.with_file do |dir|
428
428
  Path.setup(dir)
429
429
  Step.save_job_inputs(job, dir)
430
- assert_equal Dir.glob(dir + "/*"), [dir.file.find + '.read']
430
+ assert_equal Dir.glob(dir + "/*"), [dir.file.find + '.yaml']
431
431
  inputs = Workflow.load_inputs(dir, [:file], :file => :file)
432
432
  assert_equal inputs, {:file => 'code'}
433
433
  end
@@ -455,9 +455,9 @@ class TestWorkflow < Test::Unit::TestCase
455
455
  job.run
456
456
  Misc.with_env "RBBT_UPDATE", 'true' do
457
457
  assert job.checks.select{|d| d.task_name.to_s == "t1" }.any?
458
- job = TestWF.job(:t3)
459
- job.step(:t1).clean
460
- assert job.checks.select{|d| d.task_name.to_s == "t1" }.empty?
458
+ #job = TestWF.job(:t3)
459
+ #job.step(:t1).clean
460
+ #assert job.checks.select{|d| d.task_name.to_s == "t1" }.empty?
461
461
  job = TestWF.job(:t3).recursive_clean
462
462
  job.run
463
463
  assert job.checks.select{|d| d.task_name.to_s == "t1" }.any?
@@ -566,7 +566,7 @@ row2,CC
566
566
  tsv3.keys.each{|k| tsv3[k] = nil if tsv3[k] == ""}
567
567
  end
568
568
 
569
- assert_equal tsv1.attach(tsv2, :complete => true).attach(tsv3, :complete => true)["row1"], [nil, "B", nil]
569
+ assert_equal [nil, "B", nil], tsv1.attach(tsv2, :complete => true).attach(tsv3, :complete => true)["row1"]
570
570
  end
571
571
 
572
572
  def test_attach_index_both_non_key
@@ -597,10 +597,10 @@ A Id3
597
597
 
598
598
  tsv1 = tsv2 = nil
599
599
 
600
- tsv1 = Rbbt.tmp.test.test1.data.tsv :double, :sep => /\s+/
601
- tsv2 = Rbbt.tmp.test.test2.data.tsv :double, :sep => /\s+/
600
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
601
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
602
602
 
603
- tsv2.identifiers = Rbbt.tmp.test.test2.identifiers.produce.find #.to_s
603
+ tsv2.identifiers = Rbbt.tmp.test.test2.identifiers.produce(true).produce.find #.to_s
604
604
 
605
605
  tsv1.attach tsv2, :fields => ["ValueE"] #, :persist_input => true
606
606
  Log.tsv tsv1
@@ -627,12 +627,92 @@ E B
627
627
 
628
628
  tsv1 = tsv2 = nil
629
629
 
630
- tsv1 = Rbbt.tmp.test.test1.data.tsv :double, :sep => /\s+/
631
- tsv2 = Rbbt.tmp.test.test2.data.tsv :double, :sep => /\s+/
630
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
631
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
632
632
 
633
633
  tsv1.attach tsv2, :fields => ["ValueE"] #, :persist_input => true
634
634
  Log.tsv tsv1
635
635
 
636
636
  end
637
+
638
+ def test_attach_complete
639
+ content1 =<<-EOF
640
+ #: :sep=/\\s+/
641
+ #Id ValueA
642
+ row1 a|aa|aaa
643
+ row2 A
644
+ EOF
645
+
646
+ content2 =<<-EOF
647
+ #: :sep=/\\s+/
648
+ #Id ValueB
649
+ row1 b
650
+ row3 C
651
+ EOF
652
+ Rbbt.claim Rbbt.tmp.test.test1.data, :string, content1
653
+ Rbbt.claim Rbbt.tmp.test.test2.data, :string, content2
654
+
655
+ tsv1 = tsv2 = nil
656
+
657
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
658
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
659
+
660
+ tsv1.attach tsv2, :complete => true
661
+ assert_equal [[], ["C"]], tsv1["row3"]
662
+
663
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
664
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
665
+
666
+ ppp tsv1.attach tsv2, :complete => ["AA"]
667
+ tsv1.attach tsv2, :complete => ["AA"]
668
+ assert_equal [["AA"], ["C"]], tsv1["row3"]
669
+ end
670
+
671
+ def test_attach_complete_identifiers
672
+ content1 =<<-EOF
673
+ #: :sep=/\\s+/
674
+ #Id ValueA
675
+ row1 a|aa|aaa
676
+ row2 A
677
+ EOF
678
+
679
+ content2 =<<-EOF
680
+ #: :sep=/\\s+/
681
+ #Id2 ValueB
682
+ ROW_1 b
683
+ ROW_2 C
684
+ EOF
685
+
686
+ identifiers =<<-EOF
687
+ #: :sep=/\\s+/
688
+ #Id Id2
689
+ row1 ROW_1
690
+ row2 ROW_2
691
+ row3 ROW_3
692
+ EOF
693
+ Rbbt.claim Rbbt.tmp.test.test1.data, :string, content1
694
+ Rbbt.claim Rbbt.tmp.test.test2.data, :string, content2
695
+ Rbbt.claim Rbbt.tmp.test.identifiers.data, :string, identifiers
696
+
697
+ tsv1 = tsv2 = nil
698
+
699
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
700
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
701
+ ids = Rbbt.tmp.test.identifiers.data.produce(true).tsv :double, :sep => /\s+/
702
+
703
+ tsv1.identifiers = ids
704
+
705
+ tsv1.attach tsv2
706
+ assert_equal [["A"], ["C"]], tsv1["row2"]
707
+
708
+ tsv1 = Rbbt.tmp.test.test1.data.produce(true).tsv :double, :sep => /\s+/
709
+ tsv2 = Rbbt.tmp.test.test2.data.produce(true).tsv :double, :sep => /\s+/
710
+ ids = Rbbt.tmp.test.identifiers.data.produce(true).tsv :double, :sep => /\s+/
711
+
712
+ tsv1.identifiers = ids
713
+
714
+ tsv1.attach tsv2, :complete => true
715
+ assert_equal [["A"], ["C"]], tsv1["row2"]
716
+ end
637
717
  end
638
718