bio-pipengine 0.6.0
- checksums.yaml +7 -0
- data/LICENSE.txt +20 -0
- data/README.md +631 -0
- data/VERSION +1 -0
- data/bin/pipengine +101 -0
- data/lib/bio/pipengine/job.rb +271 -0
- data/lib/bio/pipengine/sample.rb +13 -0
- data/lib/bio/pipengine/step.rb +39 -0
- data/lib/bio/pipengine.rb +234 -0
- data/lib/bio-pipengine.rb +13 -0
- metadata +167 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
0.6.0
data/bin/pipengine
ADDED
@@ -0,0 +1,101 @@
#!/usr/bin/env ruby

$:<< File.expand_path(File.join(File.dirname(File.dirname __FILE__),"lib"))
require 'bio-pipengine'

banner_text = "\nLauncher for Complex Biological Pipelines. Copyright(C) 2013 Francesco Strozzi\n\n"
version_text = File.read File.expand_path(File.join(File.dirname(File.dirname __FILE__),"VERSION"))
SUB_COMMANDS = %w(run jobs)

Bio::Pipengine.check_config

options = {}
cmd = ARGV.first # get the subcommand
opts = case cmd
  when "run"
    options[:run] = true
    ARGV.shift
    Trollop::options do
      opt :pipeline, "YAML file with pipeline and sample details", :short => "p", :type => :string, :default => "pipeline.yml"
      opt :samples_file, "YAML file with samples name and directory paths", :short => "f", :type => :string, :default => "samples.yml"
      opt :samples, "List of sample names to run the pipeline on", :type => :strings, :short => "l"
      opt :steps, "List of steps to be executed", :type => :strings, :short => "s"
      opt :dry, "Dry run. Just create the job script without submitting it to the batch system", :short => "d"
      opt :spooler, "Destination spooler: PBS or plain shell script", :short => "x", :type => :string, :default => "pbs"
      opt :tmp, "Temporary output folder", :type => :string, :short => "t"
      opt :create_samples, "Create samples.yml file from a Sample directory (only for CASAVA projects)", :short => "c", :type => :strings
      opt :multi, "List of samples to be processed by a given step (the order matters)", :short => "m", :type => :strings
      opt :group, "Specify the group of samples to run the pipeline steps on (do not specify --multi)", :short => "g", :type => :string
      opt :name, "Analysis name", :short => "n", :type => :string
      opt :output_dir, "Output directory (override standard output directory names)", :short => "o", :type => :string
      opt :pbs_opts, "PBS options", :type => :strings, :short => "b"
      opt :pbs_queue, "PBS queue", :type => :string, :short => "q"
      opt :inspect_pipeline, "Show steps", :short => "i", :type => :string
      opt :mail_exit, "Send an Email when the job terminates", :type => :string
      opt :mail_start, "Send an Email when the job starts", :type => :string
      opt :log, "Log script activities. Default is stdin; other options are syslog and fluentd", :type => :string, :default => "stdin"
      opt :log_adapter, "(stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag", :type => :string
    end
  when "jobs"
    ARGV.shift
    options[:jobs] = true
    Trollop::options do
      opt :job_id, "Search submitted jobs by Job ID", :type => :strings, :short => "i"
      opt :job_name, "Search submitted jobs by Job Name", :type => :strings, :short => "n"
      opt :delete, "Delete submitted jobs ('all' to erase everything or type one or more job IDs)", :short => "d", :type => :strings
    end
  when "-h"
    puts banner_text
    puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n\tjobs\tShow statistics and interact with running jobs\n"
    exit
  else
    global_opts = Trollop::options do
      banner banner_text
      version "PipEngine v#{version_text}"
    end
end

options = options.merge opts
Trollop::die :multi, "Specifying both --group and --multi is not allowed" if options[:multi] and options[:group]

if options[:create_samples]
  Bio::Pipengine.create_samples options[:create_samples]
elsif options[:jobs]
  if options[:job_id]
    Bio::Pipengine.show_stats(options[:job_id])
  elsif options[:job_name]
    warn "Not yet implemented"
    exit
  elsif options[:delete]
    if options[:delete].empty?
      warn "Provide one or more Job IDs or write 'all' to delete all your running jobs".red
      exit
    end
    puts "Warning: this will delete the following running jobs: ".light_blue + "#{options[:delete].join(",")}".green
    print "Are you sure? (y|n):"
    answer = gets.chomp
    if answer == "y"
      Bio::Pipengine.delete_jobs(options[:delete])
    else
      puts "Aborting..."
      exit
    end
  else
    Bio::Pipengine.show_stats(["all"])
  end
elsif options[:pipeline] && options[:samples_file]
  if options[:inspect_pipeline]
    Bio::Pipengine.inspect_steps(options[:inspect_pipeline])
    exit
  else
    abort("File not found: #{options[:pipeline]}".red) unless File.exists? options[:pipeline]
    abort("File not found: #{options[:samples_file]}".red) unless File.exists? options[:samples_file]
    abort("Please provide a valid step name with the --steps parameter".red) unless options[:steps]
    Bio::Pipengine.run(options)
  end
end
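A hypothetical invocation of this launcher (flag names are taken from the Trollop definitions above; the file and sample names are made up):

  pipengine run -p pipeline.yml -f samples.yml -s mapping -l SampleA SampleB
  pipengine run -s mapping --dry    # write the job script without submitting it
  pipengine jobs -d all             # delete all running jobs

Since -p and -f default to pipeline.yml and samples.yml, both can be omitted when those files sit in the current directory.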
data/lib/bio/pipengine/job.rb
ADDED
@@ -0,0 +1,271 @@
module Bio
  module Pipengine

    class Job

      # a Job object holds information on a job to be submitted
      # multi_samples and samples_obj are used to store information in case of steps that require
      # combining info from multiple samples
      attr_accessor :name, :cpus, :nodes, :mem, :resources, :command_line, :local,
                    :multi_samples, :samples_obj, :custom_output, :custom_name,
                    :log, :log_adapter

      def initialize(name)
        @name = generate_uuid + "-" + name
        @shortname = name
        @command_line = []
        @resources = {}
        @cpus = 1
        @nodes = "1"
        @log = "stdin"
        @log_adapter = nil
      end

      def add_resources(resources)
        self.resources.merge! resources
      end

      def output
        self.resources["output"]
      end

      # add all the command lines for a given step
      def add_step(step,sample)

        # setting job working directory
        working_dir = ""
        if self.local
          working_dir = self.local+"/"+self.name
        else
          working_dir = self.output

          if step.is_multi?
            folder = (self.custom_output) ? self.custom_output : @shortname
            working_dir += "/#{folder}"
          else
            folder =
              if self.custom_output
                self.custom_output
              elsif self.custom_name
                self.custom_name
              else
                step.name
              end
            working_dir += "/#{sample.name}/#{folder}"
          end

        end

        # set job cpus number to the highest step cpus (in case of multiple steps)
        self.cpus = step.cpus if step.cpus > self.cpus

        # set number of nodes for job
        self.nodes = (step.nodes) ? step.nodes : @nodes

        # set the memory used
        self.mem = step.mem

        # adding job working directory
        unless step.name.start_with? "_"
          self.command_line << "if [ ! -f #{working_dir}/checkpoint ]"
          self.command_line << "then"
          self.command_line << logger(step, "start")
          self.command_line << "\nmkdir -p #{working_dir}"
          self.command_line << "cd #{working_dir}"
        end

        # generate command lines for this step
        if step.run.kind_of? Array
          step.run.each_with_index do |cmd, i|
            command = generate_cmd_line(cmd,sample,step)
            # TODO verify that logger works in this case
            # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name}:#{i}\" ; exit 1; }"
            self.command_line << "#{command} || { #{logger(step, "FAILED #{i}" )}; exit 1; }"
          end
        else
          command = generate_cmd_line(step.run,sample,step)
          # TODO verify that logger works in this case
          # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name} \" ; exit 1; }"
          self.command_line << "#{command} || { #{logger(step, "FAILED" )}; exit 1; }"
        end
        self.command_line << logger(step, "finished")
        self.command_line << "touch #{working_dir}/checkpoint"
        self.command_line << "else"
        self.command_line << logger(step, "already executed, skipping this step")
        self.command_line << "fi"

        # check if a temporary (i.e. different from 'output') directory is set
        if self.local
          final_output = ""

          if step.is_multi?
            folder = (self.custom_output) ? self.custom_output : @shortname
            final_output = self.output+"/#{folder}"
          else
            folder = (self.custom_output) ? self.custom_output : step.name
            final_output = self.output+"/#{sample.name}/#{folder}"
          end

          self.command_line << "mkdir -p #{final_output}"
          self.command_line << "cp -r #{working_dir}/* #{final_output}"
          self.command_line << "rm -fr #{working_dir}"
        end

      end

      # convert the job object into a TORQUE::Qsub object
      def to_pbs(options)
        TORQUE::Qsub.new(options) do |torque_job|
          torque_job.name = self.name
          torque_job.working_directory = self.output # where pbs scripts and stdout / stderr files will be saved
          if options[:pbs_opts]
            torque_job.l = options[:pbs_opts]
          else
            l_string = []
            l_string << "nodes=#{self.nodes}:ppn=#{self.cpus}"
            l_string << "mem=#{self.mem}" if self.mem
            torque_job.l = l_string
            if options[:mail_exit]
              torque_job.m = "e"
              torque_job.M = options[:mail_exit]
            end
            if options[:mail_start]
              torque_job.m = "b"
              torque_job.M = options[:mail_start]
            end
          end
          torque_job.q = options[:pbs_queue] if options[:pbs_queue]
          torque_job.script = self.command_line.join("\n")+"\n"
        end
      end

      def to_script(options)
        File.open(self.name+'.sh','w') do |file|
          file.puts "#!/usr/bin/env bash -l"
          file.puts self.command_line.join("\n")
        end
      end

      private

      # create a unique ID for each job
      def generate_uuid
        SecureRandom.hex(5)
      end

      # this method calls other methods to perform the right substitutions into the command lines
      def generate_cmd_line(cmd,sample,step)
        if step.is_multi? # if it is a multi-samples step, call a different method
          set_multi_cmd(step,self.multi_samples)
          cmd = sub_multi(cmd,step)
        else
          cmd = sub_placeholders(cmd,sample,step) # normal step, perform usual substitutions
        end
        return cmd
      end

      # perform substitutions on all the placeholders
      def sub_placeholders(cmd,sample,step=nil)
        tmp_cmd = cmd.gsub(/<sample>/,sample.name)
        if tmp_cmd =~/<sample_path>/
          sample_path_glob = (tmp_cmd.scan(/<sample_path>(\S+)/).map {|e| e.first})
          if sample_path_glob.empty?
            tmp_cmd.gsub!(/<sample_path>/,sample.path.join("\s"))
          else
            sample_path_glob.each do |append|
              tmp_cmd.gsub!(/<sample_path>#{Regexp.quote(append)}/,(sample.path.map {|s| s+append}).join("\s"))
            end
          end
        end
        # for resources and cpus
        tmp_cmd = sub_resources_and_cpu(tmp_cmd,step)

        # for placeholders like <mapping/sample>
        tmp_cmd.scan(/<(\S+)\/sample>/).map {|e| e.first}.each do |input_folder|
          warn "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder
          tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/sample>/,self.output+"/"+sample.name+"/"+input_folder+"/"+sample.name)
        end

        # for placeholders like <mapping/>
        tmp_cmd.scan(/<(\S+)\/>/).map {|e| e.first}.each do |input_folder|
          warn "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder
          tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/>/,self.output+"/"+sample.name+"/"+input_folder+"/")
        end
        return tmp_cmd
      end

      def sub_resources_and_cpu(cmd,step)
        # for all resources tags like <gtf> <index> <genome> <bwa> etc.
        self.resources.each_key do |r|
          cmd.gsub!(/<#{r}>/,self.resources[r])
        end
        # set number of cpus for this command line
        cmd.gsub!(/<cpu>/,step.cpus.to_s) unless step.nil?
        return cmd
      end

      # creates actual multi-samples command lines to be substituted where <multi> placeholders are found
      def set_multi_cmd(step,multi_samples)
        if step.multi_def.kind_of? Array # in case of many multi-samples command lines
          step.multi_cmd = []
          step.multi_def.each do |m_def|
            step.multi_cmd << generate_multi_cmd(m_def,multi_samples)
          end
        else
          step.multi_cmd = generate_multi_cmd(step.multi_def,multi_samples)
        end
      end

      # take the multi_cmd and perform the substitutions into the step command lines
      def sub_multi(cmd,step)
        cmd = sub_resources_and_cpu(cmd,step)
        if step.multi_cmd.kind_of? Array
          step.multi_cmd.each_with_index do |m,index|
            cmd.gsub!(/<multi#{index+1}>/,m)
          end
        else
          cmd.gsub!(/<multi>/,step.multi_cmd)
        end
        return cmd
      end

      # this method handles different multi-samples definitions (comma separated or semicolon separated lists etc.)
      def generate_multi_cmd(multi_def,multi_samples)
        multi_cmd = []
        multi_samples.each do |sample_name|
          if sample_name.include? ","
            multi_cmd << split_and_sub(",",multi_def,sample_name)
          elsif sample_name.include? ";"
            multi_cmd << split_and_sub(";",multi_def,sample_name)
          else
            multi_cmd << sub_placeholders(multi_def,self.samples_obj[sample_name])
          end
        end
        return multi_cmd.join("\s")
      end

      # take a non-space separated list of samples and perform the substitution with the group definitions
      def split_and_sub(sep,multi_def,multi)
        cmd_line = []
        multi.split(sep).each do |sample_name|
          cmd_line << sub_placeholders(multi_def,self.samples_obj[sample_name])
        end
        cmd_line.join(sep)
      end

      # log a step according to the selected adapter
      def logger(step, message)
        case self.log
          when "stdin"
            "echo \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd` `date`.\""
          when "syslog"
            "logger -t PIPENGINE \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd`\""
          when "fluentd"
            "curl -X POST -d 'json={\"source\":\"PIPENGINE\", \"step\":\"#{step.name}\", \"message\":\"#{message}\", \"job_id\":\"#{name}\", \"user\":\"\'\"`whoami`\"\'\", \"host\":\"\'\"`hostname`\"\'\", \"pwd\":\"\'\"`pwd`\"\'\"}' #{self.log_adapter}"
        end
      end #logger

    end
  end
end
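A sketch of how the placeholder substitution above behaves (the resource value, paths and command are hypothetical; <cpu> assumes a step declaring cpu: 8):

  sample = Bio::Pipengine::Sample.new("S1", "/data/run1/S1,/data/run2/S1")
  cmd = "bwa mem -t <cpu> <genome> <sample_path>_R1.fastq.gz > <sample>.sam"
  # with the job resources containing "genome" => "/db/hg19.fa", sub_placeholders yields:
  # "bwa mem -t 8 /db/hg19.fa /data/run1/S1_R1.fastq.gz /data/run2/S1_R1.fastq.gz > S1.sam"
  # <sample> is replaced by the sample name, <genome> by the matching resources entry, and
  # the non-space suffix after <sample_path> is appended to every sample path before joining
  # them with spaces.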
data/lib/bio/pipengine/sample.rb
ADDED
@@ -0,0 +1,13 @@
module Bio
  module Pipengine
    class Sample
      # Sample holds all the information on a sample and its original input path (or multiple paths)
      attr_accessor :path, :name
      def initialize(name,path_string)
        @path = path_string.split(",")
        @name = name
      end
    end
  end
end
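For example, a comma separated path string is split into multiple input paths (hypothetical paths):

  s = Bio::Pipengine::Sample.new("S1", "/seq/run1/S1,/seq/run2/S1")
  s.name # => "S1"
  s.path # => ["/seq/run1/S1", "/seq/run2/S1"]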
data/lib/bio/pipengine/step.rb
ADDED
@@ -0,0 +1,39 @@
module Bio
  module Pipengine

    # Step holds information for a pipeline step
    # multi_def is used to store the multi-samples definition (i.e. a generic cmd line with placeholders)
    # multi_cmd is used to store the actual command lines for all the samples to be combined in a multi-samples step
    # these are generated by combining multi_def information with sample group information and will be placed
    # where the <multi> placeholder is found in the step command lines.
    class Step
      attr_accessor :name, :run, :cpus, :mem, :nodes, :multi_def, :multi_cmd, :pre
      def initialize(name,step_instructions)
        @name = name
        parse_yaml(step_instructions)
      end

      def is_multi?
        return (self.multi_def.nil?) ? false : true
      end

      def has_prerequisite?
        return (self.pre.nil?) ? false : true
      end

      private

      def parse_yaml(step_instructions)
        self.cpus = step_instructions["cpu"].to_i
        self.nodes = step_instructions["nodes"]
        self.mem = step_instructions["mem"]
        self.run = step_instructions["run"]
        self.multi_def = step_instructions["multi"]
        self.pre = step_instructions["pre"]
      end

    end

  end
end
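A minimal sketch of the pipeline YAML these classes consume (step, tool and resource names are hypothetical; cpu, nodes, mem, run, multi and pre are the keys read by parse_yaml above, while desc is displayed by the --inspect_pipeline option):

  pipeline: resequencing

  resources:
    genome: /db/hg19.fa

  steps:
    mapping:
      desc: Align reads against the reference
      cpu: 8
      run: bwa mem -t <cpu> <genome> <sample_path> | samtools view -b - > <sample>.bam
    merge:
      desc: Merge the mapped samples
      pre: mapping
      multi: <mapping/sample>.bam
      run: samtools merge merged.bam <multi>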
data/lib/bio/pipengine.rb
ADDED
@@ -0,0 +1,234 @@
module Bio
  module Pipengine

    def self.run(options)

      # reading the yaml files
      pipeline = YAML.load_file options[:pipeline]
      samples_file = YAML.load_file options[:samples_file]
      samples_file["samples"].each do |k,v|
        if v.kind_of? Hash
          samples_file["samples"][k] = Hash[samples_file["samples"][k].map{ |key, value| [key.to_s, value.to_s] }]
        else
          samples_file["samples"][k] = v.to_s
        end
      end
      # make sure everything in Samples and Resources is converted to string
      #samples_file["samples"] = Hash[samples_file["samples"].map{ |key, value| [key.to_s, value.to_s] }]
      samples_file["resources"] = Hash[samples_file["resources"].map {|k,v| [k.to_s, v.to_s]}]

      # pre-running checks
      check_steps(options[:steps],pipeline)
      check_samples(options[:samples],samples_file) if options[:samples]

      # list of samples the jobs will work on
      samples_list = nil
      # check if a group is specified
      if options[:group]
        samples_list = options[:samples] ? samples_file["samples"][options[:group]].select {|k,v| options[:samples].include? k} : samples_file["samples"][options[:group]]
        options[:multi] = samples_list.keys
        samples_file["resources"]["output"] << "/#{options[:group]}"
      else # if not, proceed normalizing the sample list to remove groups and get a list of all samples
        full_list_samples = {}
        samples_file["samples"].each_key do |k|
          if samples_file["samples"][k].kind_of? Hash
            full_list_samples.merge! samples_file["samples"][k]
          else
            full_list_samples[k] = samples_file["samples"][k]
          end
        end
        samples_list = options[:samples] ? full_list_samples.select {|k,v| options[:samples].include? k} : full_list_samples
      end

      ########### START ###########

      # create output directory (jobs scripts will be saved there)
      FileUtils.mkdir_p samples_file["resources"]["output"] unless options[:dry] #&& options[:spooler]!="pbs"

      # check if the requested steps are multi-samples
      run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options)

      unless run_multi # there are no multi-samples steps, so iterate on samples and create one job per sample
        samples_list.each_key do |sample_name|
          sample = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name])
          create_job(samples_file,pipeline,samples_list,options,sample)
        end
      end
    end

    # handle steps that run on multiple samples (i.e. sample groups job)
    def self.check_and_run_multi(samples_file,pipeline,samples_list,options)
      step_multi = options[:steps].map {|s| Bio::Pipengine::Step.new(s,pipeline["steps"][s]).is_multi?}

      if step_multi.include? false
        if step_multi.uniq.size > 1
          puts "\nAbort! You are trying to run both multi-samples and single-sample steps in the same job".red
          exit
        else
          return false
        end
      else
        samples_obj = {}
        samples_list.each_key {|sample_name| samples_obj[sample_name] = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name])}
        create_job(samples_file,pipeline,samples_list,options,samples_obj)
        return true
      end
    end

    def self.create_job(samples_file,pipeline,samples_list,options,sample)
      # getting the sample name (only if this is not a multi-samples job)
      sample_name = (sample.kind_of? Hash) ? nil : sample.name+"-"
      # setting the job name
      job_name = nil
      if options[:name]
        job_name = options[:name]
      elsif options[:steps].size > 1
        job_name = "#{sample_name}#{options[:steps].join("-")}"
      else
        job_name = "#{sample_name}#{options[:steps].first}"
      end
      # creating the Job object
      job = Bio::Pipengine::Job.new(job_name)
      job.local = options[:tmp]
      job.custom_output = options[:output_dir]
      job.custom_name = (options[:name]) ? options[:name] : nil
      job.add_resources pipeline["resources"]
      job.add_resources samples_file["resources"]
      # setting the logging system
      job.log = options[:log]
      job.log_adapter = options[:log_adapter]
      # setting sample groups either by cli option (if present) or by taking all available samples
      job.multi_samples = (options[:multi]) ? options[:multi] : samples_list.keys
      job.samples_obj = sample if sample.kind_of? Hash
      # cycling through steps and adding command lines to the job
      options[:steps].each do |step_name|
        # TODO WARNING this can add the same step multiple times if there are multiple dependencies
        self.add_job(job, pipeline, step_name, sample)
      end

      if options[:dry] #&& options[:spooler] == "script"
        job.to_script(options)
      else
        script = job.to_pbs(options) # converting the Job into a TORQUE::Qsub PBS compatible object
        job_id = script.submit(options)
        puts "#{job_id}".green unless options[:dry]
      end
    end

    # check if sample exists
    def self.check_samples(passed_samples,samples)
      passed_samples.each do |sample|
        samples_names = []
        samples["samples"].each_key do |k|
          if samples["samples"][k].kind_of? Hash
            samples["samples"][k].each_key {|s| samples_names << s}
          else
            samples_names << k
          end
        end
        unless samples_names.include? sample
          puts "Sample \"#{sample}\" does not exist in sample file!".red
          exit
        end
      end
    end

    # check if step exists
    def self.check_steps(passed_steps,pipeline)
      passed_steps.each do |step|
        unless pipeline["steps"].keys.include? step
          puts "Step \"#{step}\" does not exist in pipeline file!".red
          exit
        end
      end
    end

    # load the pipeline file and show a list of available steps
    def self.inspect_steps(pipeline_file)
      pipeline = YAML.load_file pipeline_file
      print "\nPipeline: ".blue
      print "#{pipeline["pipeline"]}\n\n".green
      puts "List of available steps:".light_blue
      pipeline["steps"].each_key do |s|
        print "\s\s#{s}:\s\s".blue
        print "#{pipeline["steps"][s]["desc"]}\n".green
      end
      puts "\n"
    end

    # create the samples.yml file (CASAVA ONLY!)
    def self.create_samples(dir)
      File.open("samples.yml","w") do |file|
        file.write "resources:\n\soutput: #{FileUtils.pwd}\n\nsamples:\n"
        samples = Hash.new {|hash,key| hash[key] = []}
        dir.each do |path|
          projects = Dir.glob(path+"/*").sort.select {|folders| folders.split("/")[-1] =~/Project_/}
          unless projects.empty?
            projects.each do |project_folder|
              Dir.glob(project_folder+"/*").sort.each {|s| samples[s.split("/")[-1]] << s}
            end
          else
            Dir.glob(path+"/*").sort.each {|s| samples[s.split("/")[-1]] << s if Dir.exists? s}
          end
        end
        samples.each_key do |sample|
          file.write "\s"+sample+":\s"+samples[sample].join(",")+"\n"
        end
      end
    end

    # show running jobs information
    def self.show_stats(job_ids)
      stats = TORQUE::Qstat.new
      if job_ids.first == "all"
        stats.display
      else
        stats.display(:job_ids => job_ids)
      end
    end

    # delete running jobs from the scheduler
    def self.delete_jobs(job_ids)
      include TORQUE
      if job_ids == ["all"]
        Qdel.rm_all
      else
        job_ids.each {|job_id| Qdel.rm job_id}
      end
    end #delete_jobs

    # check if required configuration exists
    def self.check_config
      unless File.exists?("#{Dir.home}/.torque_rm.yaml")
        ARGV.clear
        current_user = Etc.getlogin
        puts "\nIt seems you are running PipEngine for the first time. Please fill in the following information:"
        print "\nHostname or IP address of authorized server from where jobs will be submitted: ".light_blue
        server = gets.chomp
        print "\n"
        print "Specify the username you will be using to connect and submit jobs [#{current_user}]: ".light_blue
        username = gets.chomp
        username = (username == "") ? current_user : username
        puts "Attempting connection to the server...".green
        path = `ssh #{username}@#{server} -t "which qsub"`.split("/qsub").first
        unless path=~/\/\S+\/\S+/
          warn "Connection problems detected! Please check that you are able to connect to '#{server}' as '#{username}' via ssh.".red
        else
          file = File.open("#{Dir.home}/.torque_rm.yaml","w")
          file.write({:hostname => server, :path => path, :user => username}.to_yaml)
          file.close
          puts "First time configuration completed!".green
          puts "It is strongly recommended to set up a password-less SSH connection to use PipEngine.".green
          exit
        end
      end
    end #check_config

    def self.add_job(job, pipeline, step_name, sample)
      step = Bio::Pipengine::Step.new(step_name,pipeline["steps"][step_name]) # parsing step instructions
      self.add_job(job, pipeline, step.pre, sample) if step.has_prerequisite?
      job.add_step(step,sample) # adding step command lines to the job
    end #add_job

  end
end
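A minimal sketch of a samples.yml in the shape this module consumes (sample names and paths are hypothetical; the one-space indentation matches what create_samples writes, and the nested form defines a sample group selectable with --group):

  resources:
   output: /projects/resequencing

  samples:
   groupA:
    S1: /data/runs/S1
    S2: /data/runs/S2
   S3: /data/runs/S3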