bio-pipengine 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +20 -0
- data/README.md +631 -0
- data/VERSION +1 -0
- data/bin/pipengine +101 -0
- data/lib/bio/pipengine/job.rb +271 -0
- data/lib/bio/pipengine/sample.rb +13 -0
- data/lib/bio/pipengine/step.rb +39 -0
- data/lib/bio/pipengine.rb +234 -0
- data/lib/bio-pipengine.rb +13 -0
- metadata +167 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
0.6.0
data/bin/pipengine
ADDED
@@ -0,0 +1,101 @@
#!/usr/bin/env ruby

$:<< File.expand_path(File.join(File.dirname(File.dirname __FILE__),"lib"))
require 'bio-pipengine'

banner_text = "\nLauncher for Complex Biological Pipelines. Copyright(C) 2013 Francesco Strozzi\n\n"
version_text = File.read File.expand_path(File.join(File.dirname(File.dirname __FILE__),"VERSION"))
SUB_COMMANDS = %w(run jobs)

Bio::Pipengine.check_config

options = {}
cmd = ARGV.first # get the subcommand
opts = case cmd
  when "run"
    options[:run] = true
    ARGV.shift
    Trollop::options do
      opt :pipeline, "YAML file with pipeline and sample details", :short => "p", :type => :string, :default => "pipeline.yml"
      opt :samples_file, "YAML file with sample names and directory paths", :short => "f", :type => :string, :default => "samples.yml"
      opt :samples, "List of sample names to run the pipeline on", :type => :strings, :short => "l"
      opt :steps, "List of steps to be executed", :type => :strings, :short => "s"
      opt :dry, "Dry run. Just create the job script without submitting it to the batch system", :short => "d"
      opt :spooler, "Destination spooler: PBS or plain shell script", :short => "x", :type => :string, :default => "pbs"
      opt :tmp, "Temporary output folder", :type => :string, :short => "t"
      opt :create_samples, "Create samples.yml file from a Sample directory (only for CASAVA projects)", :short => "c", :type => :strings
      opt :multi, "List of samples to be processed by a given step (the order matters)", :short => "m", :type => :strings
      opt :group, "Specify the group of samples to run the pipeline steps on (do not specify --multi)", :short => "g", :type => :string
      opt :name, "Analysis name", :short => "n", :type => :string
      opt :output_dir, "Output directory (overrides standard output directory names)", :short => "o", :type => :string
      opt :pbs_opts, "PBS options", :type => :strings, :short => "b"
      opt :pbs_queue, "PBS queue", :type => :string, :short => "q"
      opt :inspect_pipeline, "Show steps", :short => "i", :type => :string
      opt :mail_exit, "Send an Email when the job terminates", :type => :string
      opt :mail_start, "Send an Email when the job starts", :type => :string
      opt :log, "Log script activities. Options are stdin (default), syslog or fluentd", :type => :string, :default => "stdin"
      opt :log_adapter, "(stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag", :type => :string
    end
  when "jobs"
    ARGV.shift
    options[:jobs] = true
    Trollop::options do
      opt :job_id, "Search submitted jobs by Job ID", :type => :strings, :short => "i"
      opt :job_name, "Search submitted jobs by Job Name", :type => :strings, :short => "n"
      opt :delete, "Delete submitted jobs ('all' to erase everything or type one or more job IDs)", :short => "d", :type => :strings
    end
  when "-h"
    puts banner_text
    puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n\tjobs\tShow statistics and interact with running jobs\n"
    exit
  else
    global_opts = Trollop::options do
      banner banner_text
      version "PipEngine v#{version_text}"
    end
end

options = options.merge opts
Trollop::die :multi, "Specifying both --group and --multi is not allowed" if options[:multi] and options[:group]

if options[:create_samples]
  Bio::Pipengine.create_samples options[:create_samples]
elsif options[:jobs]
  if options[:job_id]
    Bio::Pipengine.show_stats(options[:job_id])
  elsif options[:job_name]
    warn "Not yet implemented"
    exit
  elsif options[:delete]
    if options[:delete].empty?
      warn "Provide one or more Job IDs or write 'all' to delete all your running jobs".red
      exit
    end
    puts "Warning: this will delete the following running jobs: ".light_blue + "#{options[:delete].join(",")}".green
    print "Are you sure? (y|n): "
    answer = gets.chomp
    if answer == "y"
      Bio::Pipengine.delete_jobs(options[:delete])
    else
      puts "Aborting..."
      exit
    end
  else
    Bio::Pipengine.show_stats(["all"])
  end
elsif options[:pipeline] && options[:samples_file]
  if options[:inspect_pipeline]
    Bio::Pipengine.inspect_steps(options[:inspect_pipeline])
    exit
  else
    abort("File not found: #{options[:pipeline]}".red) unless File.exists? options[:pipeline]
    abort("File not found: #{options[:samples_file]}".red) unless File.exists? options[:samples_file]
    abort("Please provide a valid step name with the --steps parameter".red) unless options[:steps]
    Bio::Pipengine.run(options)
  end
end
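A usage note, not part of the package: once the gem is installed, a typical submission would look like "pipengine run -p pipeline.yml -f samples.yml -s mapping" (all flags are defined above), while "pipengine jobs" shows the status of submitted jobs and "pipengine jobs -d <job_id>" deletes them.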
data/lib/bio/pipengine/job.rb
ADDED
@@ -0,0 +1,271 @@
module Bio
  module Pipengine

    class Job

      # a Job object holds information on a job to be submitted
      # multi_samples and samples_obj are used to store information in case of steps that require combining info
      # from multiple samples
      attr_accessor :name, :cpus, :nodes, :mem, :resources, :command_line, :local,
                    :multi_samples, :samples_obj, :custom_output, :custom_name,
                    :log, :log_adapter

      def initialize(name)
        @name = generate_uuid + "-" + name
        @shortname = name
        @command_line = []
        @resources = {}
        @cpus = 1
        @nodes = "1"
        @log = "stdin"
        @log_adapter = nil
      end

      def add_resources(resources)
        self.resources.merge! resources
      end

      def output
        self.resources["output"]
      end

      # add all the command lines for a given step
      def add_step(step,sample)

        # setting job working directory
        working_dir = ""
        if self.local
          working_dir = self.local+"/"+self.name
        else
          working_dir = self.output

          if step.is_multi?
            folder = (self.custom_output) ? self.custom_output : @shortname
            working_dir += "/#{folder}"
          else
            folder =
              if self.custom_output
                self.custom_output
              elsif self.custom_name
                self.custom_name
              else
                step.name
              end
            working_dir += "/#{sample.name}/#{folder}"
          end

        end

        # set job cpus number to the highest step cpus (in case of multiple steps)
        self.cpus = step.cpus if step.cpus > self.cpus

        # set number of nodes for job
        self.nodes = (step.nodes) ? step.nodes : @nodes

        # set the memory used
        self.mem = step.mem

        # open the checkpoint guard and move into the job working directory
        unless step.name.start_with? "_"
          self.command_line << "if [ ! -f #{working_dir}/checkpoint ]"
          self.command_line << "then"
          self.command_line << logger(step, "start")
          self.command_line << "\nmkdir -p #{working_dir}"
          self.command_line << "cd #{working_dir}"
        end

        # generate command lines for this step
        if step.run.kind_of? Array
          step.run.each_with_index do |cmd, i|
            command = generate_cmd_line(cmd,sample,step)
            # TODO verify that logger works in this case
            # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name}:#{i}\" ; exit 1; }"
            self.command_line << "#{command} || { #{logger(step, "FAILED #{i}" )}; exit 1; }"
          end
        else
          command = generate_cmd_line(step.run,sample,step)
          # TODO verify that logger works in this case
          # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name} \" ; exit 1; }"
          self.command_line << "#{command} || { #{logger(step, "FAILED" )}; exit 1; }"
        end
        self.command_line << logger(step, "finished")
        self.command_line << "touch #{working_dir}/checkpoint"
        self.command_line << "else"
        self.command_line << logger(step, "already executed, skipping this step")
        self.command_line << "fi"

        # check if a temporary (i.e. different from 'output') directory is set
        if self.local
          final_output = ""

          if step.is_multi?
            folder = (self.custom_output) ? self.custom_output : @shortname
            final_output = self.output+"/#{folder}"
          else
            folder = (self.custom_output) ? self.custom_output : step.name
            final_output = self.output+"/#{sample.name}/#{folder}"
          end

          self.command_line << "mkdir -p #{final_output}"
          self.command_line << "cp -r #{working_dir}/* #{final_output}"
          self.command_line << "rm -fr #{working_dir}"
        end

      end

      # convert the job object into a TORQUE::Qsub object
      def to_pbs(options)
        TORQUE::Qsub.new(options) do |torque_job|
          torque_job.name = self.name
          torque_job.working_directory = self.output # where pbs scripts and stdout / stderr files will be saved
          if options[:pbs_opts]
            torque_job.l = options[:pbs_opts]
          else
            l_string = []
            l_string << "nodes=#{self.nodes}:ppn=#{self.cpus}"
            l_string << "mem=#{self.mem}" if self.mem
            torque_job.l = l_string
            if options[:mail_exit]
              torque_job.m = "e"
              torque_job.M = options[:mail_exit]
            end
            if options[:mail_start]
              torque_job.m = "b"
              torque_job.M = options[:mail_start]
            end
          end
          torque_job.q = options[:pbs_queue] if options[:pbs_queue]
          torque_job.script = self.command_line.join("\n")+"\n"
        end
      end

      def to_script(options)
        File.open(self.name+'.sh','w') do |file|
          file.puts "#!/usr/bin/env bash -l"
          file.puts self.command_line.join("\n")
        end
      end

      private

      # create a unique ID for each job
      def generate_uuid
        SecureRandom.hex(5)
      end

      # this method calls other methods to perform the right substitutions into the command lines
      def generate_cmd_line(cmd,sample,step)
        if step.is_multi? # if it is a multi-samples step, call a different method
          set_multi_cmd(step,self.multi_samples)
          cmd = sub_multi(cmd,step)
        else
          cmd = sub_placeholders(cmd,sample,step) # normal step, perform usual substitutions
        end
        return cmd
      end

      # perform substitutions on all the placeholders
      def sub_placeholders(cmd,sample,step=nil)
        tmp_cmd = cmd.gsub(/<sample>/,sample.name)
        if tmp_cmd =~/<sample_path>/
          sample_path_glob = (tmp_cmd.scan(/<sample_path>(\S+)/).map {|e| e.first})
          if sample_path_glob.empty?
            tmp_cmd.gsub!(/<sample_path>/,sample.path.join("\s"))
          else
            sample_path_glob.each do |append|
              tmp_cmd.gsub!(/<sample_path>#{Regexp.quote(append)}/,(sample.path.map {|s| s+append}).join("\s"))
            end
          end
        end
        # for resources and cpus
        tmp_cmd = sub_resources_and_cpu(tmp_cmd,step)

        # for placeholders like <mapping/sample>
        tmp_cmd.scan(/<(\S+)\/sample>/).map {|e| e.first}.each do |input_folder|
          warn "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder
          tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/sample>/,self.output+"/"+sample.name+"/"+input_folder+"/"+sample.name)
        end

        # for placeholders like <mapping/>
        tmp_cmd.scan(/<(\S+)\/>/).map {|e| e.first}.each do |input_folder|
          warn "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder
          tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/>/,self.output+"/"+sample.name+"/"+input_folder+"/")
        end
        return tmp_cmd
      end

      def sub_resources_and_cpu(cmd,step)
        # for all resources tags like <gtf> <index> <genome> <bwa> etc.
        self.resources.each_key do |r|
          cmd.gsub!(/<#{r}>/,self.resources[r])
        end
        # set number of cpus for this command line
        cmd.gsub!(/<cpu>/,step.cpus.to_s) unless step.nil?
        return cmd
      end

      # creates actual multi-samples command lines to be substituted where <multi> placeholders are found
      def set_multi_cmd(step,multi_samples)
        if step.multi_def.kind_of? Array # in case of many multi-samples command lines
          step.multi_cmd = []
          step.multi_def.each do |m_def|
            step.multi_cmd << generate_multi_cmd(m_def,multi_samples)
          end
        else
          step.multi_cmd = generate_multi_cmd(step.multi_def,multi_samples)
        end
      end

      # take the multi_cmd and perform the substitutions into the step command lines
      def sub_multi(cmd,step)
        cmd = sub_resources_and_cpu(cmd,step)
        if step.multi_cmd.kind_of? Array
          step.multi_cmd.each_with_index do |m,index|
            cmd.gsub!(/<multi#{index+1}>/,m)
          end
        else
          cmd.gsub!(/<multi>/,step.multi_cmd)
        end
        return cmd
      end

      # this helper handles different multi-samples definitions (comma separated list, space separated, etc.)
      def generate_multi_cmd(multi_def,multi_samples)
        multi_cmd = []
        multi_samples.each do |sample_name|
          if sample_name.include? ","
            multi_cmd << split_and_sub(",",multi_def,sample_name)
          elsif sample_name.include? ";"
            multi_cmd << split_and_sub(";",multi_def,sample_name)
          else
            multi_cmd << sub_placeholders(multi_def,self.samples_obj[sample_name])
          end
        end
        return multi_cmd.join("\s")
      end

      # take a non-space separated list of samples and perform the substitution with the group definitions
      def split_and_sub(sep,multi_def,multi)
        cmd_line = []
        multi.split(sep).each do |sample_name|
          cmd_line << sub_placeholders(multi_def,self.samples_obj[sample_name])
        end
        cmd_line.join(sep)
      end

      # log a step according to the selected adapter
      def logger(step, message)
        case self.log
        when "stdin"
          "echo \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd` `date`.\""
        when "syslog"
          "logger -t PIPENGINE \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd`\""
        when "fluentd"
          "curl -X POST -d 'json={\"source\":\"PIPENGINE\", \"step\":\"#{step.name}\", \"message\":\"#{message}\", \"job_id\":\"#{name}\", \"user\":\"\'\"`whoami`\"\'\", \"host\":\"\'\"`hostname`\"\'\", \"pwd\":\"\'\"`pwd`\"\'\"}' #{self.log_adapter}"
        end
      end #logger

    end
  end
end
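To make the placeholder grammar above concrete, the following sketch, which is illustrative and not part of the gem, reproduces the substitutions a single-sample step goes through. Every path and resource value below is made up for the example.

# Minimal, self-contained sketch (not part of the gem) of the placeholder
# convention implemented by Job#sub_placeholders and Job#sub_resources_and_cpu.
resources    = { "genome" => "/storage/genomes/bwa_index/genome" }
sample_name  = "sampleA"
sample_paths = ["/ngs_reads/sampleA/run1", "/ngs_reads/sampleA/run2"]
cpus         = 8

cmd = "bwa mem -t <cpu> <genome> <sample_path>/*.fastq.gz > <sample>.sam"
cmd = cmd.gsub(/<sample>/, sample_name)
# <sample_path> followed by a suffix is expanded once per input path
cmd = cmd.gsub(/<sample_path>(\S+)/) { sample_paths.map { |p| p + $1 }.join(" ") }
# any other <tag> is looked up in the resources hash; <cpu> comes from the step
resources.each_key { |r| cmd = cmd.gsub(/<#{r}>/, resources[r]) }
cmd = cmd.gsub(/<cpu>/, cpus.to_s)
puts cmd
# => bwa mem -t 8 /storage/genomes/bwa_index/genome /ngs_reads/sampleA/run1/*.fastq.gz /ngs_reads/sampleA/run2/*.fastq.gz > sampleA.sam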
data/lib/bio/pipengine/sample.rb
ADDED
@@ -0,0 +1,13 @@
module Bio
  module Pipengine
    class Sample
      # Sample holds all the information on a sample and its original input path (or multiple paths)
      attr_accessor :path, :name
      def initialize(name,path_string)
        @path = path_string.split(",")
        @name = name
      end
    end
  end
end
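For illustration only, a Sample created from a comma-separated path list, the convention used in samples.yml when a sample has multiple sequencing runs (paths invented):

sample = Bio::Pipengine::Sample.new("sampleA","/ngs_reads/sampleA/run1,/ngs_reads/sampleA/run2")
sample.name # => "sampleA"
sample.path # => ["/ngs_reads/sampleA/run1", "/ngs_reads/sampleA/run2"]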
data/lib/bio/pipengine/step.rb
ADDED
@@ -0,0 +1,39 @@
module Bio
  module Pipengine

    # Step holds information for a pipeline step
    # groups_def is used to store information on groups definition (i.e. generic cmd lines with placeholders)
    # groups_cmd is used to store the actual command lines for all the samples to be combined in a "groups" step
    # these are generated by combining groups_def information with sample groups information and will be placed
    # where the <groups> placeholder is found in the step command lines.
    class Step
      attr_accessor :name, :run, :cpus, :mem, :nodes, :multi_def, :multi_cmd, :pre

      def initialize(name,step_instructions)
        @name = name
        parse_yaml(step_instructions)
      end

      def is_multi?
        return (self.multi_def.nil?) ? false : true
      end

      def has_prerequisite?
        return (self.pre.nil?) ? false : true
      end

      private

      def parse_yaml(step_instructions)
        self.cpus = step_instructions["cpu"].to_i
        self.nodes = step_instructions["nodes"]
        self.mem = step_instructions["mem"]
        self.run = step_instructions["run"]
        self.multi_def = step_instructions["multi"]
        self.pre = step_instructions["pre"]
      end

    end

  end
end
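A hedged sketch of how a Step is built from a pipeline.yml fragment; the step name and command line are invented, while the keys (cpu, mem, run, and optionally multi and pre) are the ones parse_yaml reads:

require 'yaml'
require 'bio-pipengine'

step_instructions = YAML.load <<-EOF
cpu: 8
mem: 16G
run: bwa mem -t <cpu> <genome> <sample_path> > <sample>.sam
EOF

step = Bio::Pipengine::Step.new("mapping", step_instructions)
step.cpus              # => 8
step.is_multi?         # => false, no "multi" key
step.has_prerequisite? # => false, no "pre" key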
data/lib/bio/pipengine.rb
ADDED
@@ -0,0 +1,234 @@
module Bio
  module Pipengine

    def self.run(options)

      # reading the yaml files
      pipeline = YAML.load_file options[:pipeline]
      samples_file = YAML.load_file options[:samples_file]
      samples_file["samples"].each do |k,v|
        if v.kind_of? Hash
          samples_file["samples"][k] = Hash[samples_file["samples"][k].map{ |key, value| [key.to_s, value.to_s] }]
        else
          samples_file["samples"][k] = v.to_s
        end
      end
      # make sure everything in Samples and Resources is converted to string
      #samples_file["samples"] = Hash[samples_file["samples"].map{ |key, value| [key.to_s, value.to_s] }]
      samples_file["resources"] = Hash[samples_file["resources"].map {|k,v| [k.to_s, v.to_s]}]

      # pre-running checks
      check_steps(options[:steps],pipeline)
      check_samples(options[:samples],samples_file) if options[:samples]

      # list of samples the jobs will work on
      samples_list = nil
      # check if a group is specified
      if options[:group]
        samples_list = options[:samples] ? samples_file["samples"][options[:group]].select {|k,v| options[:samples].include? k} : samples_file["samples"][options[:group]]
        options[:multi] = samples_list.keys
        samples_file["resources"]["output"] << "/#{options[:group]}"
      else # if not, normalize the sample list to remove groups and get a list of all samples
        full_list_samples = {}
        samples_file["samples"].each_key do |k|
          if samples_file["samples"][k].kind_of? Hash
            full_list_samples.merge! samples_file["samples"][k]
          else
            full_list_samples[k] = samples_file["samples"][k]
          end
        end
        samples_list = options[:samples] ? full_list_samples.select {|k,v| options[:samples].include? k} : full_list_samples
      end

      ########### START ###########

      # create output directory (jobs scripts will be saved there)
      FileUtils.mkdir_p samples_file["resources"]["output"] unless options[:dry] #&& options[:spooler]!="pbs"

      # check if the requested steps are multi-samples
      run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options)

      unless run_multi # there are no multi-samples steps, so iterate on samples and create one job per sample
        samples_list.each_key do |sample_name|
          sample = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name])
          create_job(samples_file,pipeline,samples_list,options,sample)
        end
      end
    end

    # handle steps that run on multiple samples (i.e. sample groups job)
    def self.check_and_run_multi(samples_file,pipeline,samples_list,options)
      step_multi = options[:steps].map {|s| Bio::Pipengine::Step.new(s,pipeline["steps"][s]).is_multi?}

      if step_multi.include? false
        if step_multi.uniq.size > 1
          puts "\nAbort! You are trying to run both multi-samples and single sample steps in the same job".red
          exit
        else
          return false
        end
      else
        samples_obj = {}
        samples_list.each_key {|sample_name| samples_obj[sample_name] = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name])}
        create_job(samples_file,pipeline,samples_list,options,samples_obj)
        return true
      end
    end

    def self.create_job(samples_file,pipeline,samples_list,options,sample)
      # getting the sample name (only if this is not a multi-samples job)
      sample_name = (sample.kind_of? Hash) ? nil : sample.name+"-"
      # setting the job name
      job_name = nil
      if options[:name]
        job_name = options[:name]
      elsif options[:steps].size > 1
        job_name = "#{sample_name}#{options[:steps].join("-")}"
      else
        job_name = "#{sample_name}#{options[:steps].first}"
      end
      # creating the Job object
      job = Bio::Pipengine::Job.new(job_name)
      job.local = options[:tmp]
      job.custom_output = options[:output_dir]
      job.custom_name = (options[:name]) ? options[:name] : nil
      job.add_resources pipeline["resources"]
      job.add_resources samples_file["resources"]
      # setting the logging system
      job.log = options[:log]
      job.log_adapter = options[:log_adapter]
      # setting sample groups either by cli option (if present) or by taking all available samples
      job.multi_samples = (options[:multi]) ? options[:multi] : samples_list.keys
      job.samples_obj = sample if sample.kind_of? Hash
      # cycling through steps and adding command lines to the job
      options[:steps].each do |step_name|
        # TODO WARNING this can add the same step multiple times if there are multiple dependencies
        self.add_job(job, pipeline, step_name, sample)
      end

      if options[:dry] #&& options[:spooler] == "script"
        job.to_script(options)
      else
        script = job.to_pbs(options) # converting the Job into a TORQUE::Qsub PBS compatible object
        job_id = script.submit(options)
        puts "#{job_id}".green unless options[:dry]
      end
    end

    # check if the requested samples exist
    def self.check_samples(passed_samples,samples)
      passed_samples.each do |sample|
        samples_names = []
        samples["samples"].each_key do |k|
          if samples["samples"][k].kind_of? Hash
            samples["samples"][k].each_key {|s| samples_names << s}
          else
            samples_names << k
          end
        end
        unless samples_names.include? sample
          puts "Sample \"#{sample}\" does not exist in sample file!".red
          exit
        end
      end
    end

    # check if the requested steps exist
    def self.check_steps(passed_steps,pipeline)
      passed_steps.each do |step|
        unless pipeline["steps"].keys.include? step
          puts "Step \"#{step}\" does not exist in pipeline file!".red
          exit
        end
      end
    end

    # load the pipeline file and show a list of available steps
    def self.inspect_steps(pipeline_file)
      pipeline = YAML.load_file pipeline_file
      print "\nPipeline: ".blue
      print "#{pipeline["pipeline"]}\n\n".green
      puts "List of available steps:".light_blue
      pipeline["steps"].each_key do |s|
        print "\s\s#{s}:\s\s".blue
        print "#{pipeline["steps"][s]["desc"]}\n".green
      end
      puts "\n"
    end

    # create the samples.yml file (CASAVA ONLY!)
    def self.create_samples(dir)
      File.open("samples.yml","w") do |file|
        file.write "resources:\n\soutput: #{FileUtils.pwd}\n\nsamples:\n"
        samples = Hash.new {|hash,key| hash[key] = []}
        dir.each do |path|
          projects = Dir.glob(path+"/*").sort.select {|folders| folders.split("/")[-1] =~/Project_/}
          unless projects.empty?
            projects.each do |project_folder|
              Dir.glob(project_folder+"/*").sort.each {|s| samples[s.split("/")[-1]] << s}
            end
          else
            Dir.glob(path+"/*").sort.each {|s| samples[s.split("/")[-1]] << s if Dir.exists? s}
          end
        end
        samples.each_key do |sample|
          file.write "\s"+sample+":\s"+samples[sample].join(",")+"\n"
        end
      end
    end

    # show running jobs information
    def self.show_stats(job_ids)
      stats = TORQUE::Qstat.new
      if job_ids.first == "all"
        stats.display
      else
        stats.display(:job_ids => job_ids)
      end
    end

    # delete running jobs from the scheduler
    def self.delete_jobs(job_ids)
      include TORQUE
      if job_ids == ["all"]
        Qdel.rm_all
      else
        job_ids.each {|job_id| Qdel.rm job_id}
      end
    end #delete_jobs

    # check if required configuration exists
    def self.check_config
      unless File.exists?("#{Dir.home}/.torque_rm.yaml")
        ARGV.clear
        current_user = Etc.getlogin
        puts "\nIt seems you are running PipEngine for the first time. Please fill in the following information:"
        print "\nHostname or IP address of authorized server from where jobs will be submitted: ".light_blue
        server = gets.chomp
        print "\n"
        print "Specify the username you will be using to connect and submit jobs [#{current_user}]: ".light_blue
        username = gets.chomp
        username = (username == "") ? current_user : username
        puts "Attempting connection to the server...".green
        path = `ssh #{username}@#{server} -t "which qsub"`.split("/qsub").first
        unless path=~/\/\S+\/\S+/
          warn "Connection problems detected! Please check that you are able to connect to '#{server}' as '#{username}' via ssh.".red
        else
          file = File.open("#{Dir.home}/.torque_rm.yaml","w")
          file.write({:hostname => server, :path => path, :user => username}.to_yaml)
          file.close
          puts "First time configuration completed!".green
          puts "It is strongly recommended to setup a password-less SSH connection to use PipEngine.".green
          exit
        end
      end
    end #check_config

    def self.add_job(job, pipeline, step_name, sample)
      step = Bio::Pipengine::Step.new(step_name,pipeline["steps"][step_name]) # parsing step instructions
      self.add_job(job, pipeline, step.pre, sample) if step.has_prerequisite?
      job.add_step(step,sample) # adding step command lines to the job
    end #add_job

  end
end
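For reference, a minimal pair of YAML documents in the shape self.run expects, parsed here the same way the gem does with YAML.load_file. The file contents are invented for the example; a real samples.yml is normally produced by the --create_samples option.

require 'yaml'

samples_yaml = <<-EOF
resources:
  output: /storage/results

samples:
  sampleA: /ngs_reads/sampleA
  sampleB: /ngs_reads/sampleB
EOF

pipeline_yaml = <<-EOF
pipeline: resequencing

resources:
  genome: /storage/genomes/genome.fa

steps:
  mapping:
    desc: Map reads with BWA
    cpu: 8
    run: bwa mem -t <cpu> <genome> <sample_path> > <sample>.sam
EOF

samples_file = YAML.load(samples_yaml)
pipeline     = YAML.load(pipeline_yaml)
samples_file["samples"].keys # => ["sampleA", "sampleB"]
pipeline["steps"].keys       # => ["mapping"]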