mutations_caller_pipeline_aws 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
#!/usr/bin/env ruby
# gatk_pipe_only_aws
#
# Command-line driver that takes an already sorted BAM file and submits the
# GATK part of the pipeline (indexing, realignment, recalibration, genotyping)
# as qsub jobs. Paths to the tools and indices are read from a YAML config
# file whose keys are merged into the option hash.
require 'rubygems'
require 'mutations_caller_pipeline_aws'
require 'optparse'
require 'yaml'

usage = <<EOF
_________________________________________________________________________________________________

  #{$0}
    -b sorted_bam_file
    -c config.yml -v raw_vcf_file
    [-a account || -p project]

_________________________________________________________________________________________________

  #{$0} ...
  ... is a tool to find mutations between the reference gene and a given test
  population. To run this tool you must have bwa, samtools and GATK installed.
  Also you should have the indices for bwa and GATK prepared.
  NOTE: Only paired end reads are supported!

  +++ C L U S T E R V E R S I O N +++

  config.yml should look like this:
  # config.yml
  index_prefix: "path/to/prefix"
  index_fa: "path/to/genome.fa"
  index_vcf: "path/to/known_sites.vcf"
  annotation_file: "path/to/annotation_file"
  bwa: "path/to/bwa"
  samtools: "path/to/samtools"
  gatk: "path/to/GenomeAnalysisTK.jar"
_________________________________________________________________________________________________

EOF

# Reads the YAML config at +path+ and returns its entries with symbol keys.
# YAML.load_file is used instead of Kernel#open so that a path starting with
# "|" is never executed as a command and the file handle is always closed.
# An empty config file yields an empty hash.
def load_config(path)
  config = YAML.load_file(path) || {}
  Hash[config.map { |key, value| [key.to_sym, value] }]
end

options = { :bam_file_sorted => nil,
            :index_prefix    => nil,
            :index_fa        => nil,
            :index_vcf       => nil,
            :annotation_file => nil,
            :samtools        => nil,
            :gatk            => nil,
            :bwa             => nil,
            :vcf             => nil,
            :account         => "",
            :project         => "",
            :debug           => 1 }

optparse = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-b", "--bam_file_sorted DIR", :REQUIRED, String, "Bam file sorted") do |i|
    options[:bam_file_sorted] = i
  end

  opts.on("-c", "--config DIR", :REQUIRED, String, "Set config file") do |path|
    options.merge!(load_config(path))
  end

  opts.on("-v", "--vcf [PATH]", :REQUIRED, String, "Output of pipeline") do |i|
    options[:vcf] = i
  end

  opts.on("-a", "--account [NAME]", "Option for qsub: -A [NAME]. Default: [none] ") do |i|
    options[:account] = " -A #{i}" if i
  end

  opts.on("-p", "--project [NAME]", "Option for qsub: -P [NAME]. Default: [none] ") do |i|
    options[:project] = " -P #{i}" if i
  end

  # Plain flag. The callers only execute their command when debug == 1, so
  # setting it to 5 turns the run into a dry run that just prints commands.
  opts.on("-d", "--debug", "Option to debug") do |i|
    options[:debug] = 5 if i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

begin
  optparse.parse!
rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e
  puts e.to_s
  puts optparse
  exit 1
end

mandatory = [:bam_file_sorted, :index_prefix, :annotation_file, :bwa, :samtools, :gatk, :vcf, :index_vcf, :index_fa]
missing = mandatory.select { |param| options[param].nil? }
unless missing.empty?
  puts "\nMissing options given or missing in config_file: \n\t#{missing.join(",\n\t")}"
  puts optparse
  exit 1
end

# pipeline starts here

# tmp files for output; a random number keeps concurrent runs apart
random           = (rand * 1000000).floor.to_s
bam_file         = options[:bam_file_sorted]
job_prefix       = "#{random}"
log_file         = "#{random}.log"
target_intervals = "#{random}_target.intervals"
realigned_bam    = "#{random}_realigned.bam"
recal_file       = "#{random}_recal.csv"
recal_bam        = "#{random}_recal.bam"

# The account flag wins; the project flag is only used when no account is given.
options[:account] = options[:project] if options[:account].empty?
account = options[:account]

# Indexing
SamtoolsIndexing.call(bam_file,
                      job_prefix,
                      account,
                      options[:debug])

# Realignment
GatkCaller.prepare_realigne(log_file,
                            options[:gatk],
                            bam_file,
                            options[:index_fa],
                            target_intervals,
                            job_prefix,
                            account,
                            options[:debug])

GatkCaller.realigne(log_file,
                    options[:gatk],
                    bam_file,
                    options[:index_fa],
                    target_intervals,
                    realigned_bam,
                    job_prefix,
                    account,
                    options[:debug])

# Recalibration
GatkCaller.recalibrate_bam(log_file,
                           options[:gatk],
                           options[:index_fa],
                           options[:index_vcf],
                           realigned_bam,
                           recal_file,
                           job_prefix,
                           account,
                           options[:debug])

GatkCaller.table_calibration(log_file,
                             options[:gatk],
                             options[:index_fa],
                             realigned_bam,
                             recal_bam,
                             recal_file,
                             job_prefix,
                             account,
                             options[:debug])

# GATK: finding mutations
GatkCaller.call(log_file,
                options[:gatk],
                options[:index_fa],
                recal_bam,
                options[:vcf],
                job_prefix,
                account,
                options[:debug])
@@ -0,0 +1,196 @@
1
#!/usr/bin/env ruby
# mutations_caller_pipeline_aws
#
# Command-line driver for the full pipeline: maps the paired-end mutant reads
# with BWA, then submits indexing, realignment, recalibration and genotyping
# as qsub jobs. Paths to the tools and indices are read from a YAML config
# file whose keys are merged into the option hash.
require 'rubygems'
require 'mutations_caller_pipeline_aws'
require 'optparse'
require 'yaml'

usage = <<EOF
_________________________________________________________________________________________________

  #{$0}
    -m mutant_r1.fq -n mutant_r2.fq
    [-w wildtype_r1.fq -x wildtype_r2.fq]
    -c config.yml -v raw_vcf_file
    [-a account || -p project]
_________________________________________________________________________________________________

  #{$0} ...
  ... is a tool to find mutations between the reference gene and a given test
  population. To run this tool you must have bwa, samtools and GATK installed.
  Also you should have the indices for bwa and GATK prepared.
  NOTE: Only paired end reads are supported!

  +++ C L U S T E R V E R S I O N +++

  config.yml should look like this:
  # config.yml
  index_prefix: "path/to/prefix"
  index_fa: "path/to/genome.fa"
  index_vcf: "path/to/known_sites.vcf"
  annotation_file: "path/to/annotation_file"
  bwa: "path/to/bwa"
  samtools: "path/to/samtools"
  gatk: "path/to/GenomeAnalysisTK.jar"
_________________________________________________________________________________________________

EOF

# Reads the YAML config at +path+ and returns its entries with symbol keys.
# YAML.load_file is used instead of Kernel#open so that a path starting with
# "|" is never executed as a command and the file handle is always closed.
# An empty config file yields an empty hash.
def load_config(path)
  config = YAML.load_file(path) || {}
  Hash[config.map { |key, value| [key.to_sym, value] }]
end

options = { :mutant_r1       => nil,
            :mutant_r2       => nil,
            :wildtype_r1     => nil,
            :wildtype_r2     => nil,
            :index_prefix    => nil,
            :index_fa        => nil,
            :index_vcf       => nil,
            :annotation_file => nil,
            :samtools        => nil,
            :gatk            => nil,
            :bwa             => nil,
            :vcf             => nil,
            :account         => "",
            :project         => "",
            :debug           => 1 }

optparse = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-m", "--fwd_read_mutant DIR", :REQUIRED, String, "Path to fwd read of mutant") do |i|
    options[:mutant_r1] = i
  end

  opts.on("-n", "--rev_read_mutant DIR", :REQUIRED, String, "Path to rev read of mutant") do |i|
    options[:mutant_r2] = i
  end

  # NOTE(review): the wildtype reads are accepted here but no step below
  # uses them; only the mutant reads are mapped.
  opts.on("-w", "--fwd_read_wildtype DIR", String, "Path to fwd read of wildtype, not mandatory") do |i|
    options[:wildtype_r1] = i if i
  end

  opts.on("-x", "--rev_read_wildtype DIR", String, "Path to rev read of wildtype, not mandatory") do |i|
    options[:wildtype_r2] = i if i
  end

  opts.on("-c", "--config DIR", String, "Set config file") do |path|
    options.merge!(load_config(path))
  end

  opts.on("-v", "--vcf [PATH]", "Output of pipeline") do |i|
    options[:vcf] = i
  end

  opts.on("-a", "--account [NAME]", "Option for qsub: -A [NAME]. Default: [none] ") do |i|
    options[:account] = " -A #{i}" if i
  end

  opts.on("-p", "--project [NAME]", "Option for qsub: -P [NAME]. Default: [none] ") do |i|
    options[:project] = " -P #{i}" if i
  end

  # Plain flag. The callers only execute their command when debug == 1, so
  # setting it to 5 turns the run into a dry run that just prints commands.
  opts.on("-d", "--debug", "Option to debug") do |i|
    options[:debug] = 5 if i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

begin
  optparse.parse!
rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e
  puts e.to_s
  puts optparse
  exit 1
end

mandatory = [:mutant_r1, :mutant_r2, :index_prefix, :annotation_file, :bwa, :samtools, :gatk, :vcf, :index_vcf, :index_fa]
missing = mandatory.select { |param| options[param].nil? }
unless missing.empty?
  puts "\nMissing options given or missing in config_file: \n\t#{missing.join(",\n\t")}"
  puts optparse
  exit 1
end

# pipeline starts here

# tmp files for output; a random number keeps concurrent runs apart
random           = (rand * 1000000).floor.to_s
bam_file         = "mutant_#{random}"
job_prefix       = "#{random}"
log_file         = "#{random}.log"
target_intervals = "#{random}_target.intervals"
realigned_bam    = "#{random}_realigned.bam"
recal_file       = "#{random}_recal.csv"
recal_bam        = "#{random}_recal.bam"

# The account flag wins; the project flag is only used when no account is given.
options[:account] = options[:project] if options[:account].empty?
account = options[:account]

# BWA : First step mapping reads to reference
BwaCaller.call_paired_end(options[:mutant_r1],
                          options[:mutant_r2],
                          bam_file,
                          options[:index_prefix],
                          log_file,
                          options[:bwa],
                          options[:samtools],
                          job_prefix,
                          account,
                          options[:debug])

# Indexing. The BWA step was given the name without extension as its sort
# output; every later step addresses the file with ".bam" appended.
bam_file = bam_file + ".bam"
SamtoolsIndexing.call(bam_file,
                      job_prefix,
                      account,
                      options[:debug])

# Realignment
GatkCaller.prepare_realigne(log_file,
                            options[:gatk],
                            bam_file,
                            options[:index_fa],
                            target_intervals,
                            job_prefix,
                            account,
                            options[:debug])

GatkCaller.realigne(log_file,
                    options[:gatk],
                    bam_file,
                    options[:index_fa],
                    target_intervals,
                    realigned_bam,
                    job_prefix,
                    account,
                    options[:debug])

# Recalibration
GatkCaller.recalibrate_bam(log_file,
                           options[:gatk],
                           options[:index_fa],
                           options[:index_vcf],
                           realigned_bam,
                           recal_file,
                           job_prefix,
                           account,
                           options[:debug])

GatkCaller.table_calibration(log_file,
                             options[:gatk],
                             options[:index_fa],
                             realigned_bam,
                             recal_bam,
                             recal_file,
                             job_prefix,
                             account,
                             options[:debug])

# GATK: finding mutations
GatkCaller.call(log_file,
                options[:gatk],
                options[:index_fa],
                recal_bam,
                options[:vcf],
                job_prefix,
                account,
                options[:debug])
@@ -0,0 +1,18 @@
1
# Builds and runs the BWA mapping commands that turn FASTQ reads into a
# sorted BAM file (bwa aln / samse|sampe -> samtools view -> samtools sort).
class BwaCaller
  class << self
    # Maps single-end reads and runs the command immediately through bash.
    #
    # r1       - FASTQ file with the reads
    # out_file - output name handed to `samtools sort`
    # index    - BWA index prefix
    # log_file - file that collects stderr of every stage
    # bwa, samtools - paths to the two executables
    #
    # Prints the command and returns the result of Kernel#system.
    def call_single_end(r1, out_file, index, log_file, bwa, samtools)
      cmd = "#{bwa} samse -r '@RG\tID:foo\tSM:bar\tPL:Illumina' #{index} \
      <(#{bwa} aln #{index} #{r1} 2>>#{log_file}) \
      #{r1} 2>>#{log_file} | #{samtools} view -Su - 2>>#{log_file} | #{samtools} sort - #{out_file} 2>>#{log_file}"
      $stdout.puts(cmd)
      system('bash', '-c', cmd)
    end

    # Maps paired-end reads, submitting the command line through qsub as job
    # "<job_prefix>_bwa".
    #
    # r1, r2     - FASTQ files with forward and reverse reads
    # out_file   - output name handed to `samtools sort`
    # index      - BWA index prefix
    # log_file   - file that collects stderr of every stage
    # bwa, samtools - paths to the two executables
    # job_prefix - prefix of the qsub job name
    # account    - extra qsub arguments inserted verbatim (may be empty)
    # debug      - the command is only executed when this equals 1
    #
    # Always prints the command; returns the result of Kernel#system, or nil
    # when the command was not executed.
    #
    # NOTE(review): the pipes and <(...) substitutions look like they are
    # evaluated by the local bash before qsub sees them - confirm that this
    # is the intended split between submit host and job.
    def call_paired_end(r1, r2, out_file, index, log_file, bwa, samtools, job_prefix, account, debug)
      cmd = "qsub -cwd -b y -N #{job_prefix}_bwa -l h_vmem=9G -pe make 4 #{account}\
      #{bwa} sampe -r '@RG\tID:foo\tSM:bar\tPL:Illumina' #{index} \
      <(#{bwa} aln #{index} #{r1} 2>>#{log_file} || exit 1) <(#{bwa} aln #{index} #{r2} 2>>#{log_file} ) \
      #{r1} #{r2} 2>>#{log_file} | #{samtools} view -Su - 2>>#{log_file} | #{samtools} sort - #{out_file} 2>>#{log_file}"
      $stdout.puts(cmd)
      return nil unless debug == 1

      system('bash', '-c', cmd)
    end
  end
end
@@ -0,0 +1,70 @@
1
# Submits the individual GATK steps of the pipeline as qsub jobs.
#
# Every public method builds a two-line shell command: an `echo` that appends
# a timestamped message to the log file, followed by the qsub submission
# whose output is appended to the same log. The command is always printed
# and only executed when +debug+ equals 1. Each method returns the result of
# Kernel#system, or nil when nothing was executed.
#
# Shared parameters:
#   log_dir    - path of the log FILE (the name is historical)
#   gatk       - path to GenomeAnalysisTK.jar
#   index_fa   - reference genome in FASTA format
#   job_prefix - prefix of the qsub job names; steps are chained with
#                -hold_jid on these names
#   account    - extra qsub arguments inserted verbatim (may be empty)
#   debug      - 1 executes the command, anything else only prints it
#
# NOTE(review): several jobs request h_vmem=3G while starting java with
# -Xmx4g - confirm that the scheduler limit leaves room for that heap.
class GatkCaller
  # Genotyper: calls variants on +read_bam+ with UnifiedGenotyper and writes
  # them to +read_vcf+. Waits for the "<job_prefix>_recalibration" job.
  def self.call(log_dir, gatk, index_fa, read_bam, read_vcf, job_prefix, account, debug)
    job = ["qsub -cwd -b y -N #{job_prefix}_genotyper -l h_vmem=3G",
           "-hold_jid #{job_prefix}_recalibration #{account}",
           "java -Xmx4g -jar #{gatk} -l INFO -R #{index_fa} -T UnifiedGenotyper",
           "-I #{read_bam}",
           "-o #{read_vcf}",
           "--genotype_likelihoods_model BOTH"].join(' ')
    submit('starting GATK for mutant at ', job, log_dir, debug)
  end

  # Making recalibration table: runs CountCovariates on +read_bam+ using the
  # known sites in +index_vcf+ and writes the table to +recal_file+.
  def self.recalibrate_bam(log_dir, gatk, index_fa, index_vcf, read_bam, recal_file, job_prefix, account, debug)
    job = ["qsub -cwd -b y -N #{job_prefix}_recalibration_table -l h_vmem=3G #{account}",
           "java -Xmx4g -jar #{gatk} -knownSites #{index_vcf} -I #{read_bam}",
           "-R #{index_fa} -T CountCovariates",
           "-cov ReadGroupCovariate -cov QualityScoreCovariate -cov DinucCovariate",
           "-cov CycleCovariate",
           "-recalFile #{recal_file}"].join(' ')
    submit('starting recalibration table ', job, log_dir, debug)
  end

  # Using recalibration table: runs TableRecalibration on +read_bam+ with
  # +recal_file+ and writes +recal_bam+. Waits for the
  # "<job_prefix>_recalibration_table" job.
  def self.table_calibration(log_dir, gatk, index_fa, read_bam, recal_bam, recal_file, job_prefix, account, debug)
    job = ["qsub -cwd -b y -N #{job_prefix}_recalibration -l h_vmem=3G",
           "-hold_jid #{job_prefix}_recalibration_table #{account}",
           "java -Xmx4g -jar #{gatk}",
           "-R #{index_fa}",
           "-I #{read_bam}",
           "-T TableRecalibration",
           "-o #{recal_bam}",
           "-recalFile #{recal_file}"].join(' ')
    submit('recalibrating bam_file at ', job, log_dir, debug)
  end

  # Preparation realignment: runs RealignerTargetCreator on +read_bam+ and
  # writes +target_intervals+. Waits for the "<job_prefix>_indexing" job.
  # Output is logged and failures are reported like in every other step.
  def self.prepare_realigne(log_dir, gatk, read_bam, index_fa, target_intervals, job_prefix, account, debug)
    job = ["qsub -cwd -b y -N #{job_prefix}_prep_realignment -l h_vmem=3G",
           "-hold_jid #{job_prefix}_indexing #{account}",
           "java -Xmx2g -jar #{gatk}",
           "-I #{read_bam}",
           "-R #{index_fa}",
           "-T RealignerTargetCreator",
           "-o #{target_intervals}"].join(' ')
    submit('preparing realignement at ', job, log_dir, debug)
  end

  # Realignment: runs IndelRealigner on +read_bam+ with +target_intervals+
  # and writes +realigned_bam+. Waits for the "<job_prefix>_prep_realignment"
  # job.
  def self.realigne(log_dir, gatk, read_bam, index_fa, target_intervals, realigned_bam, job_prefix, account, debug)
    job = ["qsub -cwd -b y -N #{job_prefix}_realignment -l h_vmem=3G",
           "-hold_jid #{job_prefix}_prep_realignment #{account}",
           "java -Xmx4g -jar #{gatk}",
           "-I #{read_bam}",
           "-R #{index_fa}",
           "-T IndelRealigner",
           "-targetIntervals #{target_intervals}",
           "-o #{realigned_bam}"].join(' ')
    # Own message so the log distinguishes this step from prepare_realigne.
    submit('starting realignment at ', job, log_dir, debug)
  end

  # Prints the full command (log message line plus job line) and executes it
  # when +debug+ is 1. `|| exit 1` makes Kernel#system report a failed
  # submission as false.
  def self.submit(message, job, log_file, debug)
    cmd = "echo '#{message}' `date` >> #{log_file}\n" +
          "#{job} >> #{log_file} 2>&1 || exit 1"
    puts cmd
    system(cmd) if debug == 1
  end
  private_class_method :submit
end
@@ -0,0 +1,21 @@
1
# Extracts the contig names declared in the header of a VCF file.
class LocationFile
  # Header line that declares a contig, e.g. "##contig=<ID=chr1,length=...>".
  # The ID runs up to the next comma, closing bracket or whitespace, so names
  # containing dots or dashes (e.g. "GL000207.1") are kept intact.
  CONTIG_ID = /\A##contig=<ID=+([^,>\s]+)/

  # Writes the contig IDs found in the header of +vcf_file+ to
  # +location_file_output+, one per line and without a trailing newline.
  #
  # Reading stops at the first line that does not start with '#', i.e. at the
  # end of the header. A file without contig lines, with header lines only,
  # or without any lines at all produces an empty output file.
  #
  # vcf_file             - path of the VCF file to read
  # location_file_output - path of the file to (over)write
  #
  # Returns nil. Raises the usual Errno errors when a file cannot be opened.
  def self.create(vcf_file, location_file_output)
    locus = []

    # Block form closes the handle even when reading raises.
    File.open(vcf_file) do |vcf|
      vcf.each_line do |line|
        break unless line.start_with?('#')

        match = CONTIG_ID.match(line)
        locus << match[1] if match
      end
    end

    File.open(location_file_output, 'w') { |out| out.write(locus.join("\n")) }
    nil
  end
end
@@ -0,0 +1,8 @@
1
# Submits the `samtools index` step of the pipeline as a qsub job.
class SamtoolsIndexing
  # Submits job "<job_prefix>_indexing", which waits for "<job_prefix>_bwa"
  # and indexes +bam_file+.
  #
  # bam_file   - BAM file to index
  # job_prefix - prefix of the qsub job names
  # account    - extra qsub arguments inserted verbatim (may be empty)
  # debug      - the command is only executed when this equals 1
  # samtools   - samtools executable to run; defaults to the one on the PATH
  #              so existing four-argument callers behave as before
  #
  # Always prints the command; returns the result of Kernel#system, or nil
  # when the command was not executed.
  def self.call(bam_file, job_prefix, account, debug, samtools = 'samtools')
    cmd = "qsub -cwd -b y -N #{job_prefix}_indexing -l h_vmem=3G " \
          "-hold_jid #{job_prefix}_bwa #{account} " \
          "#{samtools} index #{bam_file}"
    puts cmd
    system(cmd) if debug == 1
  end
end
@@ -0,0 +1,11 @@
1
+ require 'mutations_caller_pipeline_aws/bwa_caller'
2
+ require 'mutations_caller_pipeline_aws/gatk_caller'
3
+ require 'mutations_caller_pipeline_aws/samtools_indexing'
4
+
5
# Top-level class of the gem. It carries no pipeline logic itself.
class MutationsCallerPipelineAws
  class << self
    # Returns a fixed greeting string.
    def hi
      "Hello World!"
    end
  end
end
10
+
11
+
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mutations_caller_pipeline_aws
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Katharina Hayer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Using BWA to align and GATK to call the bases
15
+ email:
16
+ - katharinaehayer@gmail.com
17
+ executables:
18
+ - mutations_caller_pipeline_aws
19
+ - gatk_pipe_only_aws
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - bin/gatk_pipe_only_aws
24
+ - bin/mutations_caller_pipeline_aws
25
+ - lib/mutations_caller_pipeline_aws.rb
26
+ - lib/mutations_caller_pipeline_aws/.DS_Store
27
+ - lib/mutations_caller_pipeline_aws/bwa_caller.rb
28
+ - lib/mutations_caller_pipeline_aws/gatk_caller.rb
29
+ - lib/mutations_caller_pipeline_aws/location_file.rb
30
+ - lib/mutations_caller_pipeline_aws/samtools_indexing.rb
31
+ homepage: https://github.com/khayer/mutations_caller_pipeline_aws
32
+ licenses: []
33
+ post_install_message:
34
+ rdoc_options: []
35
+ require_paths:
36
+ - lib
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project: mutations_caller_pipeline_aws
51
+ rubygems_version: 1.8.10
52
+ signing_key:
53
+ specification_version: 3
54
+ summary: Call Mutations for files.fq
55
+ test_files: []
56
+ has_rdoc: