mutations_caller_pipeline_aws 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+ require 'mutations_caller_pipeline_aws'
3
+ require 'optparse'
4
+ require 'rubygems'
5
+ require 'yaml'
6
+
7
# Help banner printed by OptionParser for -h and whenever argument validation fails.
# The config example lists index_fa and index_vcf as well, because the script
# refuses to run when either of them is missing.
# NOTE: the terminator of a plain `<<EOF` heredoc has to stay at column 0.
usage =<<EOF
_________________________________________________________________________________________________

  #{$0}
        -b sorted_bam_file
        -c config.yml -v raw_vcf_file
        [-a account || -p project]

_________________________________________________________________________________________________

  #{$0} ...
  ... is a tool to find mutations between the reference gene and a given test
  population. To run this tool you must have bwa, samtools and GATK installed.
  Also you should have the indices for bwa and GATK prepared.
  NOTE: Only paired end reads are supported!

  +++ C L U S T E R V E R S I O N +++

  config.yml should look like this:
  # config.yml
  index_prefix: "path/to/prefix"
  index_fa: "path/to/reference.fa"
  index_vcf: "path/to/known_sites.vcf"
  annotation_file: "path/to/annotation_file"
  bwa: "path/to/bwa"
  samtools: "path/to/samtools"
  gatk: "path/to/GenomeAnalysisTK.jar"
_________________________________________________________________________________________________

EOF
35
+
36
# Defaults for every setting the script reads.
# * nil marks a value that has to come from the command line or from config.yml.
# * :account / :project hold ready-made qsub flag fragments ("" = flag not used).
# * :debug => 1 means the generated commands are really executed; the -d switch
#   replaces it with 5, in which case the commands are only printed.
options = { :bam_file_sorted => nil,
            :index_prefix    => nil,
            :index_fa        => nil,
            :index_vcf       => nil,
            :annotation_file => nil,
            :samtools        => nil,
            :gatk            => nil,
            :bwa             => nil,
            :vcf             => nil,
            :account         => "",
            :project         => "",
            :debug           => 1   # was `:debug = 1`, which does not even parse
          }
49
+
50
# Command line definition. Each handler stores its value in `options`;
# -c additionally merges every key of the YAML config file (as a symbol) into it.
optparse = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-b", "--bam_file_sorted DIR", :REQUIRED, String, "Bam file sorted") do |i|
    options[:bam_file_sorted] = i
  end

  opts.on("-c", "--config DIR", :REQUIRED, String, "Set config file") do |path|
    # YAML.load_file opens and closes the file itself; the former open(path) never
    # closed the handle and went through Kernel#open, which treats a leading "|"
    # in the path as a command to run.
    config = YAML.load_file(path) || {}   # an empty file does not parse to a Hash
    options.merge!(Hash[config.map { |k, v| [k.to_sym, v] }])
  end

  opts.on("-v", "--vcf [PATH]", :REQUIRED, String, "Output of pipeline") do |i|
    options[:vcf] = i
  end

  # The qsub flag is stored pre-formatted so it can be pasted into the command line.
  opts.on("-a", "--account [NAME]", "Option for qsub: -A [NAME]. Default: [none] ") do |i|
    options[:account] = " -A #{i}" if i
  end

  opts.on("-p", "--project [NAME]", "Option for qsub: -P [NAME]. Default: [none] ") do |i|
    options[:project] = " -P #{i}" if i
  end

  # Any debug value other than 1 makes the caller classes print their commands
  # without executing them. The switch name carries no trailing blank any more,
  # so OptionParser handles it as a plain flag without an argument.
  opts.on("-d", "--debug", "Option to debug") do |i|
    options[:debug] = 5 if i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
82
+
83
# Parse ARGV, then make sure every mandatory setting was supplied either on the
# command line or through the config file. Any problem prints the usage and ends
# the process with a non-zero status so that calling scripts notice the failure
# (a bare `exit` reported success).
begin
  optparse.parse!
  mandatory = [:bam_file_sorted, :index_prefix, :annotation_file, :bwa, :samtools,
               :gatk, :vcf, :index_vcf, :index_fa]
  missing = mandatory.select { |param| options[param].nil? }
  unless missing.empty?
    puts "\nMissing options given or missing in config_file: \n\t#{missing.join(",\n\t")}"
    puts optparse
    exit 1
  end
rescue OptionParser::ParseError => e
  # ParseError is the common parent of InvalidOption, MissingArgument,
  # InvalidArgument, ... so no kind of malformed command line escapes as a backtrace.
  puts e.to_s
  puts optparse
  exit 1
end
97
+
98
# pipeline starts here

# tmp files for output: one random number per run keeps the job names and the
# intermediate files of parallel runs apart.
# NOTE(review): the resulting qsub job names start with a digit - confirm the
# scheduler in use accepts that.
random           = rand(1_000_000).to_s
bam_file         = options[:bam_file_sorted]
job_prefix       = random
log_file         = "#{random}.log"
target_intervals = "#{random}_target.intervals"
realigned_bam    = "#{random}_realigned.bam"
recal_file       = "#{random}_recal.csv"
recal_bam        = "#{random}_recal.bam"

# Only one flag fragment is handed to qsub: fall back to the project flag when
# no account was given.
options[:account] = options[:project] if options[:account].empty?

# Indexing
# (used to pass the undefined local `account`, which raised a NameError)
SamtoolsIndexing.call(bam_file,
                      job_prefix,
                      options[:account],
                      options[:debug])

# Realigne
GatkCaller.prepare_realigne(log_file,
                            options[:gatk],
                            bam_file,
                            options[:index_fa],
                            target_intervals,
                            job_prefix,
                            options[:account],
                            options[:debug])

GatkCaller.realigne(log_file,
                    options[:gatk],
                    bam_file,
                    options[:index_fa],
                    target_intervals,
                    realigned_bam,
                    job_prefix,
                    options[:account],
                    options[:debug])

# Recalibration
GatkCaller.recalibrate_bam(log_file,
                           options[:gatk],
                           options[:index_fa],
                           options[:index_vcf],
                           realigned_bam,
                           recal_file,
                           job_prefix,
                           options[:account],
                           options[:debug])

GatkCaller.table_calibration(log_file,
                             options[:gatk],
                             options[:index_fa],
                             realigned_bam,
                             recal_bam,
                             recal_file,
                             job_prefix,
                             options[:account],
                             options[:debug])

# GATK: finding mutations
GatkCaller.call(log_file,
                options[:gatk],
                options[:index_fa],
                recal_bam,
                options[:vcf],
                job_prefix,
                options[:account],
                options[:debug])
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env ruby
2
+ require 'mutations_caller_pipeline_aws'
3
+ require 'optparse'
4
+ require 'rubygems'
5
+ require 'yaml'
6
+
7
# Help banner printed by OptionParser for -h and whenever argument validation fails.
# The config example lists index_fa and index_vcf as well, because the script
# refuses to run when either of them is missing.
# NOTE: the terminator of a plain `<<EOF` heredoc has to stay at column 0.
usage =<<EOF
_________________________________________________________________________________________________

  #{$0}
        -m mutant_r1.fq -n mutant_r2.fq
        [-w wildtype_r1.fq -x wildtype_r2.fq]
        -c config.yml -v raw_vcf_file
        [-a account || -p project]
_________________________________________________________________________________________________

  #{$0} ...
  ... is a tool to find mutations between the reference gene and a given test
  population. To run this tool you must have bwa, samtools and GATK installed.
  Also you should have the indices for bwa and GATK prepared.
  NOTE: Only paired end reads are supported!

  +++ C L U S T E R V E R S I O N +++

  config.yml should look like this:
  # config.yml
  index_prefix: "path/to/prefix"
  index_fa: "path/to/reference.fa"
  index_vcf: "path/to/known_sites.vcf"
  annotation_file: "path/to/annotation_file"
  bwa: "path/to/bwa"
  samtools: "path/to/samtools"
  gatk: "path/to/GenomeAnalysisTK.jar"
_________________________________________________________________________________________________

EOF
35
+
36
# Defaults for every setting the script reads. All paths and file names start
# out as nil (they must come from the command line or from config.yml); the
# qsub flag fragments start empty and :debug starts at 1.
options = {}
[:mutant_r1, :mutant_r2, :wildtype_r1, :wildtype_r2,
 :index_prefix, :index_fa, :index_vcf, :annotation_file,
 :samtools, :gatk, :bwa, :vcf].each do |setting|
  options[setting] = nil
end
options[:account] = ""
options[:project] = ""
options[:debug]   = 1
52
+
53
# Command line definition. Each handler stores its value in `options`;
# -c additionally merges every key of the YAML config file (as a symbol) into it.
optparse = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-m", "--fwd_read_mutant DIR", :REQUIRED, String, "Path to fwd read of mutant") do |i|
    options[:mutant_r1] = i
  end

  opts.on("-n", "--rev_read_mutant DIR", :REQUIRED, String, "Path to rev read of mutant") do |i|
    options[:mutant_r2] = i
  end

  opts.on("-w", "--fwd_read_wildtype DIR", String, "Path to fwd read of wildtype, not mandatory") do |i|
    options[:wildtype_r1] = i if i
  end

  opts.on("-x", "--rev_read_wildtype DIR", String, "Path to rev read of wildtype, not mandatory") do |i|
    options[:wildtype_r2] = i if i
  end

  opts.on("-c", "--config DIR", String, "Set config file") do |path|
    # YAML.load_file opens and closes the file itself; the former open(path) never
    # closed the handle and went through Kernel#open, which treats a leading "|"
    # in the path as a command to run.
    config = YAML.load_file(path) || {}   # an empty file does not parse to a Hash
    options.merge!(Hash[config.map { |k, v| [k.to_sym, v] }])
  end

  opts.on("-v", "--vcf [PATH]", "Output of pipeline") do |i|
    options[:vcf] = i
  end

  # The qsub flag is stored pre-formatted so it can be pasted into the command line.
  opts.on("-a", "--account [NAME]", "Option for qsub: -A [NAME]. Default: [none] ") do |i|
    options[:account] = " -A #{i}" if i
  end

  opts.on("-p", "--project [NAME]", "Option for qsub: -P [NAME]. Default: [none] ") do |i|
    options[:project] = " -P #{i}" if i
  end

  # Any debug value other than 1 makes the caller classes print their commands
  # without executing them. The switch name carries no trailing blank any more,
  # so OptionParser handles it as a plain flag without an argument.
  opts.on("-d", "--debug", "Option to debug") do |i|
    options[:debug] = 5 if i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
97
+
98
# Parse ARGV, then make sure every mandatory setting was supplied either on the
# command line or through the config file. Any problem prints the usage and ends
# the process with a non-zero status so that calling scripts notice the failure
# (a bare `exit` reported success).
begin
  optparse.parse!
  mandatory = [:mutant_r1, :mutant_r2, :index_prefix, :annotation_file, :bwa,
               :samtools, :gatk, :vcf, :index_vcf, :index_fa]
  missing = mandatory.select { |param| options[param].nil? }
  unless missing.empty?
    puts "\nMissing options given or missing in config_file: \n\t#{missing.join(",\n\t")}"
    puts optparse
    exit 1
  end
rescue OptionParser::ParseError => e
  # ParseError is the common parent of InvalidOption, MissingArgument,
  # InvalidArgument, ... so no kind of malformed command line escapes as a backtrace.
  puts e.to_s
  puts optparse
  exit 1
end
112
+
113
# pipeline starts here

# Names of the intermediate files; a single random run id ties them together.
run_id        = (rand * 1_000_000).to_i.to_s
bam_basename  = "mutant_#{run_id}"
sorted_bam    = "#{bam_basename}.bam"
job_prefix    = run_id
log_file      = "#{run_id}.log"
intervals     = "#{run_id}_target.intervals"
realigned_bam = "#{run_id}_realigned.bam"
recal_table   = "#{run_id}_recal.csv"
recal_bam     = "#{run_id}_recal.bam"

# Use the project flag when no account flag was supplied.
options[:account] = options[:project] if options[:account].empty?

# Every step ends with the same three arguments.
qsub_args = [job_prefix, options[:account], options[:debug]]
gatk      = options[:gatk]
reference = options[:index_fa]

# BWA : First step mapping reads to reference
BwaCaller.call_paired_end(options[:mutant_r1], options[:mutant_r2], bam_basename,
                          options[:index_prefix], log_file,
                          options[:bwa], options[:samtools], *qsub_args)

# Indexing (works on the sorted bam written by the mapping step)
SamtoolsIndexing.call(sorted_bam, *qsub_args)

# Realigne
GatkCaller.prepare_realigne(log_file, gatk, sorted_bam, reference, intervals, *qsub_args)
GatkCaller.realigne(log_file, gatk, sorted_bam, reference, intervals, realigned_bam, *qsub_args)

# Recalibration
GatkCaller.recalibrate_bam(log_file, gatk, reference, options[:index_vcf],
                           realigned_bam, recal_table, *qsub_args)
GatkCaller.table_calibration(log_file, gatk, reference, realigned_bam,
                             recal_bam, recal_table, *qsub_args)

# GATK: finding mutations
GatkCaller.call(log_file, gatk, reference, recal_bam, options[:vcf], *qsub_args)
@@ -0,0 +1,18 @@
1
# Builds the shell pipelines that map reads with bwa and turn the alignment
# into a sorted BAM with samtools.
class BwaCaller
  # Read-group argument handed to bwa. The string is double-quoted in Ruby, so
  # each "\t" is a real tab character inside the shell's single quotes.
  READ_GROUP = "'@RG\tID:foo\tSM:bar\tPL:Illumina'"

  # Maps single-end reads and writes the sorted alignment. The command is
  # printed and then always executed through bash (process substitution needs it).
  #
  # r1       - path of the reads file
  # out_file - output name handed to `samtools sort`
  # index    - bwa index prefix
  # log_file - file that collects stderr of every stage
  # bwa, samtools - paths of the two executables
  #
  # Returns the result of Kernel#system.
  def self.call_single_end(r1, out_file, index, log_file, bwa, samtools)
    cmd = [
      "#{bwa} samse -r #{READ_GROUP} #{index}",
      "<(#{bwa} aln #{index} #{r1} 2>>#{log_file})",
      "#{r1} 2>>#{log_file}",
      "| #{samtools} view -Su - 2>>#{log_file}",
      "| #{samtools} sort - #{out_file} 2>>#{log_file}"
    ].join(" ")
    puts cmd
    system('bash', '-c', cmd)
  end

  # Submits the paired-end mapping through qsub as job "<job_prefix>_bwa".
  #
  # r1, r2     - paths of forward and reverse reads
  # out_file   - output name handed to `samtools sort`
  # index      - bwa index prefix
  # log_file   - file that collects stderr of every stage
  # bwa, samtools - paths of the two executables
  # job_prefix - prefix of the qsub job name
  # account    - pre-formatted qsub flag fragment (may be empty)
  # debug      - the command is executed only when this equals 1,
  #              otherwise it is just printed
  #
  # Returns the result of Kernel#system, or nil when nothing was executed.
  #
  # NOTE(review): the whole string is interpreted by the local bash, so the
  # process substitutions and everything after the first "|" run on the
  # submitting machine rather than inside the qsub job - confirm this is intended.
  def self.call_paired_end(r1, r2, out_file, index, log_file, bwa, samtools, job_prefix, account, debug)
    cmd = [
      # The parts are joined with a blank so the account flag can never run
      # straight into the bwa path (the old line continuation left that to
      # the indentation of the following line).
      "qsub -cwd -b y -N #{job_prefix}_bwa -l h_vmem=9G -pe make 4 #{account}",
      "#{bwa} sampe -r #{READ_GROUP} #{index}",
      "<(#{bwa} aln #{index} #{r1} 2>>#{log_file} || exit 1) <(#{bwa} aln #{index} #{r2} 2>>#{log_file} )",
      "#{r1} #{r2} 2>>#{log_file}",
      "| #{samtools} view -Su - 2>>#{log_file}",
      "| #{samtools} sort - #{out_file} 2>>#{log_file}"
    ].join(" ")
    puts cmd
    system('bash', '-c', cmd) if debug == 1
  end
end
@@ -0,0 +1,70 @@
1
# Submits the GATK steps of the pipeline through qsub. Every public method
# builds a two-line shell script (a timestamp echo into the log, then the qsub
# call), prints it, and executes it only when +debug+ equals 1. The steps are
# chained with -hold_jid on job names derived from +job_prefix+.
#
# Shared parameters:
# log_dir    - file the echo line (and, where redirected, the command output) is appended to
# gatk       - path of the GATK jar
# index_fa   - reference passed to GATK as -R (INDEX is normal genom.fa)
# job_prefix - prefix of all qsub job names
# account    - pre-formatted qsub flag fragment (may be empty)
# debug      - 1 executes the command, any other value only prints it
#
# Every method returns the result of Kernel#system, or nil when nothing ran.
class GatkCaller
  # Genotyper: runs UnifiedGenotyper on +read_bam+ and writes +read_vcf+.
  # Waits for the "<job_prefix>_recalibration" job.
  def self.call(log_dir, gatk, index_fa, read_bam, read_vcf, job_prefix, account, debug)
    qsub = [
      "qsub -cwd -b y -N #{job_prefix}_genotyper -l h_vmem=3G -hold_jid #{job_prefix}_recalibration #{account}",
      "java -Xmx4g -jar #{gatk} -l INFO -R #{index_fa} -T UnifiedGenotyper",
      "-I #{read_bam}",
      "-o #{read_vcf}",
      "--genotype_likelihoods_model BOTH",
      ">> #{log_dir} 2>&1 || exit 1"
    ]
    run_command("echo 'starting GATK for mutant at ' `date` >> #{log_dir}", qsub, debug)
  end

  # Making recalibration table: CountCovariates on +read_bam+ with the known
  # sites from +index_vcf+, written to +recal_file+.
  # Now waits for the "<job_prefix>_realignment" job; without that hold the step
  # could start before the realigned bam it reads had been produced.
  def self.recalibrate_bam(log_dir, gatk, index_fa, index_vcf, read_bam, recal_file, job_prefix, account, debug)
    qsub = [
      "qsub -cwd -b y -N #{job_prefix}_recalibration_table -l h_vmem=3G -hold_jid #{job_prefix}_realignment #{account}",
      "java -Xmx4g -jar #{gatk} -knownSites #{index_vcf} -I #{read_bam}",
      "-R #{index_fa} -T CountCovariates",
      "-cov ReadGroupCovariate -cov QualityScoreCovariate -cov DinucCovariate",
      "-cov CycleCovariate",
      "-recalFile #{recal_file} >> #{log_dir} 2>&1 || exit 1 "
    ]
    run_command("echo 'starting recalibration table ' `date` >> #{log_dir}", qsub, debug)
  end

  # Using recalibration table: TableRecalibration of +read_bam+ into +recal_bam+.
  # Waits for the "<job_prefix>_recalibration_table" job.
  def self.table_calibration(log_dir, gatk, index_fa, read_bam, recal_bam, recal_file, job_prefix, account, debug)
    qsub = [
      "qsub -cwd -b y -N #{job_prefix}_recalibration -l h_vmem=3G -hold_jid #{job_prefix}_recalibration_table #{account}",
      "java -Xmx4g -jar #{gatk}",
      "-R #{index_fa}",
      "-I #{read_bam}",
      "-T TableRecalibration",
      "-o #{recal_bam}",
      "-recalFile #{recal_file} >> #{log_dir} 2>&1 || exit 1"
    ]
    run_command("echo 'recalibrating bam_file at ' `date` >> #{log_dir}", qsub, debug)
  end

  # Preparation realignement: RealignerTargetCreator writes +target_intervals+.
  # Waits for the "<job_prefix>_indexing" job. Unlike the other steps the
  # command output is not redirected into the log.
  def self.prepare_realigne(log_dir, gatk, read_bam, index_fa, target_intervals, job_prefix, account, debug)
    qsub = [
      "qsub -cwd -b y -N #{job_prefix}_prep_realignment -l h_vmem=3G -hold_jid #{job_prefix}_indexing #{account}",
      "java -Xmx2g -jar #{gatk}",
      "-I #{read_bam}",
      "-R #{index_fa}",
      "-T RealignerTargetCreator",
      "-o #{target_intervals}"
    ]
    run_command("echo 'preparing realignement at ' `date` >> #{log_dir}", qsub, debug)
  end

  # Realignment: IndelRealigner over +target_intervals+, written to +realigned_bam+.
  # Waits for the "<job_prefix>_prep_realignment" job. The log line no longer
  # repeats the "preparing" text of the previous step.
  def self.realigne(log_dir, gatk, read_bam, index_fa, target_intervals, realigned_bam, job_prefix, account, debug)
    qsub = [
      "qsub -cwd -b y -N #{job_prefix}_realignment -l h_vmem=3G -hold_jid #{job_prefix}_prep_realignment #{account}",
      "java -Xmx4g -jar #{gatk}",
      "-I #{read_bam}",
      "-R #{index_fa}",
      "-T IndelRealigner",
      "-targetIntervals #{target_intervals}",
      "-o #{realigned_bam} >> #{log_dir} 2>&1 || exit 1"
    ]
    run_command("echo 'starting realignment at ' `date` >> #{log_dir}", qsub, debug)
  end

  # Joins the qsub fragments with blanks (so the account flag can never run into
  # the following "java"), puts the echo line in front, prints the script and
  # executes it when debug == 1.
  def self.run_command(echo_line, qsub_parts, debug)
    cmd = "#{echo_line}\n#{qsub_parts.join(' ')}"
    puts cmd
    system(cmd) if debug == 1
  end
  private_class_method :run_command
end
@@ -0,0 +1,21 @@
1
# Extracts the contig names from the header of a VCF file.
class LocationFile
  # Reads the leading lines of +vcf_file+ that contain a '#', collects the ID of
  # every "##contig=<ID=..." line and writes the IDs, separated by newlines
  # (no trailing newline), to +location_file_output+.
  #
  # Reading stops at the first line without '#' or at the end of the file, so a
  # header-only or empty file no longer ends in an EOFError. The second
  # parameter used to be spelled "loction_file_output" while the body used the
  # correct spelling, which raised a NameError on every call.
  #
  # NOTE(review): the pattern only captures word characters, so an ID containing
  # e.g. '.' or '-' is cut at that character - confirm against the inputs in use.
  #
  # Returns nil.
  def self.create(vcf_file, location_file_output)
    locus = []
    File.open(vcf_file) do |vcf|
      vcf.each_line do |line|
        break unless line.include?('#')
        contig = line[/##contig=<ID=+\w+/]
        locus << contig.split('=').last if contig
      end
    end
    # Block forms close both files even when an exception interrupts the work.
    File.open(location_file_output, 'w') { |out| out.write(locus.join("\n")) }
    nil
  end
end
@@ -0,0 +1,8 @@
1
# Submits `samtools index` for a bam file through qsub.
class SamtoolsIndexing
  # Builds the qsub command for job "<job_prefix>_indexing" (held until the
  # "<job_prefix>_bwa" job is done), prints it and executes it when debug == 1.
  #
  # bam_file   - bam file to index
  # job_prefix - prefix of the qsub job names
  # account    - pre-formatted qsub flag fragment (may be empty)
  # debug      - 1 executes the command, any other value only prints it
  # samtools   - executable to call; defaults to "samtools" from the PATH as
  #              before, but callers may now pass a configured path
  #
  # Returns the result of Kernel#system, or nil when nothing was executed.
  def self.call(bam_file, job_prefix, account, debug, samtools = "samtools")
    cmd = [
      "qsub -cwd -b y -N #{job_prefix}_indexing -l h_vmem=3G -hold_jid #{job_prefix}_bwa #{account}",
      "#{samtools} index #{bam_file}"
    ].join(" ")
    puts cmd
    system(cmd) if debug == 1
  end
end
@@ -0,0 +1,11 @@
1
+ require 'mutations_caller_pipeline_aws/bwa_caller'
2
+ require 'mutations_caller_pipeline_aws/gatk_caller'
3
+ require 'mutations_caller_pipeline_aws/samtools_indexing'
4
+
5
# Top-level class of the gem; it only offers a greeting.
class MutationsCallerPipelineAws
  class << self
    # Returns the fixed greeting string.
    def hi
      "Hello World!"
    end
  end
end
10
+
11
+
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mutations_caller_pipeline_aws
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Katharina Hayer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Using BWA to align and GATK to call the bases
15
+ email:
16
+ - katharinaehayer@gmail.com
17
+ executables:
18
+ - mutations_caller_pipeline_aws
19
+ - gatk_pipe_only_aws
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - bin/gatk_pipe_only_aws
24
+ - bin/mutations_caller_pipeline_aws
25
+ - lib/mutations_caller_pipeline_aws.rb
26
+ - lib/mutations_caller_pipeline_aws/.DS_Store
27
+ - lib/mutations_caller_pipeline_aws/bwa_caller.rb
28
+ - lib/mutations_caller_pipeline_aws/gatk_caller.rb
29
+ - lib/mutations_caller_pipeline_aws/location_file.rb
30
+ - lib/mutations_caller_pipeline_aws/samtools_indexing.rb
31
+ homepage: https://github.com/khayer/mutations_caller_pipeline_aws
32
+ licenses: []
33
+ post_install_message:
34
+ rdoc_options: []
35
+ require_paths:
36
+ - lib
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project: mutations_caller_pipeline_aws
51
+ rubygems_version: 1.8.10
52
+ signing_key:
53
+ specification_version: 3
54
+ summary: Call Mutations for files.fq
55
+ test_files: []
56
+ has_rdoc: