bio-gngm 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bio-gngm.gemspec +7 -3
- data/doc/Bio.html +4 -2
- data/doc/Bio/DB.html +4 -2
- data/doc/Bio/DB/FastaLengthDB.html +238 -0
- data/doc/Bio/DB/Pileup.html +10 -9
- data/doc/Bio/DB/Vcf.html +52 -102
- data/doc/Bio/Util.html +4 -2
- data/doc/Bio/Util/Gngm.html +318 -72
- data/doc/LICENSE_txt.html +12 -9
- data/doc/created.rid +3 -3
- data/doc/index.html +3 -1
- data/doc/js/search_index.js +1 -1
- data/doc/lib/bio-gngm_rb.html +2 -0
- data/doc/lib/bio/util/bio-gngm_rb.html +2 -0
- data/doc/table_of_contents.html +20 -7
- data/examples/Chr1.html +1483 -0
- data/examples/gngm_qtl_mapping_HTML_maker_set_options.rb +243 -0
- data/examples/pileup.txt +22 -0
- data/examples/snps.vcf +22 -0
- data/lib/bio-gngm.rb +0 -1
- data/lib/bio/util/bio-gngm.rb +100 -38
- data/test/test_bio-gngm.rb +1 -1
- metadata +30 -26
- data/lib/bio/util/mutation_effects.rb +0 -39
@@ -0,0 +1,243 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Author : Naveed Ishaque (inspired by Dan Maclean) edited again by Dan to include ChD value setting and deletion of SNPs file
|
4
|
+
# naveed.ishaque@tsl.ac.uk; naveed.ishaque@hotmail.co.uk
|
5
|
+
# Date: 20th June 2012 and 1st November 2012
|
6
|
+
|
7
|
+
# This script produces an HTML file with embedded images showing the SNP density and chastity plots for a given BAM file
|
8
|
+
# It will automatically iterate over all contigs, from beginning to end
|
9
|
+
# NOTE - run using ruby executable: /home/programs/gngm/ruby/bin/ruby
|
10
|
+
# NOTE - this will only run on the cluster via a bsub command
|
11
|
+
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bio'
|
15
|
+
require 'bio-gngm'
|
16
|
+
require 'base64'
|
17
|
+
require 'getoptions'
|
18
|
+
|
19
|
+
usage = "\n#{$PROGRAM_NAME} reads in a fasta and bam files and produces a html file indicating QTL locations as peaks\n\n\n\t #{$PROGRAM_NAME}\n\n\n -f [reference fasta file]\n -b [bam file]\n -e expected ChD (allele freq) default 1\n -c control ChD (allele freq) default 0.5\n -s List of known SNPS [tab delimited file]\n\n"
|
20
|
+
|
21
|
+
# PARSE INPUTS
|
22
|
+
|
23
|
+
opt = GetOptions.new(%w(h help f=@s b=@s e=@s c=@s s=@s))
|
24
|
+
|
25
|
+
puts "#{usage}" if opt[:h]
|
26
|
+
exit if opt[:h]
|
27
|
+
puts "#{usage}" if opt[:help]
|
28
|
+
exit if opt[:help]
|
29
|
+
|
30
|
+
puts "ERROR - no fasta file provided (-f)\n#{usage}" unless opt[:f]
|
31
|
+
exit unless opt[:f]
|
32
|
+
puts "ERROR - fasta file '#{opt[:f][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:f][0]}")
|
33
|
+
exit unless FileTest.exist?("#{opt[:f][0]}")
|
34
|
+
warn "\nUsing FASTA file #{opt[:f][0]}"
|
35
|
+
|
36
|
+
puts "ERROR - no bam file provided (-b)\n#{usage}" unless opt[:b]
|
37
|
+
exit unless opt[:b]
|
38
|
+
puts "ERROR - BAM file '#{opt[:b][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:b][0]}")
|
39
|
+
exit unless FileTest.exist?("#{opt[:b][0]}")
|
40
|
+
warn "Using BAM file #{opt[:b][0]}"
|
41
|
+
|
42
|
+
|
43
|
+
expected_chd = 1.0
|
44
|
+
expected_chd =opt[:e][0].to_f if expected_chd and opt[:e]
|
45
|
+
|
46
|
+
control_chd = 0.5
|
47
|
+
control_chd = opt[:c][0].to_f if control_chd and opt[:c]
|
48
|
+
|
49
|
+
known_snps = Hash.new { |h,k| h[k] = Array.new }
|
50
|
+
if opt[:s].first
|
51
|
+
File.open(opt[:s].first).each do |line|
|
52
|
+
chr,pos = line.split("\t")
|
53
|
+
known_snps[chr] << pos.to_i
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
|
58
|
+
$stderr.puts "using expected ChD: #{expected_chd} and control ChD: #{control_chd}"
|
59
|
+
|
60
|
+
hist_bins = [100000, 250000, 500000]
|
61
|
+
ks = [5, 7, 9, 11]
|
62
|
+
kadjusts = [0.5, 0.25, 0.1, 0.05, 0.01]
|
63
|
+
warn "Histogram bin sizes: #{hist_bins}\nThread clusters (K): #{ks}\nKernal adjusts: #{kadjusts}\n"
|
64
|
+
|
65
|
+
# LOAD FASTA and find contigs
|
66
|
+
sequences = Bio::DB::FastaLengthDB.new(:file => "#{opt[:f][0]}")
|
67
|
+
|
68
|
+
|
69
|
+
# For Each contig in the fasta file analyse...
|
70
|
+
|
71
|
+
sequences.each do |id,length|
|
72
|
+
warn "\nProcessing #{id}:1 - #{length}..."
|
73
|
+
|
74
|
+
warn "Skipping #{id} as too short ..." if length < (4 * hist_bins.max)
|
75
|
+
next if length < (4 * hist_bins.max)
|
76
|
+
|
77
|
+
g = Bio::Util::Gngm.new(:file => "#{opt[:b][0]}",
|
78
|
+
:format => :bam,
|
79
|
+
:fasta => "#{opt[:f][0]}",
|
80
|
+
:start => 1,
|
81
|
+
:stop => 10000,
|
82
|
+
:chromosome => id,
|
83
|
+
:samtools => {
|
84
|
+
:q => 20,
|
85
|
+
:Q => 20
|
86
|
+
},
|
87
|
+
:ignore_file => "#{opt[:s][0]}",
|
88
|
+
:write_pileup => "pileup.txt",
|
89
|
+
:write_vcf => "snps.vcf"
|
90
|
+
)
|
91
|
+
|
92
|
+
# predict SNPs
|
93
|
+
|
94
|
+
warn " Prediciting SNPs for #{id}:1-#{length}..."
|
95
|
+
g.snp_positions
|
96
|
+
|
97
|
+
#delete SNPs from known snp_list
|
98
|
+
#a = g.snp_positions.dup
|
99
|
+
#known_snps[seq.entry_id].each {|snp_pos| a.delete_if{|x| x.first == snp_pos} }
|
100
|
+
#$stderr.puts "deleted #{g.snp_positions.length - a.length} snps appearing in #{opt[:s]}"
|
101
|
+
#g.snp_positions = a
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
# produce SNP density histograms
|
106
|
+
|
107
|
+
warn " Iterating over different histogram bin sizes..."
|
108
|
+
hist_bins.each do |bin_width|
|
109
|
+
warn " Makings PNG for bin size #{bin_width}..."
|
110
|
+
file_name = "#{id}_SNP_histogram_bin#{bin_width}.png"
|
111
|
+
g.frequency_histogram("#{file_name}",bin_width, :title => "#{id}: SNP density histogram (bin width - #{bin_width})", :width => 1066, :height => 300)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Write to embedded HTML
|
115
|
+
|
116
|
+
htmlout = File.open("#{id}.html", 'w')
|
117
|
+
htmlout.puts "<html>\n"
|
118
|
+
htmlout.puts " <head>\n"
|
119
|
+
htmlout.puts " <title>GNGM #{id} - QTL mapping</title>\n"
|
120
|
+
htmlout.puts " <style type=\"text/css\">\n"
|
121
|
+
htmlout.puts " table,\n"
|
122
|
+
htmlout.puts " td,\n"
|
123
|
+
htmlout.puts " tbody,\n"
|
124
|
+
htmlout.puts " thead,\n"
|
125
|
+
htmlout.puts " thead th,\n"
|
126
|
+
htmlout.puts " tr.even,\n"
|
127
|
+
htmlout.puts " tr.odd {\n"
|
128
|
+
htmlout.puts " border: 0;\n"
|
129
|
+
htmlout.puts " }\n"
|
130
|
+
htmlout.puts " </style>\n"
|
131
|
+
htmlout.puts " </head>\n"
|
132
|
+
htmlout.puts " <body>\n\t\t"
|
133
|
+
htmlout.puts " <table>\n"
|
134
|
+
htmlout.puts " <tr>\n"
|
135
|
+
hist_bins.each do |bin_width|
|
136
|
+
htmlout.puts " <td>\n"
|
137
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
138
|
+
htmlout.puts [open("#{id}_SNP_histogram_bin#{bin_width}.png").read].pack("m")
|
139
|
+
File.delete("#{id}_SNP_histogram_bin#{bin_width}.png")
|
140
|
+
htmlout.puts "\" width=\"533\" height=\"150\"/>\n"
|
141
|
+
htmlout.puts " </td>\n"
|
142
|
+
end
|
143
|
+
htmlout.puts " </tr>\n"
|
144
|
+
htmlout.puts " </table>\n"
|
145
|
+
|
146
|
+
# Perform chastity calculations
|
147
|
+
|
148
|
+
warn " Collecting threads..."
|
149
|
+
g.collect_threads
|
150
|
+
warn " Iterating over k and kernel adjusts..."
|
151
|
+
ks.each do | k |
|
152
|
+
begin
|
153
|
+
warn " Makings PNG for k = #{k} ..."
|
154
|
+
warn " Calculating threads ..."
|
155
|
+
g.calculate_clusters(:k => k, :adjust => 0.5, :control_chd => control_chd, :expected_chd => expected_chd)
|
156
|
+
warn " Drawing threads ..."
|
157
|
+
filename = "#{id}_k#{k}_threads.png"
|
158
|
+
g.draw_threads(filename, :title => "#{id}: Chastity bands - all phases (k=#{k})", :width => 700, :height => 300)
|
159
|
+
warn " Clustering bands ..."
|
160
|
+
filename = "#{id}_k#{k}_clustered_bands.png"
|
161
|
+
g.draw_bands(filename, :title => "#{id}: Homozygous and heterozygous chastity belts (k=#{k})", :width => 800, :height => 300)
|
162
|
+
kadjusts.each do |kernel_adjust|
|
163
|
+
begin
|
164
|
+
warn " Calculating threads (with kernal adjust #{kernel_adjust}) ..."
|
165
|
+
g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => control_chd, :expected_chd => expected_chd)
|
166
|
+
warn " Calculating signal ..."
|
167
|
+
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png"
|
168
|
+
g.draw_signal(filename, :title => "#{id}: Homo/Het signal ratio (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
|
169
|
+
warn " Estimating peaks ..."
|
170
|
+
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png"
|
171
|
+
g.draw_peaks(filename, :title => "#{id}: Signal peaks (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
|
172
|
+
rescue => e
|
173
|
+
$stderr.puts "skipping #{k} #{kernel_adjust} => #{e}"
|
174
|
+
end
|
175
|
+
end
|
176
|
+
rescue => e
|
177
|
+
$stderr.puts "Skipping #{k} => #{e}"
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
g.close
|
182
|
+
|
183
|
+
# Write to embedded HTML
|
184
|
+
|
185
|
+
htmlout.puts " <table>\n"
|
186
|
+
|
187
|
+
# all bands
|
188
|
+
htmlout.puts " <tr>\n"
|
189
|
+
ks.each do | k |
|
190
|
+
htmlout.puts " <td>\n"
|
191
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
192
|
+
htmlout.puts [open("#{id}_k#{k}_threads.png").read].pack("m")
|
193
|
+
File.delete("#{id}_k#{k}_threads.png")
|
194
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
195
|
+
htmlout.puts " </td>\n"
|
196
|
+
end
|
197
|
+
htmlout.puts " </tr>\n"
|
198
|
+
|
199
|
+
# homo/het bands
|
200
|
+
htmlout.puts " <tr>\n"
|
201
|
+
ks.each do | k |
|
202
|
+
htmlout.puts " <td>\n"
|
203
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
204
|
+
htmlout.puts [open("#{id}_k#{k}_clustered_bands.png").read].pack("m")
|
205
|
+
File.delete("#{id}_k#{k}_clustered_bands.png")
|
206
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
207
|
+
htmlout.puts " </td>\n"
|
208
|
+
end
|
209
|
+
htmlout.puts " </tr>\n"
|
210
|
+
|
211
|
+
# k/adjusts
|
212
|
+
kadjusts.each do |kernel_adjust|
|
213
|
+
htmlout.puts " <tr>\n"
|
214
|
+
ks.each do | k |
|
215
|
+
htmlout.puts " <td>\n"
|
216
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
217
|
+
htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png").read].pack("m")
|
218
|
+
File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png")
|
219
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
220
|
+
htmlout.puts " </td>\n"
|
221
|
+
end
|
222
|
+
htmlout.puts " </tr>\n"
|
223
|
+
htmlout.puts " <tr>\n"
|
224
|
+
ks.each do | k |
|
225
|
+
htmlout.puts " <td>\n"
|
226
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
227
|
+
htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png").read].pack("m")
|
228
|
+
File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png")
|
229
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
230
|
+
htmlout.puts " </td>\n"
|
231
|
+
end
|
232
|
+
htmlout.puts " </tr>\n"
|
233
|
+
end
|
234
|
+
|
235
|
+
htmlout.puts " </table>\n"
|
236
|
+
htmlout.puts "\n </body>\n</html>\n"
|
237
|
+
|
238
|
+
htmlout.close
|
239
|
+
|
240
|
+
end
|
241
|
+
|
242
|
+
|
243
|
+
|
data/examples/pileup.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Chr1 23 C 4 AA.. !!II
|
2
|
+
Chr1 30 C 5 .-1T.-1T... !!HII
|
3
|
+
Chr1 33 T 5 .-1G.-1G... !!IHI
|
4
|
+
Chr1 37 T 6 AA.... !!IIII
|
5
|
+
Chr1 40 T 7 CC..... !!IHIID
|
6
|
+
Chr1 44 T 7 AA..... !!IIIFB
|
7
|
+
Chr1 711 T 16 ,.,,,..,,,.,,CC. >GIIDH;IIIHID48I
|
8
|
+
Chr1 1584 G 24 ,,.,..a,,,....,,.C,,.,.^]. IIIHIC.IIDGGII=DI'IFHIIE
|
9
|
+
Chr1 2544 T 34 ,,,,,c,,,...,,..,,.,,,..,,.,.,.,C. EIGEG&GIIIHIGG@IGGIDIHIIHEIHIIG=/D
|
10
|
+
Chr1 3975 G 22 .,....+1T......,...,-1t.,-1t.., HIHIIGGHDIEGFIFDCIDGBE
|
11
|
+
Chr1 4290 C 9 ,.,.,,-2at,-2at.-2AT. IGDIC@<II
|
12
|
+
Chr1 4292 T 11 ,.,.,***.C^]C F4DI+!!!I!!
|
13
|
+
Chr1 5699 C 25 tT.,,..,.....,..,......,. !);IH<AHD>;H;I09IDIGIIIGI
|
14
|
+
Chr1 5927 T 20 .,......,g,.,..g,... HI?IHIHII%IBDI=%HBHF
|
15
|
+
Chr1 5932 A 20 ,......,g,.,..t,.... >-HHIIIH&II7I=2HHHGH
|
16
|
+
Chr1 6324 T 23 ,$,,+1a,+1a.+1A.+1A.+1A.+1A,+1a,+1a.+1A,+1a,+1a,+1a,+1a.+1A,+1a,+1a,+1a..+1A.+1A.+1A BII1IIGIIFFIHDFGDIHF?4I
|
17
|
+
Chr1 7361 T 28 ,,,,,.$,,,,,..C,..,,A.,,,...^], HIFBIA?IIDHFF(GBIHH,.HBHI.GE
|
18
|
+
Chr1 7562 A 31 ..,.c.,,,,.,.t,,.,.......,.,... EFEE+>EEDGC@I)>AI<I?IHIGI8H=IID
|
19
|
+
Chr1 8003 C 21 .$,,..,.......-1A....,^]a^],^]a DIGHIGIAIGGEHIIIF7!0!
|
20
|
+
Chr1 8017 G 16 .A....A...,,,,,. A-<>76**1@BIGIII
|
21
|
+
Chr1 9066 G 28 .,,,.,.,,..,..,,.,,,c..,,C.^], DGHIHIGD2IGII;HHGI?I%IIIH(ID
|
22
|
+
Chr1 9971 C 19 ,....,,,,-2at.,.-2AT,-2at,,,... HI=HIGAGIGIIDI<IGIH
|
data/examples/snps.vcf
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Chr1 23 . C A,C 0 0 DP=4 GT:GQ:DP 0/1:0:4
|
2
|
+
Chr1 30 . C . 0 0 DP=5 GT:GQ:DP 0/0:0:5
|
3
|
+
Chr1 33 . T . 0 0 DP=5 GT:GQ:DP 0/0:0:5
|
4
|
+
Chr1 37 . T . 0 0 DP=6 GT:GQ:DP 0/0:0:6
|
5
|
+
Chr1 40 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
|
6
|
+
Chr1 44 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
|
7
|
+
Chr1 711 . T . 0 0 DP=16 GT:GQ:DP 0/0:0:16
|
8
|
+
Chr1 1584 . G . 0 0 DP=24 GT:GQ:DP 0/0:0:24
|
9
|
+
Chr1 2544 . T . 0 0 DP=34 GT:GQ:DP 0/0:0:34
|
10
|
+
Chr1 3975 . G . 0 0 DP=22 GT:GQ:DP 0/0:0:22
|
11
|
+
Chr1 4290 . C . 0 0 DP=9 GT:GQ:DP 0/0:0:9
|
12
|
+
Chr1 4292 . T . 0 0 DP=11 GT:GQ:DP 0/0:0:11
|
13
|
+
Chr1 5699 . C . 0 0 DP=25 GT:GQ:DP 0/0:0:25
|
14
|
+
Chr1 5927 . T . 0 0 DP=20 GT:GQ:DP 0/0:0:20
|
15
|
+
Chr1 5932 . A . 0 0 DP=20 GT:GQ:DP 0/0:0:20
|
16
|
+
Chr1 6324 . T . 0 0 DP=23 GT:GQ:DP 0/0:0:23
|
17
|
+
Chr1 7361 . T . 0 0 DP=28 GT:GQ:DP 0/0:0:28
|
18
|
+
Chr1 7562 . A . 0 0 DP=31 GT:GQ:DP 0/0:0:31
|
19
|
+
Chr1 8003 . C . 0 0 DP=21 GT:GQ:DP 0/0:0:21
|
20
|
+
Chr1 8017 . G . 0 0 DP=16 GT:GQ:DP 0/0:0:16
|
21
|
+
Chr1 9066 . G . 0 0 DP=28 GT:GQ:DP 0/0:0:28
|
22
|
+
Chr1 9971 . C . 0 0 DP=19 GT:GQ:DP 0/0:0:19
|
data/lib/bio-gngm.rb
CHANGED
data/lib/bio/util/bio-gngm.rb
CHANGED
@@ -227,14 +227,36 @@ link:images/signal.png
|
|
227
227
|
|
228
228
|
g = Bio::Util::Gngm.new(:file => "aln.sorted.bam",
|
229
229
|
:format => :bam,
|
230
|
-
:fasta => "reference.fasta",
|
231
|
-
:
|
230
|
+
:fasta => "reference.fasta",
|
231
|
+
:start => 100,
|
232
|
+
:stop => 200,
|
233
|
+
:write_pileup => "my_pileup_file.pileup",
|
234
|
+
:write_vcf => "my_vcf_file.vcf",
|
235
|
+
:ignore_file => "my_known_snps.txt",
|
236
|
+
:samtools => {
|
232
237
|
:q => 20,
|
233
238
|
:Q => 50
|
234
239
|
},
|
235
240
|
:min_non_ref_freq => 0.5,
|
236
|
-
:min_non_ref => 3
|
241
|
+
:min_non_ref => 3,
|
242
|
+
:start => 1,
|
243
|
+
:stop => 100000,
|
244
|
+
:chromosome => "Chr1",
|
245
|
+
:variant_call => {
|
246
|
+
:indels => false,
|
247
|
+
:min_depth => 6,
|
248
|
+
:max_depth => 250,
|
249
|
+
:mapping_quality => 20.0,
|
250
|
+
:min_non_ref_count => 2,
|
251
|
+
:ignore_reference_n => true,
|
252
|
+
:min_snp_quality => 20,
|
253
|
+
:min_consensus_quality => 20,
|
254
|
+
:substitutions => ["C:T","G:A"]
|
255
|
+
}
|
256
|
+
|
257
|
+
|
237
258
|
)
|
259
|
+
|
238
260
|
g.snp_positions
|
239
261
|
g.collect_threads(:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 )
|
240
262
|
[0.25, 0.5, 1.0].each do |kernel_adjust| # loop through different kernel values
|
@@ -311,10 +333,7 @@ The following R packages are required
|
|
311
333
|
Thanks very much indeed to Ryan Austin, who invented NGM in the first place and was very forthcoming with R code, around which this implementation is based.
|
312
334
|
|
313
335
|
== Using bio-gngm
|
314
|
-
|
315
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
316
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
317
|
-
require 'bio-samtools'
|
336
|
+
|
318
337
|
require 'bio-gngm'
|
319
338
|
|
320
339
|
== API
|
@@ -335,30 +354,37 @@ class Gngm
|
|
335
354
|
# :samtools => {:q => 20, :Q => 50},
|
336
355
|
# :fasta => "reference.fa",
|
337
356
|
# :start => 100,
|
338
|
-
# :stop => 200
|
357
|
+
# :stop => 200,
|
358
|
+
# :write_pileup => "my_pileup_file.pileup",
|
359
|
+
# :write_vcf => "my_vcf_file.vcf",
|
360
|
+
# :ignore_file => "my_known_snps.txt"
|
361
|
+
#
|
339
362
|
# )
|
340
363
|
#
|
341
364
|
#Required parameters and defaults:
|
342
|
-
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
|
343
|
-
#- <tt>:format => :bam</tt> -either :bam, :
|
365
|
+
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present. A pileup file, or tab-delimited text file can be used.
|
366
|
+
#- <tt>:format => :bam</tt> -either :bam, :pileup, :txt (pileup expected to be 10 col format from samtools -vcf)
|
344
367
|
#- <tt>:chromosome => "nil"</tt> -sequence id to look at
|
345
368
|
#- <tt>:start => nil</tt> -start position on that sequence
|
346
369
|
#- <tt>:stop => nil</tt> -stop position on that sequence
|
347
370
|
#- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
|
348
|
-
#- <tt>:
|
371
|
+
#- <tt>:write_pileup => false</tt> -the path to a file. SNPs will be written in pileup to this file (indels not output)
|
372
|
+
#- <tt>:write_vcf => false</tt> -the path to a file. SNPs will be written in VCF to this file (indels not output)
|
373
|
+
#- <tt>:ignore_file => false</tt> -file of SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". All SNPs in this file will be ignored
|
374
|
+
#- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details.
|
349
375
|
#Optional parameters and defaults:
|
350
|
-
|
376
|
+
#
|
351
377
|
#Most of these are parameters for specific methods and can be over-ridden when particular methods are called
|
352
|
-
#- <tt>:variant_call => {:indels => false
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
#
|
378
|
+
#- <tt>:variant_call => {:indels => false,</tt>
|
379
|
+
#- <tt> :min_depth => 2, </tt>
|
380
|
+
#- <tt> :max_depth => 10000000, </tt>
|
381
|
+
#- <tt> :min_snp_quality => 20, </tt>
|
382
|
+
#- <tt> :mapping_quality => 10.0, </tt>
|
383
|
+
#- <tt> :min_non_ref_count => 2, </tt>
|
384
|
+
#- <tt> :ignore_reference_n => true, </tt>
|
385
|
+
#- <tt> :min_consensus_quality => 20, </tt>
|
386
|
+
#- <tt> :min_snp_quality => 20 }</tt>.
|
387
|
+
# - For Pileup files from old samtools pileup -vcf, <tt>:min_consensus_quality</tt> can be applied
|
362
388
|
#- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
|
363
389
|
#- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
|
364
390
|
#- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
|
@@ -385,9 +411,12 @@ class Gngm
|
|
385
411
|
:fasta => nil,
|
386
412
|
:samtools => {:q => 20, :Q => 50},
|
387
413
|
:indels => false,
|
388
|
-
|
414
|
+
:write_pileup => false,
|
415
|
+
:write_vcf => false,
|
416
|
+
:ignore_file => false,
|
389
417
|
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
390
|
-
:variant_call => { :
|
418
|
+
:variant_call => { :indels => false,
|
419
|
+
:min_depth => 2,
|
391
420
|
:max_depth => 10000000,
|
392
421
|
:mapping_quality => 10.0,
|
393
422
|
:min_non_ref_count => 2,
|
@@ -412,6 +441,31 @@ class Gngm
|
|
412
441
|
}
|
413
442
|
@opts.merge!(options)
|
414
443
|
@opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
|
444
|
+
@pileup_outfile, @vcf_outfile = nil,nil
|
445
|
+
if @opts[:variant_call][:indels] and (@opts[:write_pileup] or @opts[:write_vcf])
|
446
|
+
$stderr.puts "Cannot yet output VCF/Pileup when generating INDELs. Turning output off."
|
447
|
+
@opts[:write_pileup] = false
|
448
|
+
@opts[:write_vcf] = false
|
449
|
+
end
|
450
|
+
if @opts[:write_pileup]
|
451
|
+
@pileup_outfile = File.open(@opts[:write_pileup], "w")
|
452
|
+
end
|
453
|
+
if @opts[:write_vcf]
|
454
|
+
@vcf_outfile = File.open(@opts[:write_vcf], "w")
|
455
|
+
end
|
456
|
+
|
457
|
+
@known_snps = Hash.new { |hash, chr| hash[chr] = {} } # unknown chromosomes map to an empty position table, so @known_snps[chr][pos] is nil rather than raising on nil
|
458
|
+
if @opts[:ignore_file]
|
459
|
+
File.open(@opts[:ignore_file], "r").each do |line|
|
460
|
+
col = line.chomp.split(/\t/)
|
461
|
+
if @known_snps[col[0]]
|
462
|
+
@known_snps[col[0]][col[1].to_i] = 1
|
463
|
+
else
|
464
|
+
@known_snps[col[0]] = Hash.new
|
465
|
+
@known_snps[col[0]][col[1].to_i] = 1
|
466
|
+
end
|
467
|
+
end
|
468
|
+
end
|
415
469
|
open_file
|
416
470
|
end
|
417
471
|
|
@@ -455,10 +509,8 @@ class Gngm
|
|
455
509
|
#- <tt>:max_depth => 10000000</tt> -maximum quality passing depth of coverage at a position for a SNP call
|
456
510
|
#- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
|
457
511
|
#- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
|
458
|
-
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
459
|
-
|
460
|
-
#- <tt>:snp_file => -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
|
461
|
-
#When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
|
512
|
+
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
513
|
+
#When INDEL calling only one of <tt>:indels</tt> should be used. If +false+, SNPs are called.
|
462
514
|
#
|
463
515
|
#calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
|
464
516
|
def snp_positions(optsa={})
|
@@ -495,7 +547,10 @@ class Gngm
|
|
495
547
|
|
496
548
|
if not @opts[:samtools][:g]
|
497
549
|
@file.mpileup(@opts[:samtools]) do |pileup|
|
498
|
-
|
550
|
+
if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts) and not @known_snps[pileup.ref_name][pileup.pos]
|
551
|
+
arr << [pileup.pos, pileup.discordant_chastity]
|
552
|
+
write(pileup)
|
553
|
+
end
|
499
554
|
end
|
500
555
|
else
|
501
556
|
@file.mpileup_plus(@opts[:samtools]) do |vcf|
|
@@ -503,9 +558,9 @@ class Gngm
|
|
503
558
|
next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
|
504
559
|
##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
|
505
560
|
if opts[:indels]
|
506
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
561
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
|
507
562
|
else
|
508
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
563
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
|
509
564
|
end
|
510
565
|
end
|
511
566
|
end
|
@@ -515,11 +570,6 @@ class Gngm
|
|
515
570
|
arr
|
516
571
|
end
|
517
572
|
|
518
|
-
private
|
519
|
-
def get_snp_positions_from_map(options={})
|
520
|
-
arr = []
|
521
|
-
opts = @opts[:variant_call].merge(options)
|
522
|
-
end
|
523
573
|
|
524
574
|
#this does not filter snps, other than to check they are in the right region and are allowed substitutions.. no qual control, assumed to be done prior
|
525
575
|
#text file is of format chr\tpos\tref\talt\tfreq\n
|
@@ -530,7 +580,7 @@ class Gngm
|
|
530
580
|
chr,pos,ref,alt,freq = line.chomp.split("\t")
|
531
581
|
pos = pos.to_i
|
532
582
|
freq = freq.to_f
|
533
|
-
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
|
583
|
+
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts) and not @known_snps[chr][pos]
|
534
584
|
arr << [pos, freq]
|
535
585
|
end
|
536
586
|
@snp_positions = arr
|
@@ -546,13 +596,25 @@ class Gngm
|
|
546
596
|
next
|
547
597
|
end
|
548
598
|
#old fashioned 10 col pileup format has extra fields we can use if needed
|
549
|
-
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
|
599
|
+
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil? and not @known_snps[pileup.ref_name][pileup.pos]
|
600
|
+
write(pileup)
|
550
601
|
arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
551
602
|
end
|
552
603
|
end
|
553
604
|
@snp_positions = arr
|
554
605
|
end
|
555
606
|
|
607
|
+
private
|
608
|
+
#writes out pileup/vcf files of SNPs that were used
|
609
|
+
def write(obj)
|
610
|
+
if @opts[:write_pileup]
|
611
|
+
@pileup_outfile.puts(obj.to_s)
|
612
|
+
end
|
613
|
+
if @opts[:write_vcf]
|
614
|
+
@vcf_outfile.puts(obj.to_vcf)
|
615
|
+
end
|
616
|
+
end
|
617
|
+
|
556
618
|
private
|
557
619
|
#Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
|
558
620
|
def get_insert_size_frequency_from_bam(opts={})
|