bio-gngm 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bio-gngm.gemspec +7 -3
- data/doc/Bio.html +4 -2
- data/doc/Bio/DB.html +4 -2
- data/doc/Bio/DB/FastaLengthDB.html +238 -0
- data/doc/Bio/DB/Pileup.html +10 -9
- data/doc/Bio/DB/Vcf.html +52 -102
- data/doc/Bio/Util.html +4 -2
- data/doc/Bio/Util/Gngm.html +318 -72
- data/doc/LICENSE_txt.html +12 -9
- data/doc/created.rid +3 -3
- data/doc/index.html +3 -1
- data/doc/js/search_index.js +1 -1
- data/doc/lib/bio-gngm_rb.html +2 -0
- data/doc/lib/bio/util/bio-gngm_rb.html +2 -0
- data/doc/table_of_contents.html +20 -7
- data/examples/Chr1.html +1483 -0
- data/examples/gngm_qtl_mapping_HTML_maker_set_options.rb +243 -0
- data/examples/pileup.txt +22 -0
- data/examples/snps.vcf +22 -0
- data/lib/bio-gngm.rb +0 -1
- data/lib/bio/util/bio-gngm.rb +100 -38
- data/test/test_bio-gngm.rb +1 -1
- metadata +30 -26
- data/lib/bio/util/mutation_effects.rb +0 -39
@@ -0,0 +1,243 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Author : Naveed Ishaque (inspired by Dan Maclean) edited again by Dan to include ChD value setting and deletion of SNPs file
|
4
|
+
# naveed.ishaque@tsl.ac.uk; naveed.ishaque@hotmail.co.uk
|
5
|
+
# Date: 20th June 2012 and 1st November 2012
|
6
|
+
|
7
|
+
# This script produces an HTML file with embedded images showing the SNP density and chastity plots for a given BAM file
|
8
|
+
# It will automatically iterate over all contigs, from beginning to end
|
9
|
+
# NOTE - run using ruby executable: /home/programs/gngm/ruby/bin/ruby
|
10
|
+
# NOTE - this will only run on the cluster via a bsub command
|
11
|
+
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bio'
|
15
|
+
require 'bio-gngm'
|
16
|
+
require 'base64'
|
17
|
+
require 'getoptions'
|
18
|
+
|
19
|
+
usage = "\n#{$PROGRAM_NAME} reads in a fasta and bam files and produces a html file indicating QTL locations as peaks\n\n\n\t #{$PROGRAM_NAME}\n\n\n -f [reference fasta file]\n -b [bam file]\n -e expected ChD (allele freq) default 1\n -c control ChD (allele freq) default 0.5\n -s List of known SNPS [tab delimited file]\n\n"
|
20
|
+
|
21
|
+
# PARSE INPUTS
|
22
|
+
|
23
|
+
opt = GetOptions.new(%w(h help f=@s b=@s e=@s c=@s s=@s))
|
24
|
+
|
25
|
+
puts "#{usage}" if opt[:h]
|
26
|
+
exit if opt[:h]
|
27
|
+
puts "#{usage}" if opt[:help]
|
28
|
+
exit if opt[:help]
|
29
|
+
|
30
|
+
puts "ERROR - no fasta file provided (-f)\n#{usage}" unless opt[:f]
|
31
|
+
exit unless opt[:f]
|
32
|
+
puts "ERROR - fasta file '#{opt[:f][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:f][0]}") # print the local usage text; $usage is an unset global and interpolated as an empty string
|
33
|
+
exit unless FileTest.exist?("#{opt[:f][0]}")
|
34
|
+
warn "\nUsing FASTA file #{opt[:f][0]}"
|
35
|
+
|
36
|
+
puts "ERROR - no bam file provided (-b)\n#{usage}" unless opt[:b]
|
37
|
+
exit unless opt[:b]
|
38
|
+
puts "ERROR - BAM file '#{opt[:b][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:b][0]}")
|
39
|
+
exit unless FileTest.exist?("#{opt[:b][0]}")
|
40
|
+
warn "Using BAM file #{opt[:b][0]}"
|
41
|
+
|
42
|
+
|
43
|
+
expected_chd = 1.0
|
44
|
+
expected_chd =opt[:e][0].to_f if expected_chd and opt[:e]
|
45
|
+
|
46
|
+
control_chd = 0.5
|
47
|
+
control_chd = opt[:c][0].to_f if control_chd and opt[:c]
|
48
|
+
|
49
|
+
known_snps = Hash.new { |h,k| h[k] = Array.new }
|
50
|
+
if opt[:s].first
|
51
|
+
File.open(opt[:s].first).each do |line|
|
52
|
+
chr,pos = line.split("\t")
|
53
|
+
known_snps[chr] << pos.to_i
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
|
58
|
+
$stderr.puts "using expected ChD: #{expected_chd} and control ChD: #{control_chd}"
|
59
|
+
|
60
|
+
hist_bins = [100000, 250000, 500000]
|
61
|
+
ks = [5, 7, 9, 11]
|
62
|
+
kadjusts = [0.5, 0.25, 0.1, 0.05, 0.01]
|
63
|
+
warn "Histogram bin sizes: #{hist_bins}\nThread clusters (K): #{ks}\nKernal adjusts: #{kadjusts}\n"
|
64
|
+
|
65
|
+
# LOAD FASTA and find contigs
|
66
|
+
sequences = Bio::DB::FastaLengthDB.new(:file => "#{opt[:f][0]}")
|
67
|
+
|
68
|
+
|
69
|
+
# For Each contig in the fasta file analyse...
|
70
|
+
|
71
|
+
sequences.each do |id,length|
|
72
|
+
warn "\nProcessing #{id}:1 - #{length}..."
|
73
|
+
|
74
|
+
warn "Skipping #{id} as too short ..." if length < (4 * hist_bins.max)
|
75
|
+
next if length < (4 * hist_bins.max)
|
76
|
+
|
77
|
+
g = Bio::Util::Gngm.new(:file => "#{opt[:b][0]}",
|
78
|
+
:format => :bam,
|
79
|
+
:fasta => "#{opt[:f][0]}",
|
80
|
+
:start => 1,
|
81
|
+
:stop => 10000,
|
82
|
+
:chromosome => id,
|
83
|
+
:samtools => {
|
84
|
+
:q => 20,
|
85
|
+
:Q => 20
|
86
|
+
},
|
87
|
+
:ignore_file => "#{opt[:s][0]}",
|
88
|
+
:write_pileup => "pileup.txt",
|
89
|
+
:write_vcf => "snps.vcf"
|
90
|
+
)
|
91
|
+
|
92
|
+
# predict SNPs
|
93
|
+
|
94
|
+
warn " Prediciting SNPs for #{id}:1-#{length}..."
|
95
|
+
g.snp_positions
|
96
|
+
|
97
|
+
#delete SNPs from known snp_list
|
98
|
+
#a = g.snp_positions.dup
|
99
|
+
#known_snps[seq.entry_id].each {|snp_pos| a.delete_if{|x| x.first == snp_pos} }
|
100
|
+
#$stderr.puts "deleted #{g.snp_positions.length - a.length} snps appearing in #{opt[:s]}"
|
101
|
+
#g.snp_positions = a
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
# produce SNP density histograms
|
106
|
+
|
107
|
+
warn " Iterating over different histogram bin sizes..."
|
108
|
+
hist_bins.each do |bin_width|
|
109
|
+
warn " Makings PNG for bin size #{bin_width}..."
|
110
|
+
file_name = "#{id}_SNP_histogram_bin#{bin_width}.png"
|
111
|
+
g.frequency_histogram("#{file_name}",bin_width, :title => "#{id}: SNP density histogram (bin width - #{bin_width})", :width => 1066, :height => 300)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Write to embedded HTML
|
115
|
+
|
116
|
+
htmlout = File.open("#{id}.html", 'w')
|
117
|
+
htmlout.puts "<html>\n"
|
118
|
+
htmlout.puts " <head>\n"
|
119
|
+
htmlout.puts " <title>GNGM #{id} - QTL mapping</title>\n"
|
120
|
+
htmlout.puts " <style type=\"text/css\">\n"
|
121
|
+
htmlout.puts " table,\n"
|
122
|
+
htmlout.puts " td,\n"
|
123
|
+
htmlout.puts " tbody,\n"
|
124
|
+
htmlout.puts " thead,\n"
|
125
|
+
htmlout.puts " thead th,\n"
|
126
|
+
htmlout.puts " tr.even,\n"
|
127
|
+
htmlout.puts " tr.odd {\n"
|
128
|
+
htmlout.puts " border: 0;\n"
|
129
|
+
htmlout.puts " }\n"
|
130
|
+
htmlout.puts " </style>\n"
|
131
|
+
htmlout.puts " </head>\n"
|
132
|
+
htmlout.puts " <body>\n\t\t"
|
133
|
+
htmlout.puts " <table>\n"
|
134
|
+
htmlout.puts " <tr>\n"
|
135
|
+
hist_bins.each do |bin_width|
|
136
|
+
htmlout.puts " <td>\n"
|
137
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
138
|
+
htmlout.puts [open("#{id}_SNP_histogram_bin#{bin_width}.png").read].pack("m")
|
139
|
+
File.delete("#{id}_SNP_histogram_bin#{bin_width}.png")
|
140
|
+
htmlout.puts "\" width=\"533\" height=\"150\"/>\n"
|
141
|
+
htmlout.puts " </td>\n"
|
142
|
+
end
|
143
|
+
htmlout.puts " </tr>\n"
|
144
|
+
htmlout.puts " </table>\n"
|
145
|
+
|
146
|
+
# Perform chastity calculations
|
147
|
+
|
148
|
+
warn " Collecting threads..."
|
149
|
+
g.collect_threads
|
150
|
+
warn " Iterating over k and kernel adjusts..."
|
151
|
+
ks.each do | k |
|
152
|
+
begin
|
153
|
+
warn " Makings PNG for k = #{k} ..."
|
154
|
+
warn " Calculating threads ..."
|
155
|
+
g.calculate_clusters(:k => k, :adjust => 0.5, :control_chd => control_chd, :expected_chd => expected_chd)
|
156
|
+
warn " Drawing threads ..."
|
157
|
+
filename = "#{id}_k#{k}_threads.png"
|
158
|
+
g.draw_threads(filename, :title => "#{id}: Chastity bands - all phases (k=#{k})", :width => 700, :height => 300)
|
159
|
+
warn " Clustering bands ..."
|
160
|
+
filename = "#{id}_k#{k}_clustered_bands.png"
|
161
|
+
g.draw_bands(filename, :title => "#{id}: Homozygous and heterozygous chastity belts (k=#{k})", :width => 800, :height => 300)
|
162
|
+
kadjusts.each do |kernel_adjust|
|
163
|
+
begin
|
164
|
+
warn " Calculating threads (with kernal adjust #{kernel_adjust}) ..."
|
165
|
+
g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => control_chd, :expected_chd => expected_chd)
|
166
|
+
warn " Calculating signal ..."
|
167
|
+
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png"
|
168
|
+
g.draw_signal(filename, :title => "#{id}: Homo/Het signal ratio (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
|
169
|
+
warn " Estimating peaks ..."
|
170
|
+
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png"
|
171
|
+
g.draw_peaks(filename, :title => "#{id}: Signal peaks (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
|
172
|
+
rescue => e
|
173
|
+
$stderr.puts "skipping #{k} #{kernel_adjust} => #{e}"
|
174
|
+
end
|
175
|
+
end
|
176
|
+
rescue => e
|
177
|
+
$stderr.puts "Skipping #{k} => #{e}"
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
g.close
|
182
|
+
|
183
|
+
# Write to embedded HTML
|
184
|
+
|
185
|
+
htmlout.puts " <table>\n"
|
186
|
+
|
187
|
+
# all bands
|
188
|
+
htmlout.puts " <tr>\n"
|
189
|
+
ks.each do | k |
|
190
|
+
htmlout.puts " <td>\n"
|
191
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
192
|
+
htmlout.puts [open("#{id}_k#{k}_threads.png").read].pack("m")
|
193
|
+
File.delete("#{id}_k#{k}_threads.png")
|
194
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
195
|
+
htmlout.puts " </td>\n"
|
196
|
+
end
|
197
|
+
htmlout.puts " </tr>\n"
|
198
|
+
|
199
|
+
# homo/het bands
|
200
|
+
htmlout.puts " <tr>\n"
|
201
|
+
ks.each do | k |
|
202
|
+
htmlout.puts " <td>\n"
|
203
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
204
|
+
htmlout.puts [open("#{id}_k#{k}_clustered_bands.png").read].pack("m")
|
205
|
+
File.delete("#{id}_k#{k}_clustered_bands.png")
|
206
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
207
|
+
htmlout.puts " </td>\n"
|
208
|
+
end
|
209
|
+
htmlout.puts " </tr>\n"
|
210
|
+
|
211
|
+
# k/adjusts
|
212
|
+
kadjusts.each do |kernel_adjust|
|
213
|
+
htmlout.puts " <tr>\n"
|
214
|
+
ks.each do | k |
|
215
|
+
htmlout.puts " <td>\n"
|
216
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
217
|
+
htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png").read].pack("m")
|
218
|
+
File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png")
|
219
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
220
|
+
htmlout.puts " </td>\n"
|
221
|
+
end
|
222
|
+
htmlout.puts " </tr>\n"
|
223
|
+
htmlout.puts " <tr>\n"
|
224
|
+
ks.each do | k |
|
225
|
+
htmlout.puts " <td>\n"
|
226
|
+
htmlout.puts "<img src=\"data:image/gif;base64,"
|
227
|
+
htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png").read].pack("m")
|
228
|
+
File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png")
|
229
|
+
htmlout.puts "\" width=\"400\" height=\"150\"/>"
|
230
|
+
htmlout.puts " </td>\n"
|
231
|
+
end
|
232
|
+
htmlout.puts " </tr>\n"
|
233
|
+
end
|
234
|
+
|
235
|
+
htmlout.puts " </table>\n"
|
236
|
+
htmlout.puts "\n </body>\n</html>\n"
|
237
|
+
|
238
|
+
htmlout.close
|
239
|
+
|
240
|
+
end
|
241
|
+
|
242
|
+
|
243
|
+
|
data/examples/pileup.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Chr1 23 C 4 AA.. !!II
|
2
|
+
Chr1 30 C 5 .-1T.-1T... !!HII
|
3
|
+
Chr1 33 T 5 .-1G.-1G... !!IHI
|
4
|
+
Chr1 37 T 6 AA.... !!IIII
|
5
|
+
Chr1 40 T 7 CC..... !!IHIID
|
6
|
+
Chr1 44 T 7 AA..... !!IIIFB
|
7
|
+
Chr1 711 T 16 ,.,,,..,,,.,,CC. >GIIDH;IIIHID48I
|
8
|
+
Chr1 1584 G 24 ,,.,..a,,,....,,.C,,.,.^]. IIIHIC.IIDGGII=DI'IFHIIE
|
9
|
+
Chr1 2544 T 34 ,,,,,c,,,...,,..,,.,,,..,,.,.,.,C. EIGEG&GIIIHIGG@IGGIDIHIIHEIHIIG=/D
|
10
|
+
Chr1 3975 G 22 .,....+1T......,...,-1t.,-1t.., HIHIIGGHDIEGFIFDCIDGBE
|
11
|
+
Chr1 4290 C 9 ,.,.,,-2at,-2at.-2AT. IGDIC@<II
|
12
|
+
Chr1 4292 T 11 ,.,.,***.C^]C F4DI+!!!I!!
|
13
|
+
Chr1 5699 C 25 tT.,,..,.....,..,......,. !);IH<AHD>;H;I09IDIGIIIGI
|
14
|
+
Chr1 5927 T 20 .,......,g,.,..g,... HI?IHIHII%IBDI=%HBHF
|
15
|
+
Chr1 5932 A 20 ,......,g,.,..t,.... >-HHIIIH&II7I=2HHHGH
|
16
|
+
Chr1 6324 T 23 ,$,,+1a,+1a.+1A.+1A.+1A.+1A,+1a,+1a.+1A,+1a,+1a,+1a,+1a.+1A,+1a,+1a,+1a..+1A.+1A.+1A BII1IIGIIFFIHDFGDIHF?4I
|
17
|
+
Chr1 7361 T 28 ,,,,,.$,,,,,..C,..,,A.,,,...^], HIFBIA?IIDHFF(GBIHH,.HBHI.GE
|
18
|
+
Chr1 7562 A 31 ..,.c.,,,,.,.t,,.,.......,.,... EFEE+>EEDGC@I)>AI<I?IHIGI8H=IID
|
19
|
+
Chr1 8003 C 21 .$,,..,.......-1A....,^]a^],^]a DIGHIGIAIGGEHIIIF7!0!
|
20
|
+
Chr1 8017 G 16 .A....A...,,,,,. A-<>76**1@BIGIII
|
21
|
+
Chr1 9066 G 28 .,,,.,.,,..,..,,.,,,c..,,C.^], DGHIHIGD2IGII;HHGI?I%IIIH(ID
|
22
|
+
Chr1 9971 C 19 ,....,,,,-2at.,.-2AT,-2at,,,... HI=HIGAGIGIIDI<IGIH
|
data/examples/snps.vcf
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Chr1 23 . C A,C 0 0 DP=4 GT:GQ:DP 0/1:0:4
|
2
|
+
Chr1 30 . C . 0 0 DP=5 GT:GQ:DP 0/0:0:5
|
3
|
+
Chr1 33 . T . 0 0 DP=5 GT:GQ:DP 0/0:0:5
|
4
|
+
Chr1 37 . T . 0 0 DP=6 GT:GQ:DP 0/0:0:6
|
5
|
+
Chr1 40 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
|
6
|
+
Chr1 44 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
|
7
|
+
Chr1 711 . T . 0 0 DP=16 GT:GQ:DP 0/0:0:16
|
8
|
+
Chr1 1584 . G . 0 0 DP=24 GT:GQ:DP 0/0:0:24
|
9
|
+
Chr1 2544 . T . 0 0 DP=34 GT:GQ:DP 0/0:0:34
|
10
|
+
Chr1 3975 . G . 0 0 DP=22 GT:GQ:DP 0/0:0:22
|
11
|
+
Chr1 4290 . C . 0 0 DP=9 GT:GQ:DP 0/0:0:9
|
12
|
+
Chr1 4292 . T . 0 0 DP=11 GT:GQ:DP 0/0:0:11
|
13
|
+
Chr1 5699 . C . 0 0 DP=25 GT:GQ:DP 0/0:0:25
|
14
|
+
Chr1 5927 . T . 0 0 DP=20 GT:GQ:DP 0/0:0:20
|
15
|
+
Chr1 5932 . A . 0 0 DP=20 GT:GQ:DP 0/0:0:20
|
16
|
+
Chr1 6324 . T . 0 0 DP=23 GT:GQ:DP 0/0:0:23
|
17
|
+
Chr1 7361 . T . 0 0 DP=28 GT:GQ:DP 0/0:0:28
|
18
|
+
Chr1 7562 . A . 0 0 DP=31 GT:GQ:DP 0/0:0:31
|
19
|
+
Chr1 8003 . C . 0 0 DP=21 GT:GQ:DP 0/0:0:21
|
20
|
+
Chr1 8017 . G . 0 0 DP=16 GT:GQ:DP 0/0:0:16
|
21
|
+
Chr1 9066 . G . 0 0 DP=28 GT:GQ:DP 0/0:0:28
|
22
|
+
Chr1 9971 . C . 0 0 DP=19 GT:GQ:DP 0/0:0:19
|
data/lib/bio-gngm.rb
CHANGED
data/lib/bio/util/bio-gngm.rb
CHANGED
@@ -227,14 +227,36 @@ link:images/signal.png
|
|
227
227
|
|
228
228
|
g = Bio::Util::Gngm.new(:file => "aln.sorted.bam",
|
229
229
|
:format => :bam,
|
230
|
-
:fasta => "reference.fasta",
|
231
|
-
:
|
230
|
+
:fasta => "reference.fasta",
|
231
|
+
:start => 100,
|
232
|
+
:stop => 200,
|
233
|
+
:write_pileup => "my_pileup_file.pileup",
|
234
|
+
:write_vcf => "my_vcf_file.vcf",
|
235
|
+
:ignore_file => "my_known_snps.txt",
|
236
|
+
:samtools => {
|
232
237
|
:q => 20,
|
233
238
|
:Q => 50
|
234
239
|
},
|
235
240
|
:min_non_ref_freq => 0.5,
|
236
|
-
:min_non_ref => 3
|
241
|
+
:min_non_ref => 3,
|
242
|
+
:start => 1,
|
243
|
+
:stop => 100000,
|
244
|
+
:chromosome => "Chr1",
|
245
|
+
:variant_call => {
|
246
|
+
:indels => false,
|
247
|
+
:min_depth => 6,
|
248
|
+
:max_depth => 250,
|
249
|
+
:mapping_quality => 20.0,
|
250
|
+
:min_non_ref_count => 2,
|
251
|
+
:ignore_reference_n => true,
|
252
|
+
:min_snp_quality => 20,
|
253
|
+
:min_consensus_quality => 20,
|
254
|
+
:substitutions => ["C:T","G:A"]
|
255
|
+
}
|
256
|
+
|
257
|
+
|
237
258
|
)
|
259
|
+
|
238
260
|
g.snp_positions
|
239
261
|
g.collect_threads(:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 )
|
240
262
|
[0.25, 0.5, 1.0].each do |kernel_adjust| # loop through different kernel values
|
@@ -311,10 +333,7 @@ The following R packages are required
|
|
311
333
|
Thanks very much indeed to Ryan Austin, who invented NGM in the first place and was very forthcoming with R code, around which this implementation is based.
|
312
334
|
|
313
335
|
== Using bio-gngm
|
314
|
-
|
315
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
316
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
317
|
-
require 'bio-samtools'
|
336
|
+
|
318
337
|
require 'bio-gngm'
|
319
338
|
|
320
339
|
== API
|
@@ -335,30 +354,37 @@ class Gngm
|
|
335
354
|
# :samtools => {:q => 20, :Q => 50},
|
336
355
|
# :fasta => "reference.fa"
|
337
356
|
# :start => 100,
|
338
|
-
# :stop => 200
|
357
|
+
# :stop => 200,
|
358
|
+
# :write_pileup => "my_pileup_file.pileup",
|
359
|
+
# :write_vcf => "my_vcf_file.vcf",
|
360
|
+
# :ignore_file => "my_known_snps.txt"
|
361
|
+
#
|
339
362
|
# )
|
340
363
|
#
|
341
364
|
#Required parameters and defaults:
|
342
|
-
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
|
343
|
-
#- <tt>:format => :bam</tt> -either :bam, :
|
365
|
+
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present. A pileup file, or tab-delimited text file can be used.
|
366
|
+
#- <tt>:format => :bam</tt> -either :bam, :pileup, :txt (pileup expected to be 10 col format from samtools -vcf)
|
344
367
|
#- <tt>:chromosome => "nil"</tt> -sequence id to look at
|
345
368
|
#- <tt>:start => nil</tt> -start position on that sequence
|
346
369
|
#- <tt>:stop => nil</tt> -stop position on that sequence
|
347
370
|
#- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
|
348
|
-
#- <tt>:
|
371
|
+
#- <tt>:write_pileup => false</tt> -the path to a file. SNPs will be written in pileup to this file (indels not output)
|
372
|
+
#- <tt>:write_vcf => false</tt> -the path to a file. SNPs will be written in VCF to this file (indels not output)
|
373
|
+
#- <tt>:ignore_file => false</tt> -file of SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". All SNPs in this file will be ignored
|
374
|
+
#- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details.
|
349
375
|
#Optional parameters and defaults:
|
350
|
-
|
376
|
+
#
|
351
377
|
#Most of these are parameters for specific methods and can be over-ridden when particular methods are called
|
352
|
-
#- <tt>:variant_call => {:indels => false
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
#
|
378
|
+
#- <tt>:variant_call => {:indels => false,</tt>
|
379
|
+
#- <tt> :min_depth => 2, </tt>
|
380
|
+
#- <tt> :max_depth => 10000000, </tt>
|
381
|
+
#- <tt> :min_snp_quality => 20, </tt>
|
382
|
+
#- <tt> :mapping_quality => 10.0, </tt>
|
383
|
+
#- <tt> :min_non_ref_count => 2, </tt>
|
384
|
+
#- <tt> :ignore_reference_n => true, </tt>
|
385
|
+
#- <tt> :min_consensus_quality => 20, </tt>
|
386
|
+
#- <tt> :min_snp_quality => 20 }</tt>.
|
387
|
+
# - For Pileup files from old samtools pileup -vcf, <tt>:min_consensus_quality</tt> can be applied
|
362
388
|
#- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
|
363
389
|
#- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
|
364
390
|
#- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
|
@@ -385,9 +411,12 @@ class Gngm
|
|
385
411
|
:fasta => nil,
|
386
412
|
:samtools => {:q => 20, :Q => 50},
|
387
413
|
:indels => false,
|
388
|
-
|
414
|
+
:write_pileup => false,
|
415
|
+
:write_vcf => false,
|
416
|
+
:ignore_file => false,
|
389
417
|
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
390
|
-
:variant_call => { :
|
418
|
+
:variant_call => { :indels => false,
|
419
|
+
:min_depth => 2,
|
391
420
|
:max_depth => 10000000,
|
392
421
|
:mapping_quality => 10.0,
|
393
422
|
:min_non_ref_count => 2,
|
@@ -412,6 +441,31 @@ class Gngm
|
|
412
441
|
}
|
413
442
|
@opts.merge!(options)
|
414
443
|
@opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
|
444
|
+
@pileup_outfile, @vcf_outfile = nil,nil
|
445
|
+
if @opts[:variant_call][:indels] and (@opts[:write_pileup] or @opts[:write_vcf])
|
446
|
+
$stderr.puts "Cannot yet output VCF/Pileup when generating INDELs. Turning output off."
|
447
|
+
@opts[:write_pileup] = false
|
448
|
+
@opts[:write_vcf] = false
|
449
|
+
end
|
450
|
+
if @opts[:write_pileup]
|
451
|
+
@pileup_outfile = File.open(@opts[:write_pileup], "w")
|
452
|
+
end
|
453
|
+
if @opts[:write_vcf]
|
454
|
+
@vcf_outfile = File.open(@opts[:write_vcf], "w")
|
455
|
+
end
|
456
|
+
|
457
|
+
# sequence id => { position => 1 }; the default block gives unseen ids an empty hash so @known_snps[id][pos] returns nil instead of raising
@known_snps = Hash.new { |hash, seq_id| hash[seq_id] = Hash.new }
|
458
|
+
if @opts[:ignore_file]
|
459
|
+
File.open(@opts[:ignore_file], "r").each do |line|
|
460
|
+
col = line.chomp.split(/\t/)
|
461
|
+
if @known_snps[col[0]]
|
462
|
+
@known_snps[col[0]][col[1].to_i] = 1
|
463
|
+
else
|
464
|
+
@known_snps[col[0]] = Hash.new
|
465
|
+
@known_snps[col[0]][col[1].to_i] = 1
|
466
|
+
end
|
467
|
+
end
|
468
|
+
end
|
415
469
|
open_file
|
416
470
|
end
|
417
471
|
|
@@ -455,10 +509,8 @@ class Gngm
|
|
455
509
|
#- <tt>:max_depth => 10000000</tt> -maximum quality passing depth of coverage at a position for a SNP call
|
456
510
|
#- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
|
457
511
|
#- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
|
458
|
-
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
459
|
-
|
460
|
-
#- <tt>:snp_file => -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
|
461
|
-
#When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
|
512
|
+
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
513
|
+
#To call INDELs instead of SNPs set <tt>:indels</tt> to +true+. If +false+, SNPs are called.
|
462
514
|
#
|
463
515
|
#calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
|
464
516
|
def snp_positions(optsa={})
|
@@ -495,7 +547,10 @@ class Gngm
|
|
495
547
|
|
496
548
|
if not @opts[:samtools][:g]
|
497
549
|
@file.mpileup(@opts[:samtools]) do |pileup|
|
498
|
-
|
550
|
+
if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts) and not @known_snps[pileup.ref_name][pileup.pos]
|
551
|
+
arr << [pileup.pos, pileup.discordant_chastity]
|
552
|
+
write(pileup)
|
553
|
+
end
|
499
554
|
end
|
500
555
|
else
|
501
556
|
@file.mpileup_plus(@opts[:samtools]) do |vcf|
|
@@ -503,9 +558,9 @@ class Gngm
|
|
503
558
|
next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
|
504
559
|
##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
|
505
560
|
if opts[:indels]
|
506
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
561
|
+
# known SNPs are keyed by sequence id, so look up vcf.chrom (vcf.ref is the reference base); fetch tolerates ids absent from the ignore list
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps.fetch(vcf.chrom, {})[vcf.pos]
|
507
562
|
else
|
508
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
563
|
+
# known SNPs are keyed by sequence id, so look up vcf.chrom (vcf.ref is the reference base); fetch tolerates ids absent from the ignore list
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps.fetch(vcf.chrom, {})[vcf.pos]
|
509
564
|
end
|
510
565
|
end
|
511
566
|
end
|
@@ -515,11 +570,6 @@ class Gngm
|
|
515
570
|
arr
|
516
571
|
end
|
517
572
|
|
518
|
-
private
|
519
|
-
def get_snp_positions_from_map(options={})
|
520
|
-
arr = []
|
521
|
-
opts = @opts[:variant_call].merge(options)
|
522
|
-
end
|
523
573
|
|
524
574
|
#this does not filter snps, other than to check they are in the right region and are allowed substitutions.. no qual control, assumed to be done prior
|
525
575
|
#text file is of format chr\tpos\tref\talt\tfreq\n
|
@@ -530,7 +580,7 @@ class Gngm
|
|
530
580
|
chr,pos,ref,alt,freq = line.chomp.split("\t")
|
531
581
|
pos = pos.to_i
|
532
582
|
freq = freq.to_f
|
533
|
-
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
|
583
|
+
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts) and not @known_snps[chr][pos]
|
534
584
|
arr << [pos, freq]
|
535
585
|
end
|
536
586
|
@snp_positions = arr
|
@@ -546,13 +596,25 @@ class Gngm
|
|
546
596
|
next
|
547
597
|
end
|
548
598
|
#old fashioned 10 col pileup format has extra fields we can use if needed
|
549
|
-
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
|
599
|
+
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil? and not @known_snps[pileup.ref_name][pileup.pos]
|
600
|
+
write(pileup)
|
550
601
|
arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
551
602
|
end
|
552
603
|
end
|
553
604
|
@snp_positions = arr
|
554
605
|
end
|
555
606
|
|
607
|
+
private
|
608
|
+
#writes out pileup/vcf files of SNPs that were used
|
609
|
+
def write(obj)
|
610
|
+
if @opts[:write_pileup]
|
611
|
+
@pileup_outfile.puts(obj.to_s)
|
612
|
+
end
|
613
|
+
if @opts[:write_vcf]
|
614
|
+
@vcf_outfile.puts(obj.to_vcf)
|
615
|
+
end
|
616
|
+
end
|
617
|
+
|
556
618
|
private
|
557
619
|
#Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
|
558
620
|
def get_insert_size_frequency_from_bam(opts={})
|