bio-gngm 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Author : Naveed Ishaque (inspired by Dan Maclean) edited again by Dan to include ChD value setting and deletion of SNPs file
4
+ # naveed.ishaque@tsl.ac.uk; naveed.ishaque@hotmail.co.uk
5
+ # Date: 20th June 2012 and 1st November 2012
6
+
7
+ # This script produces an HTML file with embedded images showing the SNP density and chastity plots for a given BAM file
8
+ # It will automatically iterate over all contigs, from beginning to end
9
+ # NOTE - run using ruby executable: /home/programs/gngm/ruby/bin/ruby
10
+ # NOTE - this will only run on the cluster via a bsub command
11
+
12
+
13
+ require 'rubygems'
14
+ require 'bio'
15
+ require 'bio-gngm'
16
+ require 'base64'
17
+ require 'getoptions'
18
+
19
+ usage = "\n#{$PROGRAM_NAME} reads in a fasta and bam files and produces a html file indicating QTL locations as peaks\n\n\n\t #{$PROGRAM_NAME}\n\n\n -f [reference fasta file]\n -b [bam file]\n -e expected ChD (allele freq) default 1\n -c control ChD (allele freq) default 0.5\n -s List of known SNPS [tab delimited file]\n\n"
20
+
21
+ # PARSE INPUTS
22
+
23
+ opt = GetOptions.new(%w(h help f=@s b=@s e=@s c=@s s=@s))
24
+
25
+ puts "#{usage}" if opt[:h]
26
+ exit if opt[:h]
27
+ puts "#{usage}" if opt[:help]
28
+ exit if opt[:help]
29
+
30
+ puts "ERROR - no fasta file provided (-f)\n#{usage}" unless opt[:f]
31
+ exit unless opt[:f]
32
+ puts "ERROR - fasta file '#{opt[:f][0]}' does not exist\n#{$usage}" unless FileTest.exist?("#{opt[:f][0]}")
33
+ exit unless FileTest.exist?("#{opt[:f][0]}")
34
+ warn "\nUsing FASTA file #{opt[:f][0]}"
35
+
36
+ puts "ERROR - no bam file provided (-b)\n#{usage}" unless opt[:b]
37
+ exit unless opt[:b]
38
+ puts "ERROR - BAM file '#{opt[:b][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:b][0]}")
39
+ exit unless FileTest.exist?("#{opt[:b][0]}")
40
+ warn "Using BAM file #{opt[:b][0]}"
41
+
42
+
43
+ expected_chd = 1.0
44
+ expected_chd =opt[:e][0].to_f if expected_chd and opt[:e]
45
+
46
+ control_chd = 0.5
47
+ control_chd = opt[:c][0].to_f if control_chd and opt[:c]
48
+
49
+ known_snps = Hash.new { |h,k| h[k] = Array.new }
50
+ if opt[:s].first
51
+ File.open(opt[:s].first).each do |line|
52
+ chr,pos = line.split("\t")
53
+ known_snps[chr] << pos.to_i
54
+ end
55
+ end
56
+
57
+
58
+ $stderr.puts "using expected ChD: #{expected_chd} and control ChD: #{control_chd}"
59
+
60
+ hist_bins = [100000, 250000, 500000]
61
+ ks = [5, 7, 9, 11]
62
+ kadjusts = [0.5, 0.25, 0.1, 0.05, 0.01]
63
+ warn "Histogram bin sizes: #{hist_bins}\nThread clusters (K): #{ks}\nKernal adjusts: #{kadjusts}\n"
64
+
65
+ # LOAD FASTA and find contigs
66
+ sequences = Bio::DB::FastaLengthDB.new(:file => "#{opt[:f][0]}")
67
+
68
+
69
+ # For Each contig in the fasta file analyse...
70
+
71
+ sequences.each do |id,length|
72
+ warn "\nProcessing #{id}:1 - #{length}..."
73
+
74
+ warn "Skipping #{id} as too short ..." if length < (4 * hist_bins.max)
75
+ next if length < (4 * hist_bins.max)
76
+
77
+ g = Bio::Util::Gngm.new(:file => "#{opt[:b][0]}",
78
+ :format => :bam,
79
+ :fasta => "#{opt[:f][0]}",
80
+ :start => 1,
81
+ :stop => 10000,
82
+ :chromosome => id,
83
+ :samtools => {
84
+ :q => 20,
85
+ :Q => 20
86
+ },
87
+ :ignore_file => "#{opt[:s][0]}",
88
+ :write_pileup => "pileup.txt",
89
+ :write_vcf => "snps.vcf"
90
+ )
91
+
92
+ # predict SNPs
93
+
94
+ warn " Prediciting SNPs for #{id}:1-#{length}..."
95
+ g.snp_positions
96
+
97
+ #delete SNPs from known snp_list
98
+ #a = g.snp_positions.dup
99
+ #known_snps[seq.entry_id].each {|snp_pos| a.delete_if{|x| x.first == snp_pos} }
100
+ #$stderr.puts "deleted #{g.snp_positions.length - a.length} snps appearing in #{opt[:s]}"
101
+ #g.snp_positions = a
102
+
103
+
104
+
105
+ # produce SNP density histograms
106
+
107
+ warn " Iterating over different histogram bin sizes..."
108
+ hist_bins.each do |bin_width|
109
+ warn " Makings PNG for bin size #{bin_width}..."
110
+ file_name = "#{id}_SNP_histogram_bin#{bin_width}.png"
111
+ g.frequency_histogram("#{file_name}",bin_width, :title => "#{id}: SNP density histogram (bin width - #{bin_width})", :width => 1066, :height => 300)
112
+ end
113
+
114
+ # Write to embedded HTML
115
+
116
+ htmlout = File.open("#{id}.html", 'w')
117
+ htmlout.puts "<html>\n"
118
+ htmlout.puts " <head>\n"
119
+ htmlout.puts " <title>GNGM #{id} - QTL mapping</title>\n"
120
+ htmlout.puts " <style type=\"text/css\">\n"
121
+ htmlout.puts " table,\n"
122
+ htmlout.puts " td,\n"
123
+ htmlout.puts " tbody,\n"
124
+ htmlout.puts " thead,\n"
125
+ htmlout.puts " thead th,\n"
126
+ htmlout.puts " tr.even,\n"
127
+ htmlout.puts " tr.odd {\n"
128
+ htmlout.puts " border: 0;\n"
129
+ htmlout.puts " }\n"
130
+ htmlout.puts " </style>\n"
131
+ htmlout.puts " </head>\n"
132
+ htmlout.puts " <body>\n\t\t"
133
+ htmlout.puts " <table>\n"
134
+ htmlout.puts " <tr>\n"
135
+ hist_bins.each do |bin_width|
136
+ htmlout.puts " <td>\n"
137
+ htmlout.puts "<img src=\"data:image/gif;base64,"
138
+ htmlout.puts [open("#{id}_SNP_histogram_bin#{bin_width}.png").read].pack("m")
139
+ File.delete("#{id}_SNP_histogram_bin#{bin_width}.png")
140
+ htmlout.puts "\" width=\"533\" height=\"150\"/>\n"
141
+ htmlout.puts " </td>\n"
142
+ end
143
+ htmlout.puts " </tr>\n"
144
+ htmlout.puts " </table>\n"
145
+
146
+ # Perform chastity calculations
147
+
148
+ warn " Collecting threads..."
149
+ g.collect_threads
150
+ warn " Iterating over k and kernel adjusts..."
151
+ ks.each do | k |
152
+ begin
153
+ warn " Makings PNG for k = #{k} ..."
154
+ warn " Calculating threads ..."
155
+ g.calculate_clusters(:k => k, :adjust => 0.5, :control_chd => control_chd, :expected_chd => expected_chd)
156
+ warn " Drawing threads ..."
157
+ filename = "#{id}_k#{k}_threads.png"
158
+ g.draw_threads(filename, :title => "#{id}: Chastity bands - all phases (k=#{k})", :width => 700, :height => 300)
159
+ warn " Clustering bands ..."
160
+ filename = "#{id}_k#{k}_clustered_bands.png"
161
+ g.draw_bands(filename, :title => "#{id}: Homozygous and heterozygous chastity belts (k=#{k})", :width => 800, :height => 300)
162
+ kadjusts.each do |kernel_adjust|
163
+ begin
164
+ warn " Calculating threads (with kernal adjust #{kernel_adjust}) ..."
165
+ g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => control_chd, :expected_chd => expected_chd)
166
+ warn " Calculating signal ..."
167
+ filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png"
168
+ g.draw_signal(filename, :title => "#{id}: Homo/Het signal ratio (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
169
+ warn " Estimating peaks ..."
170
+ filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png"
171
+ g.draw_peaks(filename, :title => "#{id}: Signal peaks (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
172
+ rescue => e
173
+ $stderr.puts "skipping #{k} #{kernel_adjust} => #{e}"
174
+ end
175
+ end
176
+ rescue => e
177
+ $stderr.puts "Skipping #{k} => #{e}"
178
+ end
179
+ end
180
+
181
+ g.close
182
+
183
+ # Write to embedded HTML
184
+
185
+ htmlout.puts " <table>\n"
186
+
187
+ # all bands
188
+ htmlout.puts " <tr>\n"
189
+ ks.each do | k |
190
+ htmlout.puts " <td>\n"
191
+ htmlout.puts "<img src=\"data:image/gif;base64,"
192
+ htmlout.puts [open("#{id}_k#{k}_threads.png").read].pack("m")
193
+ File.delete("#{id}_k#{k}_threads.png")
194
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
195
+ htmlout.puts " </td>\n"
196
+ end
197
+ htmlout.puts " </tr>\n"
198
+
199
+ # homo/het bands
200
+ htmlout.puts " <tr>\n"
201
+ ks.each do | k |
202
+ htmlout.puts " <td>\n"
203
+ htmlout.puts "<img src=\"data:image/gif;base64,"
204
+ htmlout.puts [open("#{id}_k#{k}_clustered_bands.png").read].pack("m")
205
+ File.delete("#{id}_k#{k}_clustered_bands.png")
206
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
207
+ htmlout.puts " </td>\n"
208
+ end
209
+ htmlout.puts " </tr>\n"
210
+
211
+ # k/adjusts
212
+ kadjusts.each do |kernel_adjust|
213
+ htmlout.puts " <tr>\n"
214
+ ks.each do | k |
215
+ htmlout.puts " <td>\n"
216
+ htmlout.puts "<img src=\"data:image/gif;base64,"
217
+ htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png").read].pack("m")
218
+ File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png")
219
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
220
+ htmlout.puts " </td>\n"
221
+ end
222
+ htmlout.puts " </tr>\n"
223
+ htmlout.puts " <tr>\n"
224
+ ks.each do | k |
225
+ htmlout.puts " <td>\n"
226
+ htmlout.puts "<img src=\"data:image/gif;base64,"
227
+ htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png").read].pack("m")
228
+ File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png")
229
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
230
+ htmlout.puts " </td>\n"
231
+ end
232
+ htmlout.puts " </tr>\n"
233
+ end
234
+
235
+ htmlout.puts " </table>\n"
236
+ htmlout.puts "\n </body>\n</html>\n"
237
+
238
+ htmlout.close
239
+
240
+ end
241
+
242
+
243
+
@@ -0,0 +1,22 @@
1
+ Chr1 23 C 4 AA.. !!II
2
+ Chr1 30 C 5 .-1T.-1T... !!HII
3
+ Chr1 33 T 5 .-1G.-1G... !!IHI
4
+ Chr1 37 T 6 AA.... !!IIII
5
+ Chr1 40 T 7 CC..... !!IHIID
6
+ Chr1 44 T 7 AA..... !!IIIFB
7
+ Chr1 711 T 16 ,.,,,..,,,.,,CC. >GIIDH;IIIHID48I
8
+ Chr1 1584 G 24 ,,.,..a,,,....,,.C,,.,.^]. IIIHIC.IIDGGII=DI'IFHIIE
9
+ Chr1 2544 T 34 ,,,,,c,,,...,,..,,.,,,..,,.,.,.,C. EIGEG&GIIIHIGG@IGGIDIHIIHEIHIIG=/D
10
+ Chr1 3975 G 22 .,....+1T......,...,-1t.,-1t.., HIHIIGGHDIEGFIFDCIDGBE
11
+ Chr1 4290 C 9 ,.,.,,-2at,-2at.-2AT. IGDIC@<II
12
+ Chr1 4292 T 11 ,.,.,***.C^]C F4DI+!!!I!!
13
+ Chr1 5699 C 25 tT.,,..,.....,..,......,. !);IH<AHD>;H;I09IDIGIIIGI
14
+ Chr1 5927 T 20 .,......,g,.,..g,... HI?IHIHII%IBDI=%HBHF
15
+ Chr1 5932 A 20 ,......,g,.,..t,.... >-HHIIIH&II7I=2HHHGH
16
+ Chr1 6324 T 23 ,$,,+1a,+1a.+1A.+1A.+1A.+1A,+1a,+1a.+1A,+1a,+1a,+1a,+1a.+1A,+1a,+1a,+1a..+1A.+1A.+1A BII1IIGIIFFIHDFGDIHF?4I
17
+ Chr1 7361 T 28 ,,,,,.$,,,,,..C,..,,A.,,,...^], HIFBIA?IIDHFF(GBIHH,.HBHI.GE
18
+ Chr1 7562 A 31 ..,.c.,,,,.,.t,,.,.......,.,... EFEE+>EEDGC@I)>AI<I?IHIGI8H=IID
19
+ Chr1 8003 C 21 .$,,..,.......-1A....,^]a^],^]a DIGHIGIAIGGEHIIIF7!0!
20
+ Chr1 8017 G 16 .A....A...,,,,,. A-<>76**1@BIGIII
21
+ Chr1 9066 G 28 .,,,.,.,,..,..,,.,,,c..,,C.^], DGHIHIGD2IGII;HHGI?I%IIIH(ID
22
+ Chr1 9971 C 19 ,....,,,,-2at.,.-2AT,-2at,,,... HI=HIGAGIGIIDI<IGIH
@@ -0,0 +1,22 @@
1
+ Chr1 23 . C A,C 0 0 DP=4 GT:GQ:DP 0/1:0:4
2
+ Chr1 30 . C . 0 0 DP=5 GT:GQ:DP 0/0:0:5
3
+ Chr1 33 . T . 0 0 DP=5 GT:GQ:DP 0/0:0:5
4
+ Chr1 37 . T . 0 0 DP=6 GT:GQ:DP 0/0:0:6
5
+ Chr1 40 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
6
+ Chr1 44 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
7
+ Chr1 711 . T . 0 0 DP=16 GT:GQ:DP 0/0:0:16
8
+ Chr1 1584 . G . 0 0 DP=24 GT:GQ:DP 0/0:0:24
9
+ Chr1 2544 . T . 0 0 DP=34 GT:GQ:DP 0/0:0:34
10
+ Chr1 3975 . G . 0 0 DP=22 GT:GQ:DP 0/0:0:22
11
+ Chr1 4290 . C . 0 0 DP=9 GT:GQ:DP 0/0:0:9
12
+ Chr1 4292 . T . 0 0 DP=11 GT:GQ:DP 0/0:0:11
13
+ Chr1 5699 . C . 0 0 DP=25 GT:GQ:DP 0/0:0:25
14
+ Chr1 5927 . T . 0 0 DP=20 GT:GQ:DP 0/0:0:20
15
+ Chr1 5932 . A . 0 0 DP=20 GT:GQ:DP 0/0:0:20
16
+ Chr1 6324 . T . 0 0 DP=23 GT:GQ:DP 0/0:0:23
17
+ Chr1 7361 . T . 0 0 DP=28 GT:GQ:DP 0/0:0:28
18
+ Chr1 7562 . A . 0 0 DP=31 GT:GQ:DP 0/0:0:31
19
+ Chr1 8003 . C . 0 0 DP=21 GT:GQ:DP 0/0:0:21
20
+ Chr1 8017 . G . 0 0 DP=16 GT:GQ:DP 0/0:0:16
21
+ Chr1 9066 . G . 0 0 DP=28 GT:GQ:DP 0/0:0:28
22
+ Chr1 9971 . C . 0 0 DP=19 GT:GQ:DP 0/0:0:19
@@ -10,4 +10,3 @@
10
10
  # was ever to get merged into the main bioruby tree.
11
11
 
12
12
  require 'bio/util/bio-gngm'
13
- require 'bio/util/mutation_effects'
@@ -227,14 +227,36 @@ link:images/signal.png
227
227
 
228
228
  g = Bio::Util::Gngm.new(:file => "aln.sorted.bam",
229
229
  :format => :bam,
230
- :fasta => "reference.fasta",
231
- :samtools => {:r => "chr1:1-100000",
230
+ :fasta => "reference.fasta",
231
+ :start => 100,
232
+ :stop => 200,
233
+ :write_pileup => "my_pileup_file.pileup",
234
+ :write_vcf => "my_vcf_file.vcf",
235
+ :ignore_file => "my_known_snps.txt",
236
+ :samtools => {
232
237
  :q => 20,
233
238
  :Q => 50
234
239
  },
235
240
  :min_non_ref_freq => 0.5,
236
- :min_non_ref => 3
241
+ :min_non_ref => 3,
242
+ :start => 1,
243
+ :stop => 100000,
244
+ :chromosome => "Chr1",
245
+ :variant_call => {
246
+ :indels => false,
247
+ :min_depth => 6,
248
+ :max_depth => 250,
249
+ :mapping_quality => 20.0,
250
+ :min_non_ref_count => 2,
251
+ :ignore_reference_n => true,
252
+ :min_snp_quality => 20,
253
+ :min_consensus_quality => 20,
254
+ :substitutions => ["C:T","G:A"]
255
+ }
256
+
257
+
237
258
  )
259
+
238
260
  g.snp_positions
239
261
  g.collect_threads(:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 )
240
262
  [0.25, 0.5, 1.0].each do |kernel_adjust| # loop through different kernel values
@@ -311,10 +333,7 @@ The following R packages are required
311
333
  Thanks very much indeed to Ryan Austin, who invented NGM in the first place and was very forthcoming with R code, around which this implementation is based.
312
334
 
313
335
  == Using bio-gngm
314
- The package is not yet released, a gem will be prepared soon. Until then scripts run fine when saved in the package scripts from within the package directory with the below pre-amble at the top of the script. Run scripts from the root of the package directory.
315
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
316
- $LOAD_PATH.unshift(File.dirname(__FILE__))
317
- require 'bio-samtools'
336
+
318
337
  require 'bio-gngm'
319
338
 
320
339
  == API
@@ -335,30 +354,37 @@ class Gngm
335
354
  # :samtools => {:q => 20, :Q => 50},
336
355
  # :fasta => "reference.fa"
337
356
  # :start => 100,
338
- # :stop => 200
357
+ # :stop => 200,
358
+ # :write_pileup => "my_pileup_file.pileup",
359
+ # :write_vcf => "my_vcf_file.vcf",
360
+ # :ignore_file => "my_known_snps.txt"
361
+ #
339
362
  # )
340
363
  #
341
364
  #Required parameters and defaults:
342
- #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
343
- #- <tt>:format => :bam</tt> -either :bam, :emap, :pileup (pileup expected to be 10 col format from samtools -vcf)
365
+ #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present. Alternatively, a pileup file or a tab-delimited text file can be used.
366
+ #- <tt>:format => :bam</tt> -either :bam, :pileup, :txt (pileup expected to be 10 col format from samtools -vcf)
344
367
  #- <tt>:chromosome => "nil"</tt> -sequence id to look at
345
368
  #- <tt>:start => nil</tt> -start position on that sequence
346
369
  #- <tt>:stop => nil</tt> -stop position on that sequence
347
370
  #- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
348
- #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The :r option is required to specify the region of interest
371
+ #- <tt>:write_pileup => false</tt> -the path to a file. SNPs will be written in pileup to this file (indels not output)
372
+ #- <tt>:write_vcf => false</tt> -the path to a file. SNPs will be written in VCF to this file (indels not output)
373
+ #- <tt>:ignore_file => false</tt> -file of SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". All SNPs in this file will be ignored
374
+ #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details.
349
375
  #Optional parameters and defaults:
350
-
376
+ #
351
377
  #Most of these are parameters for specific methods and can be over-ridden when particular methods are called
352
- #- <tt>:variant_call => {:indels => false,
353
- # :min_depth => 2,
354
- # :max_depth => 10000000,
355
- # :min_snp_quality => 20,
356
- # :mapping_quality => 10.0,
357
- # :min_non_ref_count => 2,
358
- # :ignore_reference_n => true,
359
- # :min_consensus_quality => 20,
360
- # :min_snp_quality => 20 }</tt>.
361
- # For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
378
+ #- <tt>:variant_call => {:indels => false,</tt>
379
+ #- <tt> :min_depth => 2, </tt>
380
+ #- <tt> :max_depth => 10000000, </tt>
381
+ #- <tt> :min_snp_quality => 20, </tt>
382
+ #- <tt> :mapping_quality => 10.0, </tt>
383
+ #- <tt> :min_non_ref_count => 2, </tt>
384
+ #- <tt> :ignore_reference_n => true, </tt>
385
+ #- <tt> :min_consensus_quality => 20, </tt>
386
+ #- <tt> :min_snp_quality => 20 }</tt>.
387
+ # For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
362
388
  #- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
363
389
  #- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
364
390
  #- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
@@ -385,9 +411,12 @@ class Gngm
385
411
  :fasta => nil,
386
412
  :samtools => {:q => 20, :Q => 50},
387
413
  :indels => false,
388
- ##:indels = call indels too, causes return of vcf
414
+ :write_pileup => false,
415
+ :write_vcf => false,
416
+ :ignore_file => false,
389
417
  :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
390
- :variant_call => { :min_depth => 2,
418
+ :variant_call => { :indels => false,
419
+ :min_depth => 2,
391
420
  :max_depth => 10000000,
392
421
  :mapping_quality => 10.0,
393
422
  :min_non_ref_count => 2,
@@ -412,6 +441,31 @@ class Gngm
412
441
  }
413
442
  @opts.merge!(options)
414
443
  @opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
444
+ @pileup_outfile, @vcf_outfile = nil,nil
445
+ if @opts[:variant_call][:indels] and (@opts[:write_pileup] or @opts[:write_vcf])
446
+ $stderr.puts "Cannot yet output VCF/Pileup when generating INDELs. Turning output off."
447
+ @opts[:write_pileup] = false
448
+ @opts[:write_vcf] = false
449
+ end
450
+ if @opts[:write_pileup]
451
+ @pileup_outfile = File.open(@opts[:write_pileup], "w")
452
+ end
453
+ if @opts[:write_vcf]
454
+ @vcf_outfile = File.open(@opts[:write_vcf], "w")
455
+ end
456
+
457
+ @known_snps = Hash.new
458
+ if @opts[:ignore_file]
459
+ File.open(@opts[:ignore_file], "r").each do |line|
460
+ col = line.chomp.split(/\t/)
461
+ if @known_snps[col[0]]
462
+ @known_snps[col[0]][col[1].to_i] = 1
463
+ else
464
+ @known_snps[col[0]] = Hash.new
465
+ @known_snps[col[0]][col[1].to_i] = 1
466
+ end
467
+ end
468
+ end
415
469
  open_file
416
470
  end
417
471
 
@@ -455,10 +509,8 @@ class Gngm
455
509
  #- <tt>:max_depth => 10000000</tt> -maximum quality passing depth of coverage at a position for a SNP call
456
510
  #- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
457
511
  #- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
458
- #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
459
- #- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Anderson et al., 2009, Nature Methods 6. 8. Requires a file of known SNPs between the mapping line (eg Ler in Andersen et al.,) and a reference line (eg Col in Andersen et al).
460
- #- <tt>:snp_file => -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
461
- #When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
512
+ #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
513
+ #Set <tt>:indels</tt> to +true+ to call INDELs instead of SNPs. If +false+, SNPs are called.
462
514
  #
463
515
  #calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
464
516
  def snp_positions(optsa={})
@@ -495,7 +547,10 @@ class Gngm
495
547
 
496
548
  if not @opts[:samtools][:g]
497
549
  @file.mpileup(@opts[:samtools]) do |pileup|
498
- arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
550
+ if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts) and not @known_snps[pileup.ref_name][pileup.pos]
551
+ arr << [pileup.pos, pileup.discordant_chastity]
552
+ write(pileup)
553
+ end
499
554
  end
500
555
  else
501
556
  @file.mpileup_plus(@opts[:samtools]) do |vcf|
@@ -503,9 +558,9 @@ class Gngm
503
558
  next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
504
559
  ##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
505
560
  if opts[:indels]
506
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
561
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
507
562
  else
508
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
563
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
509
564
  end
510
565
  end
511
566
  end
@@ -515,11 +570,6 @@ class Gngm
515
570
  arr
516
571
  end
517
572
 
518
- private
519
- def get_snp_positions_from_map(options={})
520
- arr = []
521
- opts = @opts[:variant_call].merge(options)
522
- end
523
573
 
524
574
  #this does not filter snps, other than to check they are in the right region and are allowed substitutions.. no qual control, assumed to be done prior
525
575
  #text file is of format chr\tpos\tref\talt\tfreq\n
@@ -530,7 +580,7 @@ class Gngm
530
580
  chr,pos,ref,alt,freq = line.chomp.split("\t")
531
581
  pos = pos.to_i
532
582
  freq = freq.to_f
533
- next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
583
+ next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts) and not @known_snps[chr][pos]
534
584
  arr << [pos, freq]
535
585
  end
536
586
  @snp_positions = arr
@@ -546,13 +596,25 @@ class Gngm
546
596
  next
547
597
  end
548
598
  #old fashioned 10 col pileup format has extra fields we can use if needed
549
- if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
599
+ if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil? and not @known_snps[pileup.ref_name][pileup.pos]
600
+ write(pileup)
550
601
  arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
551
602
  end
552
603
  end
553
604
  @snp_positions = arr
554
605
  end
555
606
 
607
+ private
608
+ #writes out pileup/vcf files of SNPs that were used
609
+ def write(obj)
610
+ if @opts[:write_pileup]
611
+ @pileup_outfile.puts(obj.to_s)
612
+ end
613
+ if @opts[:write_vcf]
614
+ @vcf_outfile.puts(obj.to_vcf)
615
+ end
616
+ end
617
+
556
618
  private
557
619
  #Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
558
620
  def get_insert_size_frequency_from_bam(opts={})