bio-gngm 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Author : Naveed Ishaque (inspired by Dan Maclean) edited again by Dan to include ChD value setting and deletion of SNPs file
4
+ # naveed.ishaque@tsl.ac.uk; naveed.ishaque@hotmail.co.uk
5
+ # Date: 20th June 2012 and 1st November 2012
6
+
7
+ # This script produces an HTML file with embedded images showing the SNP density and chastity plots for a given BAM file
8
+ # It will automatically iterate over all contigs, from beginning to end
9
+ # NOTE - run using ruby executable: /home/programs/gngm/ruby/bin/ruby
10
+ # NOTE - this will only run on the cluster via a bsub command
11
+
12
+
13
+ require 'rubygems'
14
+ require 'bio'
15
+ require 'bio-gngm'
16
+ require 'base64'
17
+ require 'getoptions'
18
+
19
+ usage = "\n#{$PROGRAM_NAME} reads in a fasta and bam files and produces a html file indicating QTL locations as peaks\n\n\n\t #{$PROGRAM_NAME}\n\n\n -f [reference fasta file]\n -b [bam file]\n -e expected ChD (allele freq) default 1\n -c control ChD (allele freq) default 0.5\n -s List of known SNPS [tab delimited file]\n\n"
20
+
21
+ # PARSE INPUTS
22
+
23
+ opt = GetOptions.new(%w(h help f=@s b=@s e=@s c=@s s=@s))
24
+
25
+ puts "#{usage}" if opt[:h]
26
+ exit if opt[:h]
27
+ puts "#{usage}" if opt[:help]
28
+ exit if opt[:help]
29
+
30
+ puts "ERROR - no fasta file provided (-f)\n#{usage}" unless opt[:f]
31
+ exit unless opt[:f]
32
+ puts "ERROR - fasta file '#{opt[:f][0]}' does not exist\n#{$usage}" unless FileTest.exist?("#{opt[:f][0]}")
33
+ exit unless FileTest.exist?("#{opt[:f][0]}")
34
+ warn "\nUsing FASTA file #{opt[:f][0]}"
35
+
36
+ puts "ERROR - no bam file provided (-b)\n#{usage}" unless opt[:b]
37
+ exit unless opt[:b]
38
+ puts "ERROR - BAM file '#{opt[:b][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:b][0]}")
39
+ exit unless FileTest.exist?("#{opt[:b][0]}")
40
+ warn "Using BAM file #{opt[:b][0]}"
41
+
42
+
43
+ expected_chd = 1.0
44
+ expected_chd =opt[:e][0].to_f if expected_chd and opt[:e]
45
+
46
+ control_chd = 0.5
47
+ control_chd = opt[:c][0].to_f if control_chd and opt[:c]
48
+
49
+ known_snps = Hash.new { |h,k| h[k] = Array.new }
50
+ if opt[:s].first
51
+ File.open(opt[:s].first).each do |line|
52
+ chr,pos = line.split("\t")
53
+ known_snps[chr] << pos.to_i
54
+ end
55
+ end
56
+
57
+
58
+ $stderr.puts "using expected ChD: #{expected_chd} and control ChD: #{control_chd}"
59
+
60
+ hist_bins = [100000, 250000, 500000]
61
+ ks = [5, 7, 9, 11]
62
+ kadjusts = [0.5, 0.25, 0.1, 0.05, 0.01]
63
+ warn "Histogram bin sizes: #{hist_bins}\nThread clusters (K): #{ks}\nKernal adjusts: #{kadjusts}\n"
64
+
65
+ # LOAD FASTA and find contigs
66
+ sequences = Bio::DB::FastaLengthDB.new(:file => "#{opt[:f][0]}")
67
+
68
+
69
+ # For Each contig in the fasta file analyse...
70
+
71
+ sequences.each do |id,length|
72
+ warn "\nProcessing #{id}:1 - #{length}..."
73
+
74
+ warn "Skipping #{id} as too short ..." if length < (4 * hist_bins.max)
75
+ next if length < (4 * hist_bins.max)
76
+
77
+ g = Bio::Util::Gngm.new(:file => "#{opt[:b][0]}",
78
+ :format => :bam,
79
+ :fasta => "#{opt[:f][0]}",
80
+ :start => 1,
81
+ :stop => 10000,
82
+ :chromosome => id,
83
+ :samtools => {
84
+ :q => 20,
85
+ :Q => 20
86
+ },
87
+ :ignore_file => "#{opt[:s][0]}",
88
+ :write_pileup => "pileup.txt",
89
+ :write_vcf => "snps.vcf"
90
+ )
91
+
92
+ # predict SNPs
93
+
94
+ warn " Prediciting SNPs for #{id}:1-#{length}..."
95
+ g.snp_positions
96
+
97
+ #delete SNPs from known snp_list
98
+ #a = g.snp_positions.dup
99
+ #known_snps[seq.entry_id].each {|snp_pos| a.delete_if{|x| x.first == snp_pos} }
100
+ #$stderr.puts "deleted #{g.snp_positions.length - a.length} snps appearing in #{opt[:s]}"
101
+ #g.snp_positions = a
102
+
103
+
104
+
105
+ # produce SNP density histograms
106
+
107
+ warn " Iterating over different histogram bin sizes..."
108
+ hist_bins.each do |bin_width|
109
+ warn " Makings PNG for bin size #{bin_width}..."
110
+ file_name = "#{id}_SNP_histogram_bin#{bin_width}.png"
111
+ g.frequency_histogram("#{file_name}",bin_width, :title => "#{id}: SNP density histogram (bin width - #{bin_width})", :width => 1066, :height => 300)
112
+ end
113
+
114
+ # Write to embedded HTML
115
+
116
+ htmlout = File.open("#{id}.html", 'w')
117
+ htmlout.puts "<html>\n"
118
+ htmlout.puts " <head>\n"
119
+ htmlout.puts " <title>GNGM #{id} - QTL mapping</title>\n"
120
+ htmlout.puts " <style type=\"text/css\">\n"
121
+ htmlout.puts " table,\n"
122
+ htmlout.puts " td,\n"
123
+ htmlout.puts " tbody,\n"
124
+ htmlout.puts " thead,\n"
125
+ htmlout.puts " thead th,\n"
126
+ htmlout.puts " tr.even,\n"
127
+ htmlout.puts " tr.odd {\n"
128
+ htmlout.puts " border: 0;\n"
129
+ htmlout.puts " }\n"
130
+ htmlout.puts " </style>\n"
131
+ htmlout.puts " </head>\n"
132
+ htmlout.puts " <body>\n\t\t"
133
+ htmlout.puts " <table>\n"
134
+ htmlout.puts " <tr>\n"
135
+ hist_bins.each do |bin_width|
136
+ htmlout.puts " <td>\n"
137
+ htmlout.puts "<img src=\"data:image/gif;base64,"
138
+ htmlout.puts [open("#{id}_SNP_histogram_bin#{bin_width}.png").read].pack("m")
139
+ File.delete("#{id}_SNP_histogram_bin#{bin_width}.png")
140
+ htmlout.puts "\" width=\"533\" height=\"150\"/>\n"
141
+ htmlout.puts " </td>\n"
142
+ end
143
+ htmlout.puts " </tr>\n"
144
+ htmlout.puts " </table>\n"
145
+
146
+ # Perform chastity calculations
147
+
148
+ warn " Collecting threads..."
149
+ g.collect_threads
150
+ warn " Iterating over k and kernel adjusts..."
151
+ ks.each do | k |
152
+ begin
153
+ warn " Makings PNG for k = #{k} ..."
154
+ warn " Calculating threads ..."
155
+ g.calculate_clusters(:k => k, :adjust => 0.5, :control_chd => control_chd, :expected_chd => expected_chd)
156
+ warn " Drawing threads ..."
157
+ filename = "#{id}_k#{k}_threads.png"
158
+ g.draw_threads(filename, :title => "#{id}: Chastity bands - all phases (k=#{k})", :width => 700, :height => 300)
159
+ warn " Clustering bands ..."
160
+ filename = "#{id}_k#{k}_clustered_bands.png"
161
+ g.draw_bands(filename, :title => "#{id}: Homozygous and heterozygous chastity belts (k=#{k})", :width => 800, :height => 300)
162
+ kadjusts.each do |kernel_adjust|
163
+ begin
164
+ warn " Calculating threads (with kernal adjust #{kernel_adjust}) ..."
165
+ g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => control_chd, :expected_chd => expected_chd)
166
+ warn " Calculating signal ..."
167
+ filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png"
168
+ g.draw_signal(filename, :title => "#{id}: Homo/Het signal ratio (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
169
+ warn " Estimating peaks ..."
170
+ filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png"
171
+ g.draw_peaks(filename, :title => "#{id}: Signal peaks (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
172
+ rescue => e
173
+ $stderr.puts "skipping #{k} #{kernel_adjust} => #{e}"
174
+ end
175
+ end
176
+ rescue => e
177
+ $stderr.puts "Skipping #{k} => #{e}"
178
+ end
179
+ end
180
+
181
+ g.close
182
+
183
+ # Write to embedded HTML
184
+
185
+ htmlout.puts " <table>\n"
186
+
187
+ # all bands
188
+ htmlout.puts " <tr>\n"
189
+ ks.each do | k |
190
+ htmlout.puts " <td>\n"
191
+ htmlout.puts "<img src=\"data:image/gif;base64,"
192
+ htmlout.puts [open("#{id}_k#{k}_threads.png").read].pack("m")
193
+ File.delete("#{id}_k#{k}_threads.png")
194
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
195
+ htmlout.puts " </td>\n"
196
+ end
197
+ htmlout.puts " </tr>\n"
198
+
199
+ # homo/het bands
200
+ htmlout.puts " <tr>\n"
201
+ ks.each do | k |
202
+ htmlout.puts " <td>\n"
203
+ htmlout.puts "<img src=\"data:image/gif;base64,"
204
+ htmlout.puts [open("#{id}_k#{k}_clustered_bands.png").read].pack("m")
205
+ File.delete("#{id}_k#{k}_clustered_bands.png")
206
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
207
+ htmlout.puts " </td>\n"
208
+ end
209
+ htmlout.puts " </tr>\n"
210
+
211
+ # k/adjusts
212
+ kadjusts.each do |kernel_adjust|
213
+ htmlout.puts " <tr>\n"
214
+ ks.each do | k |
215
+ htmlout.puts " <td>\n"
216
+ htmlout.puts "<img src=\"data:image/gif;base64,"
217
+ htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png").read].pack("m")
218
+ File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png")
219
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
220
+ htmlout.puts " </td>\n"
221
+ end
222
+ htmlout.puts " </tr>\n"
223
+ htmlout.puts " <tr>\n"
224
+ ks.each do | k |
225
+ htmlout.puts " <td>\n"
226
+ htmlout.puts "<img src=\"data:image/gif;base64,"
227
+ htmlout.puts [open("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png").read].pack("m")
228
+ File.delete("#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png")
229
+ htmlout.puts "\" width=\"400\" height=\"150\"/>"
230
+ htmlout.puts " </td>\n"
231
+ end
232
+ htmlout.puts " </tr>\n"
233
+ end
234
+
235
+ htmlout.puts " </table>\n"
236
+ htmlout.puts "\n </body>\n</html>\n"
237
+
238
+ htmlout.close
239
+
240
+ end
241
+
242
+
243
+
@@ -0,0 +1,22 @@
1
+ Chr1 23 C 4 AA.. !!II
2
+ Chr1 30 C 5 .-1T.-1T... !!HII
3
+ Chr1 33 T 5 .-1G.-1G... !!IHI
4
+ Chr1 37 T 6 AA.... !!IIII
5
+ Chr1 40 T 7 CC..... !!IHIID
6
+ Chr1 44 T 7 AA..... !!IIIFB
7
+ Chr1 711 T 16 ,.,,,..,,,.,,CC. >GIIDH;IIIHID48I
8
+ Chr1 1584 G 24 ,,.,..a,,,....,,.C,,.,.^]. IIIHIC.IIDGGII=DI'IFHIIE
9
+ Chr1 2544 T 34 ,,,,,c,,,...,,..,,.,,,..,,.,.,.,C. EIGEG&GIIIHIGG@IGGIDIHIIHEIHIIG=/D
10
+ Chr1 3975 G 22 .,....+1T......,...,-1t.,-1t.., HIHIIGGHDIEGFIFDCIDGBE
11
+ Chr1 4290 C 9 ,.,.,,-2at,-2at.-2AT. IGDIC@<II
12
+ Chr1 4292 T 11 ,.,.,***.C^]C F4DI+!!!I!!
13
+ Chr1 5699 C 25 tT.,,..,.....,..,......,. !);IH<AHD>;H;I09IDIGIIIGI
14
+ Chr1 5927 T 20 .,......,g,.,..g,... HI?IHIHII%IBDI=%HBHF
15
+ Chr1 5932 A 20 ,......,g,.,..t,.... >-HHIIIH&II7I=2HHHGH
16
+ Chr1 6324 T 23 ,$,,+1a,+1a.+1A.+1A.+1A.+1A,+1a,+1a.+1A,+1a,+1a,+1a,+1a.+1A,+1a,+1a,+1a..+1A.+1A.+1A BII1IIGIIFFIHDFGDIHF?4I
17
+ Chr1 7361 T 28 ,,,,,.$,,,,,..C,..,,A.,,,...^], HIFBIA?IIDHFF(GBIHH,.HBHI.GE
18
+ Chr1 7562 A 31 ..,.c.,,,,.,.t,,.,.......,.,... EFEE+>EEDGC@I)>AI<I?IHIGI8H=IID
19
+ Chr1 8003 C 21 .$,,..,.......-1A....,^]a^],^]a DIGHIGIAIGGEHIIIF7!0!
20
+ Chr1 8017 G 16 .A....A...,,,,,. A-<>76**1@BIGIII
21
+ Chr1 9066 G 28 .,,,.,.,,..,..,,.,,,c..,,C.^], DGHIHIGD2IGII;HHGI?I%IIIH(ID
22
+ Chr1 9971 C 19 ,....,,,,-2at.,.-2AT,-2at,,,... HI=HIGAGIGIIDI<IGIH
@@ -0,0 +1,22 @@
1
+ Chr1 23 . C A,C 0 0 DP=4 GT:GQ:DP 0/1:0:4
2
+ Chr1 30 . C . 0 0 DP=5 GT:GQ:DP 0/0:0:5
3
+ Chr1 33 . T . 0 0 DP=5 GT:GQ:DP 0/0:0:5
4
+ Chr1 37 . T . 0 0 DP=6 GT:GQ:DP 0/0:0:6
5
+ Chr1 40 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
6
+ Chr1 44 . T . 0 0 DP=7 GT:GQ:DP 0/0:0:7
7
+ Chr1 711 . T . 0 0 DP=16 GT:GQ:DP 0/0:0:16
8
+ Chr1 1584 . G . 0 0 DP=24 GT:GQ:DP 0/0:0:24
9
+ Chr1 2544 . T . 0 0 DP=34 GT:GQ:DP 0/0:0:34
10
+ Chr1 3975 . G . 0 0 DP=22 GT:GQ:DP 0/0:0:22
11
+ Chr1 4290 . C . 0 0 DP=9 GT:GQ:DP 0/0:0:9
12
+ Chr1 4292 . T . 0 0 DP=11 GT:GQ:DP 0/0:0:11
13
+ Chr1 5699 . C . 0 0 DP=25 GT:GQ:DP 0/0:0:25
14
+ Chr1 5927 . T . 0 0 DP=20 GT:GQ:DP 0/0:0:20
15
+ Chr1 5932 . A . 0 0 DP=20 GT:GQ:DP 0/0:0:20
16
+ Chr1 6324 . T . 0 0 DP=23 GT:GQ:DP 0/0:0:23
17
+ Chr1 7361 . T . 0 0 DP=28 GT:GQ:DP 0/0:0:28
18
+ Chr1 7562 . A . 0 0 DP=31 GT:GQ:DP 0/0:0:31
19
+ Chr1 8003 . C . 0 0 DP=21 GT:GQ:DP 0/0:0:21
20
+ Chr1 8017 . G . 0 0 DP=16 GT:GQ:DP 0/0:0:16
21
+ Chr1 9066 . G . 0 0 DP=28 GT:GQ:DP 0/0:0:28
22
+ Chr1 9971 . C . 0 0 DP=19 GT:GQ:DP 0/0:0:19
@@ -10,4 +10,3 @@
10
10
  # was ever to get merged into the main bioruby tree.
11
11
 
12
12
  require 'bio/util/bio-gngm'
13
- require 'bio/util/mutation_effects'
@@ -227,14 +227,36 @@ link:images/signal.png
227
227
 
228
228
  g = Bio::Util::Gngm.new(:file => "aln.sorted.bam",
229
229
  :format => :bam,
230
- :fasta => "reference.fasta",
231
- :samtools => {:r => "chr1:1-100000",
230
+ :fasta => "reference.fasta",
231
+ :start => 100,
232
+ :stop => 200,
233
+ :write_pileup => "my_pileup_file.pileup",
234
+ :write_vcf => "my_vcf_file.vcf",
235
+ :ignore_file => "my_known_snps.txt",
236
+ :samtools => {
232
237
  :q => 20,
233
238
  :Q => 50
234
239
  },
235
240
  :min_non_ref_freq => 0.5,
236
- :min_non_ref => 3
241
+ :min_non_ref => 3,
242
+ :start => 1,
243
+ :stop => 100000,
244
+ :chromosome => "Chr1",
245
+ :variant_call => {
246
+ :indels => false,
247
+ :min_depth => 6,
248
+ :max_depth => 250,
249
+ :mapping_quality => 20.0,
250
+ :min_non_ref_count => 2,
251
+ :ignore_reference_n => true,
252
+ :min_snp_quality => 20,
253
+ :min_consensus_quality => 20,
254
+ :substitutions => ["C:T","G:A"]
255
+ }
256
+
257
+
237
258
  )
259
+
238
260
  g.snp_positions
239
261
  g.collect_threads(:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 )
240
262
  [0.25, 0.5, 1.0].each do |kernel_adjust| # loop through different kernel values
@@ -311,10 +333,7 @@ The following R packages are required
311
333
  Thanks very much indeed to Ryan Austin, who invented NGM in the first place and was very forthcoming with R code, around which this implementation is based.
312
334
 
313
335
  == Using bio-gngm
314
- The package is not yet released, a gem will be prepared soon. Until then scripts run fine when saved in the package scripts from within the package directory with the below pre-amble at the top of the script. Run scripts from the root of the package directory.
315
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
316
- $LOAD_PATH.unshift(File.dirname(__FILE__))
317
- require 'bio-samtools'
336
+
318
337
  require 'bio-gngm'
319
338
 
320
339
  == API
@@ -335,30 +354,37 @@ class Gngm
335
354
  # :samtools => {:q => 20, :Q => 50},
336
355
  # :fasta => "reference.fa"
337
356
  # :start => 100,
338
- # :stop => 200
357
+ # :stop => 200,
358
+ # :write_pileup => "my_pileup_file.pileup",
359
+ # :write_vcf => "my_vcf_file.vcf",
360
+ # :ignore_file => "my_known_snps.txt"
361
+ #
339
362
  # )
340
363
  #
341
364
  #Required parameters and defaults:
342
- #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
343
- #- <tt>:format => :bam</tt> -either :bam, :emap, :pileup (pileup expected to be 10 col format from samtools -vcf)
365
+ #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present. A pileup file, or tab-delimited text file can be used.
366
+ #- <tt>:format => :bam</tt> -either :bam, :pileup, :txt (pileup expected to be 10 col format from samtools -vcf)
344
367
  #- <tt>:chromosome => "nil"</tt> -sequence id to look at
345
368
  #- <tt>:start => nil</tt> -start position on that sequence
346
369
  #- <tt>:stop => nil</tt> -stop position on that sequence
347
370
  #- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
348
- #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The :r option is required to specify the region of interest
371
+ #- <tt>:write_pileup => false</tt> -the path to a file. SNPs will be written in pileup format to this file (indels not output)
372
+ #- <tt>:write_vcf => false</tt> -the path to a file. SNPs will be written in VCF format to this file (indels not output)
373
+ #- <tt>:ignore_file => false</tt> -file of SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". All SNPs in this file will be ignored
374
+ #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details.
349
375
  #Optional parameters and defaults:
350
-
376
+ #
351
377
  #Most of these are parameters for specific methods and can be over-ridden when particular methods are called
352
- #- <tt>:variant_call => {:indels => false,
353
- # :min_depth => 2,
354
- # :max_depth => 10000000,
355
- # :min_snp_quality => 20,
356
- # :mapping_quality => 10.0,
357
- # :min_non_ref_count => 2,
358
- # :ignore_reference_n => true,
359
- # :min_consensus_quality => 20,
360
- # :min_snp_quality => 20 }</tt>.
361
- # For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
378
+ #- <tt>:variant_call => {:indels => false,</tt>
379
+ #- <tt> :min_depth => 2, </tt>
380
+ #- <tt> :max_depth => 10000000, </tt>
381
+ #- <tt> :min_snp_quality => 20, </tt>
382
+ #- <tt> :mapping_quality => 10.0, </tt>
383
+ #- <tt> :min_non_ref_count => 2, </tt>
384
+ #- <tt> :ignore_reference_n => true, </tt>
385
+ #- <tt> :min_consensus_quality => 20, </tt>
386
+ #- <tt> :min_snp_quality => 20 }</tt>.
387
+ # - For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
362
388
  #- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
363
389
  #- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
364
390
  #- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
@@ -385,9 +411,12 @@ class Gngm
385
411
  :fasta => nil,
386
412
  :samtools => {:q => 20, :Q => 50},
387
413
  :indels => false,
388
- ##:indels = call indels too, causes return of vcf
414
+ :write_pileup => false,
415
+ :write_vcf => false,
416
+ :ignore_file => false,
389
417
  :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
390
- :variant_call => { :min_depth => 2,
418
+ :variant_call => { :indels => false,
419
+ :min_depth => 2,
391
420
  :max_depth => 10000000,
392
421
  :mapping_quality => 10.0,
393
422
  :min_non_ref_count => 2,
@@ -412,6 +441,31 @@ class Gngm
412
441
  }
413
442
  @opts.merge!(options)
414
443
  @opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
444
+ @pileup_outfile, @vcf_outfile = nil,nil
445
+ if @opts[:variant_call][:indels] and (@opts[:write_pileup] or @opts[:write_vcf])
446
+ $stderr.puts "Cannot yet output VCF/Pileup when generating INDELs. Turning output off."
447
+ @opts[:write_pileup] = false
448
+ @opts[:write_vcf] = false
449
+ end
450
+ if @opts[:write_pileup]
451
+ @pileup_outfile = File.open(@opts[:write_pileup], "w")
452
+ end
453
+ if @opts[:write_vcf]
454
+ @vcf_outfile = File.open(@opts[:write_vcf], "w")
455
+ end
456
+
457
+ @known_snps = Hash.new
458
+ if @opts[:ignore_file]
459
+ File.open(@opts[:ignore_file], "r").each do |line|
460
+ col = line.chomp.split(/\t/)
461
+ if @known_snps[col[0]]
462
+ @known_snps[col[0]][col[1].to_i] = 1
463
+ else
464
+ @known_snps[col[0]] = Hash.new
465
+ @known_snps[col[0]][col[1].to_i] = 1
466
+ end
467
+ end
468
+ end
415
469
  open_file
416
470
  end
417
471
 
@@ -455,10 +509,8 @@ class Gngm
455
509
  #- <tt>:max_depth => 10000000</tt> -maximum quality passing depth of coverage at a position for a SNP call
456
510
  #- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
457
511
  #- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
458
- #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
459
- #- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Anderson et al., 2009, Nature Methods 6. 8. Requires a file of known SNPs between the mapping line (eg Ler in Andersen et al.,) and a reference line (eg Col in Andersen et al).
460
- #- <tt>:snp_file => -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
461
- #When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
512
+ #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
513
+ #When <tt>:indels</tt> is +true+, INDELs are called. If +false+, SNPs are called.
462
514
  #
463
515
  #calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
464
516
  def snp_positions(optsa={})
@@ -495,7 +547,10 @@ class Gngm
495
547
 
496
548
  if not @opts[:samtools][:g]
497
549
  @file.mpileup(@opts[:samtools]) do |pileup|
498
- arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
550
+ if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts) and not @known_snps[pileup.ref_name][pileup.pos]
551
+ arr << [pileup.pos, pileup.discordant_chastity]
552
+ write(pileup)
553
+ end
499
554
  end
500
555
  else
501
556
  @file.mpileup_plus(@opts[:samtools]) do |vcf|
@@ -503,9 +558,9 @@ class Gngm
503
558
  next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
504
559
  ##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
505
560
  if opts[:indels]
506
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
561
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
507
562
  else
508
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
563
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts) and not @known_snps[vcf.ref][vcf.pos]
509
564
  end
510
565
  end
511
566
  end
@@ -515,11 +570,6 @@ class Gngm
515
570
  arr
516
571
  end
517
572
 
518
- private
519
- def get_snp_positions_from_map(options={})
520
- arr = []
521
- opts = @opts[:variant_call].merge(options)
522
- end
523
573
 
524
574
  #this does not filter snps, other than to check they are in the right region and are allowed substitutions.. no qual control, assumed to be done prior
525
575
  #text file is of format chr\tpos\tref\talt\tfreq\n
@@ -530,7 +580,7 @@ class Gngm
530
580
  chr,pos,ref,alt,freq = line.chomp.split("\t")
531
581
  pos = pos.to_i
532
582
  freq = freq.to_f
533
- next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
583
+ next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts) and not @known_snps[chr][pos]
534
584
  arr << [pos, freq]
535
585
  end
536
586
  @snp_positions = arr
@@ -546,13 +596,25 @@ class Gngm
546
596
  next
547
597
  end
548
598
  #old fashioned 10 col pileup format has extra fields we can use if needed
549
- if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
599
+ if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil? and not @known_snps[pileup.ref_name][pileup.pos]
600
+ write(pileup)
550
601
  arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
551
602
  end
552
603
  end
553
604
  @snp_positions = arr
554
605
  end
555
606
 
607
+ private
608
+ #writes out pileup/vcf files of SNPs that were used
609
+ def write(obj)
610
+ if @opts[:write_pileup]
611
+ @pileup_outfile.puts(obj.to_s)
612
+ end
613
+ if @opts[:write_vcf]
614
+ @vcf_outfile.puts(obj.to_vcf)
615
+ end
616
+ end
617
+
556
618
  private
557
619
  #Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
558
620
  def get_insert_size_frequency_from_bam(opts={})