bio-gngm 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bio-gngm.gemspec +3 -2
- data/examples/make_histograms.rb +43 -17
- data/lib/bio-gngm.rb +1 -0
- data/lib/bio/util/bio-gngm.rb +171 -60
- data/lib/bio/util/mutation_effects.rb +39 -0
- data/scripts/make_threads_unmapped_simulation.rb +28 -16
- metadata +26 -25
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.
|
|
1
|
+
0.2.0
|
data/bio-gngm.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "bio-gngm"
|
|
8
|
-
s.version = "0.
|
|
8
|
+
s.version = "0.2.0"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Dan MacLean"]
|
|
12
|
-
s.date = "2012-
|
|
12
|
+
s.date = "2012-11-15"
|
|
13
13
|
s.description = "Identify causative mutations in a model genome from NGS reads using the NGM method."
|
|
14
14
|
s.email = "maclean.daniel@gmail.com"
|
|
15
15
|
s.extra_rdoc_files = [
|
|
@@ -89,6 +89,7 @@ Gem::Specification.new do |s|
|
|
|
89
89
|
"examples/use_indels.rb",
|
|
90
90
|
"lib/bio-gngm.rb",
|
|
91
91
|
"lib/bio/util/bio-gngm.rb",
|
|
92
|
+
"lib/bio/util/mutation_effects.rb",
|
|
92
93
|
"scripts/get_subseq.rb",
|
|
93
94
|
"scripts/make_histograms_laerfyve.rb",
|
|
94
95
|
"scripts/make_histograms_laerfyve_stitched.rb",
|
data/examples/make_histograms.rb
CHANGED
|
@@ -14,27 +14,53 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
|
14
14
|
require 'bio-gngm'
|
|
15
15
|
require 'bio-samtools'
|
|
16
16
|
|
|
17
|
+
#def make_snp_array(id, file)
|
|
18
|
+
# a = []
|
|
19
|
+
# File.open(file, "r").each do |line|
|
|
20
|
+
# arr = line.split(/\t/)
|
|
21
|
+
# next if arr[0] !~ /#{id}/
|
|
22
|
+
# a << [a]
|
|
23
|
+
# end
|
|
24
|
+
#end
|
|
17
25
|
|
|
26
|
+
sequences = Bio::DB::FastaLengthDB.new(:file => ARGV[0])
|
|
27
|
+
$stderr.puts "Loaded sequences..."
|
|
18
28
|
|
|
19
29
|
|
|
20
|
-
|
|
21
|
-
g = Bio::Util::Gngm.new(:file =>
|
|
22
|
-
|
|
23
|
-
:
|
|
24
|
-
:
|
|
30
|
+
sequences.each do |id, length|
|
|
31
|
+
g = Bio::Util::Gngm.new(:file => ARGV[1],
|
|
32
|
+
#:format => :pileup,
|
|
33
|
+
:format => :bam,
|
|
34
|
+
:fasta => ARGV[0],
|
|
35
|
+
:chromosome => id,
|
|
36
|
+
:start => 1,
|
|
37
|
+
:stop => length,
|
|
38
|
+
:samtools => {
|
|
25
39
|
:q => 20,
|
|
26
|
-
:Q =>
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
40
|
+
:Q => 20
|
|
41
|
+
},
|
|
42
|
+
:variant_call => {
|
|
43
|
+
:indels => false,
|
|
44
|
+
:deletions_only => false,
|
|
45
|
+
:insertions_only => false,
|
|
46
|
+
:min_depth => 6,
|
|
47
|
+
:max_depth => 250,
|
|
48
|
+
:mapping_quality => 20.0,
|
|
49
|
+
:min_non_ref_count => 2,
|
|
50
|
+
:ignore_reference_n => true,
|
|
51
|
+
:min_snp_quality => 20,
|
|
52
|
+
:min_consensus_quality => 20
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
$stderr.puts "getting #{id}.."
|
|
56
|
+
|
|
57
|
+
g.snp_positions
|
|
58
|
+
puts "Found #{g.snp_positions.length} SNPs"
|
|
59
|
+
[100000, 250000, 500000].each do |bin_width|
|
|
60
|
+
$stderr.puts "working on #{bin_width} windows"
|
|
61
|
+
file_name = "test_#{id}_#{bin_width}.png"
|
|
35
62
|
g.frequency_histogram("#{file_name}",bin_width)
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
#close the BAM file
|
|
63
|
+
end
|
|
39
64
|
g.close
|
|
65
|
+
end
|
|
40
66
|
|
data/lib/bio-gngm.rb
CHANGED
data/lib/bio/util/bio-gngm.rb
CHANGED
|
@@ -5,13 +5,36 @@
|
|
|
5
5
|
# Created by Dan MacLean (TSL) on 2011-12-07.
|
|
6
6
|
# Copyright (c) . All rights reserved.
|
|
7
7
|
###################################################
|
|
8
|
-
|
|
8
|
+
require 'rubygems'
|
|
9
9
|
require 'rinruby'
|
|
10
10
|
require 'bio-samtools'
|
|
11
11
|
require 'bio/db/pileup'
|
|
12
12
|
require 'bio/db/vcf'
|
|
13
13
|
require 'pp'
|
|
14
14
|
|
|
15
|
+
=begin
|
|
16
|
+
Simple class representing a file of Fasta format sequences and each one's length
|
|
17
|
+
=end
|
|
18
|
+
class Bio::DB::FastaLengthDB
|
|
19
|
+
require 'bio'
|
|
20
|
+
def initialize(args)
|
|
21
|
+
@file = args[:file]
|
|
22
|
+
@seqs = {}
|
|
23
|
+
file = Bio::FastaFormat.open(@file)
|
|
24
|
+
file.each do |entry|
|
|
25
|
+
@seqs[entry.entry_id] = entry.length
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def each
|
|
29
|
+
@seqs.keys.sort.each do |k|
|
|
30
|
+
yield k, @seqs[k]
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
|
|
15
38
|
=begin
|
|
16
39
|
Extends the methods of the Bio::DB::Pileup class in bio-samtools. A pileup object represents the SAMtools pileup format at
|
|
17
40
|
http://samtools.sourceforge.net/pileup.shtml. These extension methods are used by the Bio::Util::Gngm object internally and
|
|
@@ -53,11 +76,10 @@ class Bio::DB::Pileup
|
|
|
53
76
|
# pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 2)
|
|
54
77
|
# pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 1, :ignore_reference_n => true)
|
|
55
78
|
def is_snp?(opts)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
end
|
|
79
|
+
return false if self.ref_base == '*'
|
|
80
|
+
#return false unless is_ct
|
|
81
|
+
return false if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
|
|
82
|
+
return true if self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
|
|
61
83
|
false
|
|
62
84
|
end
|
|
63
85
|
end
|
|
@@ -131,9 +153,9 @@ class Bio::DB::Vcf
|
|
|
131
153
|
end
|
|
132
154
|
|
|
133
155
|
#Returns true if only one variant allele is recorded. Loci with more than one allele are too complicated for now, so are discarded...
|
|
134
|
-
def has_just_one_variant?
|
|
135
|
-
|
|
136
|
-
end
|
|
156
|
+
#def has_just_one_variant?
|
|
157
|
+
# self.alternatives.length == 1 and self.variant?
|
|
158
|
+
#end
|
|
137
159
|
|
|
138
160
|
#Returns true if the position passes criteria
|
|
139
161
|
#
|
|
@@ -143,38 +165,27 @@ class Bio::DB::Vcf
|
|
|
143
165
|
#- :mapping_quality => 10
|
|
144
166
|
#
|
|
145
167
|
#Example
|
|
146
|
-
# vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25)
|
|
168
|
+
# vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25, :min_snp_quality => 20)
|
|
147
169
|
def pass_quality?(options)
|
|
148
|
-
(self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count])
|
|
170
|
+
(self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count] and self.qual >= options[:min_snp_quality])
|
|
149
171
|
end
|
|
150
172
|
|
|
151
|
-
#
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
case
|
|
155
|
-
when (not self.has_just_one_variant?) then false
|
|
156
|
-
when ( self.alt.length < self.ref.length and self.pass_quality?(options) ) then true
|
|
157
|
-
else false
|
|
158
|
-
end
|
|
159
|
-
rescue ## if something goes wrong, skip the postion,
|
|
173
|
+
#returns true if ref col has same length as all alternatives and position variant passes quality
|
|
174
|
+
def is_mnp?(options)
|
|
175
|
+
return true if self.alternatives.all? {|x| x.length == self.ref.length} and self.pass_quality?(options)
|
|
160
176
|
false
|
|
161
177
|
end
|
|
162
178
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
when (not self.has_just_one_variant?) then false
|
|
168
|
-
when ( self.alt.length > self.ref.length and self.pass_quality?(options) ) then true
|
|
169
|
-
else false
|
|
170
|
-
end
|
|
171
|
-
rescue ## if something goes wrong, skip the postion,
|
|
172
|
-
false
|
|
179
|
+
##returns true if ref col has length of 1 and is_mnp?
|
|
180
|
+
def is_snp?(options)
|
|
181
|
+
return true if self.is_mnp?(options) and self.ref.length == 1
|
|
182
|
+
false
|
|
173
183
|
end
|
|
174
184
|
|
|
175
|
-
#Returns true if
|
|
176
|
-
def is_indel?(
|
|
177
|
-
self.
|
|
185
|
+
#Returns true if ref col is different in length from any of the entries in alt column
|
|
186
|
+
def is_indel?(options)
|
|
187
|
+
return true if self.variant? and self.alternatives.any? {|x| x.length != self.ref.length} and self.pass_quality?(options)
|
|
188
|
+
false
|
|
178
189
|
end
|
|
179
190
|
|
|
180
191
|
|
|
@@ -321,19 +332,33 @@ class Gngm
|
|
|
321
332
|
#
|
|
322
333
|
# g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
|
|
323
334
|
# :format => :bam,
|
|
324
|
-
# :samtools => {:q => 20, :Q => 50
|
|
335
|
+
# :samtools => {:q => 20, :Q => 50},
|
|
325
336
|
# :fasta => "reference.fa"
|
|
326
|
-
#
|
|
337
|
+
# :start => 100,
|
|
338
|
+
# :stop => 200
|
|
327
339
|
# )
|
|
328
340
|
#
|
|
329
341
|
#Required parameters and defaults:
|
|
330
342
|
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
|
|
331
|
-
#- <tt>:format => :bam</tt> -
|
|
343
|
+
#- <tt>:format => :bam</tt> -one of :bam, :emap, :pileup (expected to be the 10 column format from samtools pileup -vcf) or :text (tab-separated chr, pos, ref, alt, freq)
|
|
344
|
+
#- <tt>:chromosome => nil</tt> -sequence id to look at
|
|
345
|
+
#- <tt>:start => nil</tt> -start position on that sequence
|
|
346
|
+
#- <tt>:stop => nil</tt> -stop position on that sequence
|
|
332
347
|
#- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
|
|
333
|
-
#- <tt>:samtools => {:q => 20, :Q => 50
|
|
348
|
+
#- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The :r region option is set automatically from :chromosome, :start and :stop
|
|
334
349
|
#Optional parameters and defaults:
|
|
350
|
+
|
|
335
351
|
#Most of these are parameters for specific methods and can be over-ridden when particular methods are called
|
|
336
|
-
#- <tt>:variant_call => {:indels => false,
|
|
352
|
+
#- <tt>:variant_call => {:indels => false,
|
|
353
|
+
# :min_depth => 2,
|
|
354
|
+
# :max_depth => 10000000,
|
|
355
|
+
# :min_snp_quality => 20,
|
|
356
|
+
# :mapping_quality => 10.0,
|
|
357
|
+
# :min_non_ref_count => 2,
|
|
358
|
+
# :ignore_reference_n => true,
|
|
359
|
+
# :min_consensus_quality => 20,
|
|
360
|
+
# :min_snp_quality => 20 }</tt>.
|
|
361
|
+
# For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
|
|
337
362
|
#- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
|
|
338
363
|
#- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
|
|
339
364
|
#- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
|
|
@@ -353,20 +378,30 @@ class Gngm
|
|
|
353
378
|
@density_max_y = nil #the maximum y value needed to plot the entire set density plots of threads and maintain a consistent scale for plots
|
|
354
379
|
@colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
|
|
355
380
|
@thread_colours = {}
|
|
381
|
+
@known_variants = nil #a list of variants to keep track of
|
|
356
382
|
@opts = {
|
|
357
383
|
:file => nil,
|
|
358
384
|
:format => :bam,
|
|
359
385
|
:fasta => nil,
|
|
360
386
|
:samtools => {:q => 20, :Q => 50},
|
|
361
|
-
|
|
387
|
+
:indels => false,
|
|
388
|
+
##:indels = call indels too, causes return of vcf
|
|
389
|
+
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
|
390
|
+
:variant_call => { :min_depth => 2,
|
|
391
|
+
:max_depth => 10000000,
|
|
392
|
+
:mapping_quality => 10.0,
|
|
393
|
+
:min_non_ref_count => 2,
|
|
394
|
+
:ignore_reference_n => true,
|
|
395
|
+
:shore_map => false,
|
|
396
|
+
:snp_file => :false,
|
|
397
|
+
:min_consensus_quality => 20,
|
|
398
|
+
:min_snp_quality => 20},
|
|
362
399
|
## some options are designed to be equivalent to the vcfutils.pl options from bcftools when using vcf
|
|
363
400
|
##:min_depth (-d)
|
|
364
401
|
##:max_depth (-D)
|
|
365
402
|
##:mapping_quality (-Q) minimum RMS mapping quality for SNPs (mq in info fields)
|
|
366
403
|
##:min_non_ref_count (-a) minimum num of alt bases ... the sum of the last two numbers in DP4 in info fields
|
|
367
404
|
##doesnt do anything with window filtering or pv values...
|
|
368
|
-
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
|
369
|
-
:variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true},
|
|
370
405
|
:histo_bin_width => 250000,
|
|
371
406
|
:graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil},
|
|
372
407
|
:adjust => 1,
|
|
@@ -376,6 +411,7 @@ class Gngm
|
|
|
376
411
|
:peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000} ##range is the width of the box to draw on the peak plot
|
|
377
412
|
}
|
|
378
413
|
@opts.merge!(options)
|
|
414
|
+
@opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
|
|
379
415
|
open_file
|
|
380
416
|
end
|
|
381
417
|
|
|
@@ -384,6 +420,7 @@ class Gngm
|
|
|
384
420
|
def open_file
|
|
385
421
|
case @opts[:format]
|
|
386
422
|
when :bam then open_bam
|
|
423
|
+
when :pileup, :text then open_text
|
|
387
424
|
end
|
|
388
425
|
end
|
|
389
426
|
|
|
@@ -394,6 +431,10 @@ class Gngm
|
|
|
394
431
|
@file.open
|
|
395
432
|
end
|
|
396
433
|
|
|
434
|
+
def open_text
|
|
435
|
+
@file = File.open(@opts[:file], "r")
|
|
436
|
+
end
|
|
437
|
+
|
|
397
438
|
public
|
|
398
439
|
#for BAM files calls Bio::DB::Sam#close to close the connections to input files safely
|
|
399
440
|
def close
|
|
@@ -415,16 +456,31 @@ class Gngm
|
|
|
415
456
|
#- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
|
|
416
457
|
#- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
|
|
417
458
|
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
|
418
|
-
|
|
459
|
+
#- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Andersen et al., 2009, Nature Methods 6(8). Requires a file of known SNPs between the mapping line (e.g. Ler in Andersen et al.) and a reference line (e.g. Col in Andersen et al.).
|
|
460
|
+
#- <tt>:snp_file</tt> -file of known SNPs in the format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
|
|
419
461
|
#When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
|
|
420
462
|
#
|
|
421
|
-
#
|
|
463
|
+
#Calculates or returns the value of the instance variable @snp_positions. Positions are only computed the first time it is called; subsequent calls return the pre-computed positions and statistics, so changing parameters has no effect.
|
|
422
464
|
def snp_positions(optsa={})
|
|
423
465
|
opts = @opts[:variant_call].merge(optsa)
|
|
424
466
|
return @snp_positions if @snp_positions
|
|
425
|
-
case
|
|
426
|
-
when
|
|
467
|
+
case @opts[:format]
|
|
468
|
+
when :bam then get_snp_positions_from_bam(opts)
|
|
469
|
+
when :text then get_snp_positions_from_text(opts)
|
|
470
|
+
when :pileup then get_snp_positions_from_pileup(opts)
|
|
471
|
+
end
|
|
472
|
+
end
|
|
473
|
+
|
|
474
|
+
##allows the user to assign SNP positions
|
|
475
|
+
def snp_positions=(arr)
|
|
476
|
+
@snp_positions = arr
|
|
477
|
+
end
|
|
478
|
+
|
|
479
|
+
def is_allowed_substitution?(ref,alt,opts)
|
|
480
|
+
if opts[:substitutions].instance_of?(Array)
|
|
481
|
+
return false unless opts[:substitutions].include?("#{ref}:#{alt}")
|
|
427
482
|
end
|
|
483
|
+
true
|
|
428
484
|
end
|
|
429
485
|
|
|
430
486
|
private
|
|
@@ -432,18 +488,14 @@ class Gngm
|
|
|
432
488
|
#Sets @snp_positions
|
|
433
489
|
def get_snp_positions_from_bam(options={})
|
|
434
490
|
opts = @opts[:variant_call].merge(options)
|
|
435
|
-
if opts[:indels] and (opts[:deletions_only] or opts[:insertions_only])
|
|
436
|
-
raise "Cant have indels and deletions only or insertions only, need to specify ':indels => true' to get both"
|
|
437
|
-
end
|
|
438
491
|
arr = []
|
|
439
|
-
##when we are calling mpileup_plus we need to add :g to the samtools options
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
end
|
|
492
|
+
##when we are calling mpileup_plus (i.e. when indels are requested) we need to add :g to the samtools options
|
|
493
|
+
@opts[:samtools][:g] = true if opts[:indels]
|
|
494
|
+
|
|
443
495
|
|
|
444
496
|
if not @opts[:samtools][:g]
|
|
445
497
|
@file.mpileup(@opts[:samtools]) do |pileup|
|
|
446
|
-
arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts)
|
|
498
|
+
arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
|
447
499
|
end
|
|
448
500
|
else
|
|
449
501
|
@file.mpileup_plus(@opts[:samtools]) do |vcf|
|
|
@@ -451,19 +503,56 @@ class Gngm
|
|
|
451
503
|
next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
|
|
452
504
|
##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
|
|
453
505
|
if opts[:indels]
|
|
454
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts)
|
|
455
|
-
|
|
456
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.
|
|
457
|
-
elsif opts[:insertions_only]
|
|
458
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_insertion?(opts)
|
|
506
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
|
507
|
+
else
|
|
508
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
|
459
509
|
end
|
|
460
510
|
end
|
|
461
511
|
end
|
|
462
512
|
|
|
463
513
|
@snp_positions = arr
|
|
514
|
+
|
|
464
515
|
arr
|
|
465
516
|
end
|
|
466
517
|
|
|
518
|
+
private
|
|
519
|
+
def get_snp_positions_from_map(options={})
|
|
520
|
+
arr = []
|
|
521
|
+
opts = @opts[:variant_call].merge(options)
|
|
522
|
+
end
|
|
523
|
+
|
|
524
|
+
#this does not filter SNPs other than to check that they are in the right region and are allowed substitutions; no quality control is applied, as it is assumed to have been done beforehand
|
|
525
|
+
#text file is of format chr\tpos\tref\talt\tfreq\n
|
|
526
|
+
def get_snp_positions_from_text(options={})
|
|
527
|
+
arr = []
|
|
528
|
+
opts = @opts[:variant_call].merge(options)
|
|
529
|
+
@file.each do |line|
|
|
530
|
+
chr,pos,ref,alt,freq = line.chomp.split("\t")
|
|
531
|
+
pos = pos.to_i
|
|
532
|
+
freq = freq.to_f
|
|
533
|
+
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
|
|
534
|
+
arr << [pos, freq]
|
|
535
|
+
end
|
|
536
|
+
@snp_positions = arr
|
|
537
|
+
end
|
|
538
|
+
|
|
539
|
+
private
|
|
540
|
+
def get_snp_positions_from_pileup(options={})
|
|
541
|
+
arr = []
|
|
542
|
+
opts = @opts[:variant_call].merge(options)
|
|
543
|
+
@file.each do |line|
|
|
544
|
+
pileup = Bio::DB::Pileup.new(line)
|
|
545
|
+
if pileup.ref_name != @opts[:chromosome] or pileup.pos < @opts[:start] or pileup.pos > @opts[:stop]
|
|
546
|
+
next
|
|
547
|
+
end
|
|
548
|
+
#old fashioned 10 col pileup format has extra fields we can use if needed
|
|
549
|
+
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
|
|
550
|
+
arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
|
551
|
+
end
|
|
552
|
+
end
|
|
553
|
+
@snp_positions = arr
|
|
554
|
+
end
|
|
555
|
+
|
|
467
556
|
private
|
|
468
557
|
#Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
|
|
469
558
|
def get_insert_size_frequency_from_bam(opts={})
|
|
@@ -815,7 +904,7 @@ class Gngm
|
|
|
815
904
|
@peak_indices = nil #needs resetting as we are working with new cluster
|
|
816
905
|
@peak_y_values = nil #needs resetting as we are working with new cluster
|
|
817
906
|
self.calculate_densities(options[:adjust])
|
|
818
|
-
@clusters = Array.new
|
|
907
|
+
@clusters = Array.new(@densities.length) {|x| 1 + x}
|
|
819
908
|
##now set the cluster colours..
|
|
820
909
|
colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
|
|
821
910
|
ci = 0
|
|
@@ -916,7 +1005,7 @@ class Gngm
|
|
|
916
1005
|
r.quit
|
|
917
1006
|
end
|
|
918
1007
|
|
|
919
|
-
private
|
|
1008
|
+
#private
|
|
920
1009
|
#Calculates the position of peaks in the signal curve
|
|
921
1010
|
def get_peaks(opts=@opts[:peaks])
|
|
922
1011
|
opts[:background] = opts[:background].to_s.upcase
|
|
@@ -1023,7 +1112,29 @@ class Gngm
|
|
|
1023
1112
|
return r
|
|
1024
1113
|
end
|
|
1025
1114
|
|
|
1115
|
+
private
|
|
1116
|
+
#returns an array of arrays of known variants
|
|
1117
|
+
#file:
|
|
1118
|
+
#chr1 500 A G
|
|
1119
|
+
#chr2 1000 ATG TTA
|
|
1120
|
+
#chr3 1500 . TTGGA
|
|
1121
|
+
# returns [["chr1", "500", "A", "G"], ["chr2", "1000", "ATG", "TTA"], ["chr3", "1500", ".", "TTGGA"]]
|
|
1122
|
+
def parse_known_variants(file)
|
|
1123
|
+
File.open(file, "r").readlines.collect {|x| x.chomp.split("\t")}
|
|
1124
|
+
end
|
|
1026
1125
|
|
|
1126
|
+
public
|
|
1127
|
+
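#Intended to delete everything from self.snp_positions not mentioned by position in self.known_variants, directly modifying self.snp_positions. NOTE: the filtering is not implemented yet; the loop over @snp_positions is currently empty, so nothing is removed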
|
|
1128
|
+
def keep_known_variants(file=nil)
|
|
1129
|
+
raise "file of known variants not provided and @known_variants is nil" if @known_variants.nil? and file.nil?
|
|
1130
|
+
@known_variants = parse_known_variants(file) if @known_variants.nil? and file
|
|
1131
|
+
@snp_positions.each do |snp|
|
|
1132
|
+
end
|
|
1133
|
+
end
|
|
1134
|
+
|
|
1027
1135
|
end
|
|
1028
1136
|
end
|
|
1029
1137
|
end
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require 'bio'
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
module Bio
|
|
5
|
+
class Util
|
|
6
|
+
def self.read_gff3(fn)
|
|
7
|
+
genearray=Array.new
|
|
8
|
+
mRNAhash=Hash.new
|
|
9
|
+
exonhash=Hash.new
|
|
10
|
+
tehash=Hash.new
|
|
11
|
+
lastid = ''
|
|
12
|
+
lastrecord = nil
|
|
13
|
+
gff3 = Bio::GFF::GFF3.new(File.read(fn))
|
|
14
|
+
gff3.records.each do | record |
|
|
15
|
+
feature_type = record.feature_type
|
|
16
|
+
if(feature_type == 'gene')
|
|
17
|
+
genearray << record.id
|
|
18
|
+
elsif(feature_type == 'mRNA')
|
|
19
|
+
parent = record.get_attribute('Parent')
|
|
20
|
+
if mRNAhash[parent] == nil
|
|
21
|
+
mRNAhash[parent] = [record]
|
|
22
|
+
else
|
|
23
|
+
mRNAhash[parent] << record
|
|
24
|
+
end
|
|
25
|
+
elsif(feature_type == 'transposable_element')
|
|
26
|
+
#--- not yet implemented
|
|
27
|
+
elsif(feature_type == 'exon')
|
|
28
|
+
parents = record.get_attributes('Parent')
|
|
29
|
+
parents.each do |parent|
|
|
30
|
+
if exonhash[parent] == nil
|
|
31
|
+
exonhash[parent] = Array.new
|
|
32
|
+
end
|
|
33
|
+
exonhash[parent] << record
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -14,36 +14,48 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
|
14
14
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
15
15
|
require 'bio-gngm'
|
|
16
16
|
require 'bio'
|
|
17
|
+
require 'pp'
|
|
17
18
|
length = 0
|
|
18
19
|
chr_name = ""
|
|
19
|
-
file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
|
|
20
|
-
file.each do |entry|
|
|
21
|
-
length = entry.length
|
|
22
|
-
chr_name = entry.entry_id
|
|
23
|
-
end
|
|
20
|
+
#file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
|
|
21
|
+
#file.each do |entry|
|
|
22
|
+
# length = entry.length
|
|
23
|
+
# chr_name = entry.entry_id
|
|
24
|
+
#end
|
|
24
25
|
|
|
25
26
|
|
|
26
|
-
region = "
|
|
27
|
+
region = "1:2000000-3000000"
|
|
27
28
|
|
|
28
29
|
puts "analyzing - #{region}"
|
|
29
30
|
|
|
30
|
-
g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/
|
|
31
|
+
g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/insertion_finding/athal/aln.sort.bam",
|
|
31
32
|
:format => :bam,
|
|
32
|
-
:fasta => "/Users/macleand/Desktop/
|
|
33
|
-
:samtools => {:q =>
|
|
33
|
+
:fasta => "/Users/macleand/Desktop/insertion_finding/athal/chr1.fa",
|
|
34
|
+
:samtools => {:q => 50, :Q => 13, :r => region
|
|
34
35
|
}
|
|
35
36
|
)
|
|
36
37
|
|
|
37
|
-
g.get_unmapped_mate_frequency(:ref_window_size =>
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
g.get_unmapped_mate_frequency(:ref_window_size => 152, :ref_window_slide => 10)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
g.collect_threads(:start => 0.0, :stop => 1.0, :slide => 0.1, :size => 0.1)
|
|
43
|
+
pp g.threads
|
|
44
|
+
g.threads.delete_if {|x| x.last.length <= 3 }
|
|
40
45
|
|
|
41
46
|
begin
|
|
42
|
-
g.calculate_clusters(:pseudo => true)
|
|
43
|
-
|
|
44
|
-
|
|
47
|
+
#g.calculate_clusters(:pseudo => true)
|
|
48
|
+
g.calculate_clusters(:k => 4, :adjust => 0.5, :control_chd => 0.0, :expected_chd => 0.3, :pseudo => false)
|
|
49
|
+
filename = "deletion_real_data#{region}_all_threads.png"
|
|
50
|
+
g.draw_threads(filename, :draw_legend => "deletion_real_data#{region}_legend.png")
|
|
45
51
|
##no bands or signal to draw without clustering...
|
|
46
|
-
filename = "
|
|
52
|
+
filename = "deletion_real_data#{region}_bands.png"
|
|
53
|
+
g.draw_bands(filename)
|
|
54
|
+
filename = "deletion_real_data#{region}_signal.png"
|
|
55
|
+
g.draw_signal(filename)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
filename = "deletion_real_data_#{region}_hits.png"
|
|
47
59
|
g.draw_hit_count(filename)
|
|
48
60
|
rescue Exception => e
|
|
49
61
|
puts e.message, e.backtrace
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bio-gngm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-
|
|
12
|
+
date: 2012-11-15 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: bio
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &70148237802860 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
@@ -21,10 +21,10 @@ dependencies:
|
|
|
21
21
|
version: 1.4.2
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *70148237802860
|
|
25
25
|
- !ruby/object:Gem::Dependency
|
|
26
26
|
name: bio-samtools
|
|
27
|
-
requirement: &
|
|
27
|
+
requirement: &70148237802000 !ruby/object:Gem::Requirement
|
|
28
28
|
none: false
|
|
29
29
|
requirements:
|
|
30
30
|
- - ! '>='
|
|
@@ -32,10 +32,10 @@ dependencies:
|
|
|
32
32
|
version: 0.5.0
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *70148237802000
|
|
36
36
|
- !ruby/object:Gem::Dependency
|
|
37
37
|
name: rinruby
|
|
38
|
-
requirement: &
|
|
38
|
+
requirement: &70148237801440 !ruby/object:Gem::Requirement
|
|
39
39
|
none: false
|
|
40
40
|
requirements:
|
|
41
41
|
- - ! '>='
|
|
@@ -43,10 +43,10 @@ dependencies:
|
|
|
43
43
|
version: 2.0.2
|
|
44
44
|
type: :runtime
|
|
45
45
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *70148237801440
|
|
47
47
|
- !ruby/object:Gem::Dependency
|
|
48
48
|
name: shoulda
|
|
49
|
-
requirement: &
|
|
49
|
+
requirement: &70148237800620 !ruby/object:Gem::Requirement
|
|
50
50
|
none: false
|
|
51
51
|
requirements:
|
|
52
52
|
- - ! '>='
|
|
@@ -54,10 +54,10 @@ dependencies:
|
|
|
54
54
|
version: '0'
|
|
55
55
|
type: :development
|
|
56
56
|
prerelease: false
|
|
57
|
-
version_requirements: *
|
|
57
|
+
version_requirements: *70148237800620
|
|
58
58
|
- !ruby/object:Gem::Dependency
|
|
59
59
|
name: bundler
|
|
60
|
-
requirement: &
|
|
60
|
+
requirement: &70148237793360 !ruby/object:Gem::Requirement
|
|
61
61
|
none: false
|
|
62
62
|
requirements:
|
|
63
63
|
- - ~>
|
|
@@ -65,10 +65,10 @@ dependencies:
|
|
|
65
65
|
version: 1.0.0
|
|
66
66
|
type: :development
|
|
67
67
|
prerelease: false
|
|
68
|
-
version_requirements: *
|
|
68
|
+
version_requirements: *70148237793360
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: jeweler
|
|
71
|
-
requirement: &
|
|
71
|
+
requirement: &70148237791880 !ruby/object:Gem::Requirement
|
|
72
72
|
none: false
|
|
73
73
|
requirements:
|
|
74
74
|
- - ! '>='
|
|
@@ -76,10 +76,10 @@ dependencies:
|
|
|
76
76
|
version: '0'
|
|
77
77
|
type: :development
|
|
78
78
|
prerelease: false
|
|
79
|
-
version_requirements: *
|
|
79
|
+
version_requirements: *70148237791880
|
|
80
80
|
- !ruby/object:Gem::Dependency
|
|
81
81
|
name: rcov
|
|
82
|
-
requirement: &
|
|
82
|
+
requirement: &70148237789480 !ruby/object:Gem::Requirement
|
|
83
83
|
none: false
|
|
84
84
|
requirements:
|
|
85
85
|
- - ! '>='
|
|
@@ -87,10 +87,10 @@ dependencies:
|
|
|
87
87
|
version: '0'
|
|
88
88
|
type: :development
|
|
89
89
|
prerelease: false
|
|
90
|
-
version_requirements: *
|
|
90
|
+
version_requirements: *70148237789480
|
|
91
91
|
- !ruby/object:Gem::Dependency
|
|
92
92
|
name: bio
|
|
93
|
-
requirement: &
|
|
93
|
+
requirement: &70148237788820 !ruby/object:Gem::Requirement
|
|
94
94
|
none: false
|
|
95
95
|
requirements:
|
|
96
96
|
- - ! '>='
|
|
@@ -98,10 +98,10 @@ dependencies:
|
|
|
98
98
|
version: 1.4.2
|
|
99
99
|
type: :development
|
|
100
100
|
prerelease: false
|
|
101
|
-
version_requirements: *
|
|
101
|
+
version_requirements: *70148237788820
|
|
102
102
|
- !ruby/object:Gem::Dependency
|
|
103
103
|
name: bio-samtools
|
|
104
|
-
requirement: &
|
|
104
|
+
requirement: &70148237788260 !ruby/object:Gem::Requirement
|
|
105
105
|
none: false
|
|
106
106
|
requirements:
|
|
107
107
|
- - ! '>='
|
|
@@ -109,10 +109,10 @@ dependencies:
|
|
|
109
109
|
version: 0.5.0
|
|
110
110
|
type: :development
|
|
111
111
|
prerelease: false
|
|
112
|
-
version_requirements: *
|
|
112
|
+
version_requirements: *70148237788260
|
|
113
113
|
- !ruby/object:Gem::Dependency
|
|
114
114
|
name: rinruby
|
|
115
|
-
requirement: &
|
|
115
|
+
requirement: &70148237787540 !ruby/object:Gem::Requirement
|
|
116
116
|
none: false
|
|
117
117
|
requirements:
|
|
118
118
|
- - ! '>='
|
|
@@ -120,10 +120,10 @@ dependencies:
|
|
|
120
120
|
version: 2.0.2
|
|
121
121
|
type: :development
|
|
122
122
|
prerelease: false
|
|
123
|
-
version_requirements: *
|
|
123
|
+
version_requirements: *70148237787540
|
|
124
124
|
- !ruby/object:Gem::Dependency
|
|
125
125
|
name: rdoc
|
|
126
|
-
requirement: &
|
|
126
|
+
requirement: &70148237787000 !ruby/object:Gem::Requirement
|
|
127
127
|
none: false
|
|
128
128
|
requirements:
|
|
129
129
|
- - ! '>='
|
|
@@ -131,7 +131,7 @@ dependencies:
|
|
|
131
131
|
version: '0'
|
|
132
132
|
type: :development
|
|
133
133
|
prerelease: false
|
|
134
|
-
version_requirements: *
|
|
134
|
+
version_requirements: *70148237787000
|
|
135
135
|
description: Identify causative mutations in a model genome from NGS reads using the
|
|
136
136
|
NGM method.
|
|
137
137
|
email: maclean.daniel@gmail.com
|
|
@@ -213,6 +213,7 @@ files:
|
|
|
213
213
|
- examples/use_indels.rb
|
|
214
214
|
- lib/bio-gngm.rb
|
|
215
215
|
- lib/bio/util/bio-gngm.rb
|
|
216
|
+
- lib/bio/util/mutation_effects.rb
|
|
216
217
|
- scripts/get_subseq.rb
|
|
217
218
|
- scripts/make_histograms_laerfyve.rb
|
|
218
219
|
- scripts/make_histograms_laerfyve_stitched.rb
|
|
@@ -260,7 +261,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
260
261
|
version: '0'
|
|
261
262
|
segments:
|
|
262
263
|
- 0
|
|
263
|
-
hash:
|
|
264
|
+
hash: 729394422036910323
|
|
264
265
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
265
266
|
none: false
|
|
266
267
|
requirements:
|