bio-gngm 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bio-gngm.gemspec +3 -2
- data/examples/make_histograms.rb +43 -17
- data/lib/bio-gngm.rb +1 -0
- data/lib/bio/util/bio-gngm.rb +171 -60
- data/lib/bio/util/mutation_effects.rb +39 -0
- data/scripts/make_threads_unmapped_simulation.rb +28 -16
- metadata +26 -25
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/bio-gngm.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-gngm"
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dan MacLean"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-11-15"
|
13
13
|
s.description = "Identify causative mutations in a model genome from NGS reads using the NGM method."
|
14
14
|
s.email = "maclean.daniel@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -89,6 +89,7 @@ Gem::Specification.new do |s|
|
|
89
89
|
"examples/use_indels.rb",
|
90
90
|
"lib/bio-gngm.rb",
|
91
91
|
"lib/bio/util/bio-gngm.rb",
|
92
|
+
"lib/bio/util/mutation_effects.rb",
|
92
93
|
"scripts/get_subseq.rb",
|
93
94
|
"scripts/make_histograms_laerfyve.rb",
|
94
95
|
"scripts/make_histograms_laerfyve_stitched.rb",
|
data/examples/make_histograms.rb
CHANGED
@@ -14,27 +14,53 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
14
14
|
require 'bio-gngm'
|
15
15
|
require 'bio-samtools'
|
16
16
|
|
17
|
+
#def make_snp_array(id, file)
|
18
|
+
# a = []
|
19
|
+
# File.open(file, "r").each do |line|
|
20
|
+
# arr = line.split(/\t/)
|
21
|
+
# next if arr[0] !~ /#{id}/
|
22
|
+
# a << [a]
|
23
|
+
# end
|
24
|
+
#end
|
17
25
|
|
26
|
+
sequences = Bio::DB::FastaLengthDB.new(:file => ARGV[0])
|
27
|
+
$stderr.puts "Loaded sequences..."
|
18
28
|
|
19
29
|
|
20
|
-
|
21
|
-
g = Bio::Util::Gngm.new(:file =>
|
22
|
-
|
23
|
-
:
|
24
|
-
:
|
30
|
+
sequences.each do |id, length|
|
31
|
+
g = Bio::Util::Gngm.new(:file => ARGV[1],
|
32
|
+
#:format => :pileup,
|
33
|
+
:format => :bam,
|
34
|
+
:fasta => ARGV[0],
|
35
|
+
:chromosome => id,
|
36
|
+
:start => 1,
|
37
|
+
:stop => length,
|
38
|
+
:samtools => {
|
25
39
|
:q => 20,
|
26
|
-
:Q =>
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
40
|
+
:Q => 20
|
41
|
+
},
|
42
|
+
:variant_call => {
|
43
|
+
:indels => false,
|
44
|
+
:deletions_only => false,
|
45
|
+
:insertions_only => false,
|
46
|
+
:min_depth => 6,
|
47
|
+
:max_depth => 250,
|
48
|
+
:mapping_quality => 20.0,
|
49
|
+
:min_non_ref_count => 2,
|
50
|
+
:ignore_reference_n => true,
|
51
|
+
:min_snp_quality => 20,
|
52
|
+
:min_consensus_quality => 20
|
53
|
+
}
|
54
|
+
)
|
55
|
+
$stderr.puts "getting #{id}.."
|
56
|
+
|
57
|
+
g.snp_positions
|
58
|
+
puts "Found #{g.snp_positions.length} SNPs"
|
59
|
+
[100000, 250000, 500000].each do |bin_width|
|
60
|
+
$stderr.puts "working on #{bin_width} windows"
|
61
|
+
file_name = "test_#{id}_#{bin_width}.png"
|
35
62
|
g.frequency_histogram("#{file_name}",bin_width)
|
36
|
-
end
|
37
|
-
|
38
|
-
#close the BAM file
|
63
|
+
end
|
39
64
|
g.close
|
65
|
+
end
|
40
66
|
|
data/lib/bio-gngm.rb
CHANGED
data/lib/bio/util/bio-gngm.rb
CHANGED
@@ -5,13 +5,36 @@
|
|
5
5
|
# Created by Dan MacLean (TSL) on 2011-12-07.
|
6
6
|
# Copyright (c) . All rights reserved.
|
7
7
|
###################################################
|
8
|
-
|
8
|
+
require 'rubygems'
|
9
9
|
require 'rinruby'
|
10
10
|
require 'bio-samtools'
|
11
11
|
require 'bio/db/pileup'
|
12
12
|
require 'bio/db/vcf'
|
13
13
|
require 'pp'
|
14
14
|
|
15
|
+
=begin
|
16
|
+
Simple class representing a file of FASTA format sequences and each one's length
|
17
|
+
=end
|
18
|
+
class Bio::DB::FastaLengthDB
|
19
|
+
require 'bio'
|
20
|
+
def initialize(args)
|
21
|
+
@file = args[:file]
|
22
|
+
@seqs = {}
|
23
|
+
file = Bio::FastaFormat.open(@file)
|
24
|
+
file.each do |entry|
|
25
|
+
@seqs[entry.entry_id] = entry.length
|
26
|
+
end
|
27
|
+
|
28
|
+
def each
|
29
|
+
@seqs.keys.sort.each do |k|
|
30
|
+
yield k, @seqs[k]
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
|
15
38
|
=begin
|
16
39
|
Extends the methods of the Bio::DB::Pileup class in bio-samtools. A pileup object represents the SAMtools pileup format at
|
17
40
|
http://samtools.sourceforge.net/pileup.shtml. These extension methods are used by the Bio::Util::Gngm object internally and
|
@@ -53,11 +76,10 @@ class Bio::DB::Pileup
|
|
53
76
|
# pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 2)
|
54
77
|
# pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 1, :ignore_reference_n => true)
|
55
78
|
def is_snp?(opts)
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
end
|
79
|
+
return false if self.ref_base == '*'
|
80
|
+
#return false unless is_ct
|
81
|
+
return false if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
|
82
|
+
return true if self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
|
61
83
|
false
|
62
84
|
end
|
63
85
|
end
|
@@ -131,9 +153,9 @@ class Bio::DB::Vcf
|
|
131
153
|
end
|
132
154
|
|
133
155
|
#Returns true if only one variant allele is recorded. Loci with more than one allele are too complicated for now, so are discarded...
|
134
|
-
def has_just_one_variant?
|
135
|
-
|
136
|
-
end
|
156
|
+
#def has_just_one_variant?
|
157
|
+
# self.alternatives.length == 1 and self.variant?
|
158
|
+
#end
|
137
159
|
|
138
160
|
#Returns true if the position passes criteria
|
139
161
|
#
|
@@ -143,38 +165,27 @@ class Bio::DB::Vcf
|
|
143
165
|
#- :mapping_quality => 10
|
144
166
|
#
|
145
167
|
#Example
|
146
|
-
# vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25)
|
168
|
+
# vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25, :min_snp_quality => 20)
|
147
169
|
def pass_quality?(options)
|
148
|
-
(self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count])
|
170
|
+
(self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count] and self.qual >= options[:min_snp_quality])
|
149
171
|
end
|
150
172
|
|
151
|
-
#
|
152
|
-
|
153
|
-
|
154
|
-
case
|
155
|
-
when (not self.has_just_one_variant?) then false
|
156
|
-
when ( self.alt.length < self.ref.length and self.pass_quality?(options) ) then true
|
157
|
-
else false
|
158
|
-
end
|
159
|
-
rescue ## if something goes wrong, skip the postion,
|
173
|
+
#returns true if ref col has same length as all alternatives and position variant passes quality
|
174
|
+
def is_mnp?(options)
|
175
|
+
return true if self.alternatives.all? {|x| x.length == self.ref.length} and self.pass_quality?(options)
|
160
176
|
false
|
161
177
|
end
|
162
178
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
when (not self.has_just_one_variant?) then false
|
168
|
-
when ( self.alt.length > self.ref.length and self.pass_quality?(options) ) then true
|
169
|
-
else false
|
170
|
-
end
|
171
|
-
rescue ## if something goes wrong, skip the postion,
|
172
|
-
false
|
179
|
+
##returns true if ref col has length of 1 and is_mnp?
|
180
|
+
def is_snp?(options)
|
181
|
+
return true if self.is_mnp?(options) and self.ref.length == 1
|
182
|
+
false
|
173
183
|
end
|
174
184
|
|
175
|
-
#Returns true if
|
176
|
-
def is_indel?(
|
177
|
-
self.
|
185
|
+
#Returns true if ref col is different in length from any of the entries in alt column
|
186
|
+
def is_indel?(options)
|
187
|
+
return true if self.variant? and self.alternatives.any? {|x| x.length != self.ref.length} and self.pass_quality?(options)
|
188
|
+
false
|
178
189
|
end
|
179
190
|
|
180
191
|
|
@@ -321,19 +332,33 @@ class Gngm
|
|
321
332
|
#
|
322
333
|
# g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
|
323
334
|
# :format => :bam,
|
324
|
-
# :samtools => {:q => 20, :Q => 50
|
335
|
+
# :samtools => {:q => 20, :Q => 50},
|
325
336
|
# :fasta => "reference.fa"
|
326
|
-
#
|
337
|
+
# :start => 100,
|
338
|
+
# :stop => 200
|
327
339
|
# )
|
328
340
|
#
|
329
341
|
#Required parameters and defaults:
|
330
342
|
#- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
|
331
|
-
#- <tt>:format => :bam</tt> -
|
343
|
+
#- <tt>:format => :bam</tt> -either :bam, :emap, :pileup (pileup expected to be 10 col format from samtools -vcf)
|
344
|
+
#- <tt>:chromosome => nil</tt> -sequence id to look at
|
345
|
+
#- <tt>:start => nil</tt> -start position on that sequence
|
346
|
+
#- <tt>:stop => nil</tt> -stop position on that sequence
|
332
347
|
#- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
|
333
|
-
#- <tt>:samtools => {:q => 20, :Q => 50
|
348
|
+
#- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The :r (region) option is filled in from :chromosome, :start and :stop when the object is created
|
334
349
|
#Optional parameters and defaults:
|
350
|
+
|
335
351
|
#Most of these are parameters for specific methods and can be over-ridden when particular methods are called
|
336
|
-
#- <tt>:variant_call => {:indels => false,
|
352
|
+
#- <tt>:variant_call => {:indels => false,
|
353
|
+
# :min_depth => 2,
|
354
|
+
# :max_depth => 10000000,
|
355
|
+
# :min_snp_quality => 20,
|
356
|
+
# :mapping_quality => 10.0,
|
357
|
+
# :min_non_ref_count => 2,
|
358
|
+
# :ignore_reference_n => true,
|
359
|
+
# :min_consensus_quality => 20,
|
360
|
+
# :min_snp_quality => 20 }</tt>.
|
361
|
+
# For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
|
337
362
|
#- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
|
338
363
|
#- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
|
339
364
|
#- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
|
@@ -353,20 +378,30 @@ class Gngm
|
|
353
378
|
@density_max_y = nil #the maximum y value needed to plot the entire set density plots of threads and maintain a consistent scale for plots
|
354
379
|
@colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
|
355
380
|
@thread_colours = {}
|
381
|
+
@known_variants = nil #a list of variants to keep track of
|
356
382
|
@opts = {
|
357
383
|
:file => nil,
|
358
384
|
:format => :bam,
|
359
385
|
:fasta => nil,
|
360
386
|
:samtools => {:q => 20, :Q => 50},
|
361
|
-
|
387
|
+
:indels => false,
|
388
|
+
##:indels = call indels too, causes return of vcf
|
389
|
+
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
390
|
+
:variant_call => { :min_depth => 2,
|
391
|
+
:max_depth => 10000000,
|
392
|
+
:mapping_quality => 10.0,
|
393
|
+
:min_non_ref_count => 2,
|
394
|
+
:ignore_reference_n => true,
|
395
|
+
:shore_map => false,
|
396
|
+
:snp_file => :false,
|
397
|
+
:min_consensus_quality => 20,
|
398
|
+
:min_snp_quality => 20},
|
362
399
|
## some options are designed to be equivalent to the vcfutils.pl options from bcftools when using vcf
|
363
400
|
##:min_depth (-d)
|
364
401
|
##:max_depth (-D)
|
365
402
|
##:mapping_quality (-Q) minimum RMS mapping quality for SNPs (mq in info fields)
|
366
403
|
##:min_non_ref_count (-a) minimum num of alt bases ... the sum of the last two numbers in DP4 in info fields
|
367
404
|
##doesn't do anything with window filtering or pv values...
|
368
|
-
:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
|
369
|
-
:variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true},
|
370
405
|
:histo_bin_width => 250000,
|
371
406
|
:graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil},
|
372
407
|
:adjust => 1,
|
@@ -376,6 +411,7 @@ class Gngm
|
|
376
411
|
:peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000} ##range is the width of the box to draw on the peak plot
|
377
412
|
}
|
378
413
|
@opts.merge!(options)
|
414
|
+
@opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
|
379
415
|
open_file
|
380
416
|
end
|
381
417
|
|
@@ -384,6 +420,7 @@ class Gngm
|
|
384
420
|
def open_file
|
385
421
|
case @opts[:format]
|
386
422
|
when :bam then open_bam
|
423
|
+
when :pileup, :text then open_text
|
387
424
|
end
|
388
425
|
end
|
389
426
|
|
@@ -394,6 +431,10 @@ class Gngm
|
|
394
431
|
@file.open
|
395
432
|
end
|
396
433
|
|
434
|
+
def open_text
|
435
|
+
@file = File.open(@opts[:file], "r")
|
436
|
+
end
|
437
|
+
|
397
438
|
public
|
398
439
|
#for BAM files calls Bio::DB::Sam#close to close the connections to input files safely
|
399
440
|
def close
|
@@ -415,16 +456,31 @@ class Gngm
|
|
415
456
|
#- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
|
416
457
|
#- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
|
417
458
|
#- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
|
418
|
-
|
459
|
+
#- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Andersen et al., 2009, Nature Methods 6. 8. Requires a file of known SNPs between the mapping line (eg Ler in Andersen et al.) and a reference line (eg Col in Andersen et al.).
|
460
|
+
#- <tt>:snp_file => :false</tt> -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
|
419
461
|
#When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
|
420
462
|
#
|
421
|
-
#
|
463
|
+
#calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
|
422
464
|
def snp_positions(optsa={})
|
423
465
|
opts = @opts[:variant_call].merge(optsa)
|
424
466
|
return @snp_positions if @snp_positions
|
425
|
-
case
|
426
|
-
when
|
467
|
+
case @opts[:format]
|
468
|
+
when :bam then get_snp_positions_from_bam(opts)
|
469
|
+
when :text then get_snp_positions_from_text(opts)
|
470
|
+
when :pileup then get_snp_positions_from_pileup(opts)
|
471
|
+
end
|
472
|
+
end
|
473
|
+
|
474
|
+
##allows the user to assign SNP positions
|
475
|
+
def snp_positions=(arr)
|
476
|
+
@snp_positions = arr
|
477
|
+
end
|
478
|
+
|
479
|
+
def is_allowed_substitution?(ref,alt,opts)
|
480
|
+
if opts[:substitutions].instance_of?(Array)
|
481
|
+
return false unless opts[:substitutions].include?("#{ref}:#{alt}")
|
427
482
|
end
|
483
|
+
true
|
428
484
|
end
|
429
485
|
|
430
486
|
private
|
@@ -432,18 +488,14 @@ class Gngm
|
|
432
488
|
#Sets @snp_positions
|
433
489
|
def get_snp_positions_from_bam(options={})
|
434
490
|
opts = @opts[:variant_call].merge(options)
|
435
|
-
if opts[:indels] and (opts[:deletions_only] or opts[:insertions_only])
|
436
|
-
raise "Cant have indels and deletions only or insertions only, need to specify ':indels => true' to get both"
|
437
|
-
end
|
438
491
|
arr = []
|
439
|
-
##when we are calling mpileup_plus we need to add :g to the samtools options
|
440
|
-
|
441
|
-
|
442
|
-
end
|
492
|
+
##when we are calling mpileup_plus we need to add :g to the samtools options #alw
|
493
|
+
@opts[:samtools][:g] = true if opts[:indels]
|
494
|
+
|
443
495
|
|
444
496
|
if not @opts[:samtools][:g]
|
445
497
|
@file.mpileup(@opts[:samtools]) do |pileup|
|
446
|
-
arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts)
|
498
|
+
arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
447
499
|
end
|
448
500
|
else
|
449
501
|
@file.mpileup_plus(@opts[:samtools]) do |vcf|
|
@@ -451,19 +503,56 @@ class Gngm
|
|
451
503
|
next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
|
452
504
|
##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
|
453
505
|
if opts[:indels]
|
454
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts)
|
455
|
-
|
456
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.
|
457
|
-
elsif opts[:insertions_only]
|
458
|
-
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_insertion?(opts)
|
506
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
507
|
+
else
|
508
|
+
arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
|
459
509
|
end
|
460
510
|
end
|
461
511
|
end
|
462
512
|
|
463
513
|
@snp_positions = arr
|
514
|
+
|
464
515
|
arr
|
465
516
|
end
|
466
517
|
|
518
|
+
private
|
519
|
+
def get_snp_positions_from_map(options={})
|
520
|
+
arr = []
|
521
|
+
opts = @opts[:variant_call].merge(options)
|
522
|
+
end
|
523
|
+
|
524
|
+
#this does not filter SNPs, other than to check they are in the right region and are allowed substitutions. No quality control is done here; it is assumed to have been done beforehand
|
525
|
+
#text file is of format chr\tpos\tref\talt\tfreq\n
|
526
|
+
def get_snp_positions_from_text(options={})
|
527
|
+
arr = []
|
528
|
+
opts = @opts[:variant_call].merge(options)
|
529
|
+
@file.each do |line|
|
530
|
+
chr,pos,ref,alt,freq = line.chomp.split("\t")
|
531
|
+
pos = pos.to_i
|
532
|
+
freq = freq.to_f
|
533
|
+
next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
|
534
|
+
arr << [pos, freq]
|
535
|
+
end
|
536
|
+
@snp_positions = arr
|
537
|
+
end
|
538
|
+
|
539
|
+
private
|
540
|
+
def get_snp_positions_from_pileup(options={})
|
541
|
+
arr = []
|
542
|
+
opts = @opts[:variant_call].merge(options)
|
543
|
+
@file.each do |line|
|
544
|
+
pileup = Bio::DB::Pileup.new(line)
|
545
|
+
if pileup.ref_name != @opts[:chromosome] or pileup.pos < @opts[:start] or pileup.pos > @opts[:stop]
|
546
|
+
next
|
547
|
+
end
|
548
|
+
#old fashioned 10 col pileup format has extra fields we can use if needed
|
549
|
+
if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
|
550
|
+
arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
|
551
|
+
end
|
552
|
+
end
|
553
|
+
@snp_positions = arr
|
554
|
+
end
|
555
|
+
|
467
556
|
private
|
468
557
|
#Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
|
469
558
|
def get_insert_size_frequency_from_bam(opts={})
|
@@ -815,7 +904,7 @@ class Gngm
|
|
815
904
|
@peak_indices = nil #needs resetting as we are working with new cluster
|
816
905
|
@peak_y_values = nil #needs resetting as we are working with new cluster
|
817
906
|
self.calculate_densities(options[:adjust])
|
818
|
-
@clusters = Array.new
|
907
|
+
@clusters = Array.new(@densities.length) {|x| 1 + x}
|
819
908
|
##now set the cluster colours..
|
820
909
|
colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
|
821
910
|
ci = 0
|
@@ -916,7 +1005,7 @@ class Gngm
|
|
916
1005
|
r.quit
|
917
1006
|
end
|
918
1007
|
|
919
|
-
private
|
1008
|
+
#private
|
920
1009
|
#Calculates the position of peaks in the signal curve
|
921
1010
|
def get_peaks(opts=@opts[:peaks])
|
922
1011
|
opts[:background] = opts[:background].to_s.upcase
|
@@ -1023,7 +1112,29 @@ class Gngm
|
|
1023
1112
|
return r
|
1024
1113
|
end
|
1025
1114
|
|
1115
|
+
private
|
1116
|
+
#returns an array of arrays of known variants
|
1117
|
+
#file:
|
1118
|
+
#chr1 500 A G
|
1119
|
+
#chr2 1000 ATG TTA
|
1120
|
+
#chr3 1500 . TTGGA
|
1121
|
+
# returns [["chr1", "500", "A", "G"], ["chr2", "1000", "ATG", "TTA"], ["chr3", "1500", ".", "TTGGA"]]
|
1122
|
+
def parse_known_variants(file)
|
1123
|
+
File.open(file, "r").readlines.collect {|x| x.chomp.split("\t")}
|
1124
|
+
end
|
1026
1125
|
|
1126
|
+
public
|
1127
|
+
#Deletes everything from self.snp_positions not mentioned by position in self.known_variants. Directly modifies self.snp_positions
|
1128
|
+
def keep_known_variants(file=nil)
|
1129
|
+
raise "file of known variants not provided and @known_variants is nil" if @known_variants.nil? and file.nil?
|
1130
|
+
@known_variants = parse_known_variants(file) if @known_variants.nil? and file
|
1131
|
+
@snp_positions.each do |snp|
|
1132
|
+
end
|
1133
|
+
end
|
1134
|
+
|
1027
1135
|
end
|
1028
1136
|
end
|
1029
1137
|
end
|
1138
|
+
|
1139
|
+
|
1140
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'bio'
|
2
|
+
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
class Util
|
6
|
+
def self.read_gff3(fn)
|
7
|
+
genearray=Array.new
|
8
|
+
mRNAhash=Hash.new
|
9
|
+
exonhash=Hash.new
|
10
|
+
tehash=Hash.new
|
11
|
+
lastid = ''
|
12
|
+
lastrecord = nil
|
13
|
+
gff3 = Bio::GFF::GFF3.new(File.read(fn))
|
14
|
+
gff3.records.each do | record |
|
15
|
+
feature_type = record.feature_type
|
16
|
+
if(feature_type == 'gene')
|
17
|
+
genearray << record.id
|
18
|
+
elsif(feature_type == 'mRNA')
|
19
|
+
parent = record.get_attribute('Parent')
|
20
|
+
if mRNAhash[parent] == nil
|
21
|
+
mRNAhash[parent] = [record]
|
22
|
+
else
|
23
|
+
mRNAhash[parent] << record
|
24
|
+
end
|
25
|
+
elsif(feature_type == 'transposable_element')
|
26
|
+
#--- not yet implemented
|
27
|
+
elsif(feature_type == 'exon')
|
28
|
+
parents = record.get_attributes('Parent')
|
29
|
+
parents.each do |parent|
|
30
|
+
if exonhash[parent] == nil
|
31
|
+
exonhash[parent] = Array.new
|
32
|
+
end
|
33
|
+
exonhash[parent] << record
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -14,36 +14,48 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
14
14
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
15
|
require 'bio-gngm'
|
16
16
|
require 'bio'
|
17
|
+
require 'pp'
|
17
18
|
length = 0
|
18
19
|
chr_name = ""
|
19
|
-
file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
|
20
|
-
file.each do |entry|
|
21
|
-
length = entry.length
|
22
|
-
chr_name = entry.entry_id
|
23
|
-
end
|
20
|
+
#file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
|
21
|
+
#file.each do |entry|
|
22
|
+
# length = entry.length
|
23
|
+
# chr_name = entry.entry_id
|
24
|
+
#end
|
24
25
|
|
25
26
|
|
26
|
-
region = "
|
27
|
+
region = "1:2000000-3000000"
|
27
28
|
|
28
29
|
puts "analyzing - #{region}"
|
29
30
|
|
30
|
-
g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/
|
31
|
+
g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/insertion_finding/athal/aln.sort.bam",
|
31
32
|
:format => :bam,
|
32
|
-
:fasta => "/Users/macleand/Desktop/
|
33
|
-
:samtools => {:q =>
|
33
|
+
:fasta => "/Users/macleand/Desktop/insertion_finding/athal/chr1.fa",
|
34
|
+
:samtools => {:q => 50, :Q => 13, :r => region
|
34
35
|
}
|
35
36
|
)
|
36
37
|
|
37
|
-
g.get_unmapped_mate_frequency(:ref_window_size =>
|
38
|
-
|
39
|
-
|
38
|
+
g.get_unmapped_mate_frequency(:ref_window_size => 152, :ref_window_slide => 10)
|
39
|
+
|
40
|
+
|
41
|
+
|
42
|
+
g.collect_threads(:start => 0.0, :stop => 1.0, :slide => 0.1, :size => 0.1)
|
43
|
+
pp g.threads
|
44
|
+
g.threads.delete_if {|x| x.last.length <= 3 }
|
40
45
|
|
41
46
|
begin
|
42
|
-
g.calculate_clusters(:pseudo => true)
|
43
|
-
|
44
|
-
|
47
|
+
#g.calculate_clusters(:pseudo => true)
|
48
|
+
g.calculate_clusters(:k => 4, :adjust => 0.5, :control_chd => 0.0, :expected_chd => 0.3, :pseudo => false)
|
49
|
+
filename = "deletion_real_data#{region}_all_threads.png"
|
50
|
+
g.draw_threads(filename, :draw_legend => "deletion_real_data#{region}_legend.png")
|
45
51
|
##no bands or signal to draw without clustering...
|
46
|
-
filename = "
|
52
|
+
filename = "deletion_real_data#{region}_bands.png"
|
53
|
+
g.draw_bands(filename)
|
54
|
+
filename = "deletion_real_data#{region}_signal.png"
|
55
|
+
g.draw_signal(filename)
|
56
|
+
|
57
|
+
|
58
|
+
filename = "deletion_real_data_#{region}_hits.png"
|
47
59
|
g.draw_hit_count(filename)
|
48
60
|
rescue Exception => e
|
49
61
|
puts e.message, e.backtrace
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-gngm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bio
|
16
|
-
requirement: &
|
16
|
+
requirement: &70148237802860 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.4.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70148237802860
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bio-samtools
|
27
|
-
requirement: &
|
27
|
+
requirement: &70148237802000 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.5.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70148237802000
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rinruby
|
38
|
-
requirement: &
|
38
|
+
requirement: &70148237801440 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 2.0.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70148237801440
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: shoulda
|
49
|
-
requirement: &
|
49
|
+
requirement: &70148237800620 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70148237800620
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: bundler
|
60
|
-
requirement: &
|
60
|
+
requirement: &70148237793360 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70148237793360
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70148237791880 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70148237791880
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rcov
|
82
|
-
requirement: &
|
82
|
+
requirement: &70148237789480 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70148237789480
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: bio
|
93
|
-
requirement: &
|
93
|
+
requirement: &70148237788820 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: 1.4.2
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70148237788820
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: bio-samtools
|
104
|
-
requirement: &
|
104
|
+
requirement: &70148237788260 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: 0.5.0
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70148237788260
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: rinruby
|
115
|
-
requirement: &
|
115
|
+
requirement: &70148237787540 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: 2.0.2
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70148237787540
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: rdoc
|
126
|
-
requirement: &
|
126
|
+
requirement: &70148237787000 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,7 +131,7 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70148237787000
|
135
135
|
description: Identify causative mutations in a model genome from NGS reads using the
|
136
136
|
NGM method.
|
137
137
|
email: maclean.daniel@gmail.com
|
@@ -213,6 +213,7 @@ files:
|
|
213
213
|
- examples/use_indels.rb
|
214
214
|
- lib/bio-gngm.rb
|
215
215
|
- lib/bio/util/bio-gngm.rb
|
216
|
+
- lib/bio/util/mutation_effects.rb
|
216
217
|
- scripts/get_subseq.rb
|
217
218
|
- scripts/make_histograms_laerfyve.rb
|
218
219
|
- scripts/make_histograms_laerfyve_stitched.rb
|
@@ -260,7 +261,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
260
261
|
version: '0'
|
261
262
|
segments:
|
262
263
|
- 0
|
263
|
-
hash:
|
264
|
+
hash: 729394422036910323
|
264
265
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
265
266
|
none: false
|
266
267
|
requirements:
|