bio-samtools 0.6.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -2
- data/README.md +4 -7
- data/VERSION +1 -1
- data/bio-samtools.gemspec +47 -105
- data/doc/Bio.html +68 -131
- data/doc/Bio/DB.html +51 -111
- data/doc/Bio/DB/Alignment.html +135 -363
- data/doc/Bio/DB/Pileup.html +183 -170
- data/doc/Bio/DB/SAM.html +1396 -820
- data/doc/Bio/DB/SAM/Library.html +73 -123
- data/doc/Bio/DB/SAM/Tools.html +51 -273
- data/doc/Bio/DB/Tag.html +78 -124
- data/doc/Bio/DB/Vcf.html +111 -147
- data/doc/LICENSE_txt.html +113 -148
- data/doc/created.rid +9 -10
- data/doc/fonts.css +167 -0
- data/doc/fonts/Lato-Light.ttf +0 -0
- data/doc/fonts/Lato-LightItalic.ttf +0 -0
- data/doc/fonts/Lato-Regular.ttf +0 -0
- data/doc/fonts/Lato-RegularItalic.ttf +0 -0
- data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
- data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
- data/doc/images/add.png +0 -0
- data/doc/images/arrow_up.png +0 -0
- data/doc/images/delete.png +0 -0
- data/doc/images/tag_blue.png +0 -0
- data/doc/index.html +48 -54
- data/doc/js/darkfish.js +9 -22
- data/doc/js/search.js +20 -5
- data/doc/js/search_index.js +1 -1
- data/doc/rdoc.css +255 -218
- data/doc/table_of_contents.html +256 -137
- data/ext/Rakefile +57 -0
- data/lib/bio-samtools.rb +7 -2
- data/lib/bio/BIOExtensions.rb +89 -0
- data/lib/bio/db/alignment.rb +59 -0
- data/lib/bio/db/fastadb.rb +255 -0
- data/lib/bio/db/pileup.rb +221 -172
- data/lib/bio/db/sam.rb +639 -589
- data/lib/bio/db/sam/{faidx.rb → faidx_old.rb} +0 -0
- data/lib/bio/db/vcf.rb +69 -68
- data/test/.gitignore +1 -0
- data/test/{test_basic.rb → old_test_basic.rb} +33 -1
- data/test/samples/small/dupes.bam +0 -0
- data/test/samples/small/dupes.sam +274 -0
- data/test/samples/small/map_for_reheader.sam +8 -0
- data/test/samples/small/map_to_merge1.bam +0 -0
- data/test/samples/small/map_to_merge1.bam.bai +0 -0
- data/test/samples/small/map_to_merge1.sam +8 -0
- data/test/samples/small/map_to_merge2.bam +0 -0
- data/test/samples/small/map_to_merge2.bam.bai +0 -0
- data/test/samples/small/map_to_merge2.sam +8 -0
- data/test/samples/small/no_md.sam +8 -0
- data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
- data/test/samples/small/test_cov.svg +273 -0
- data/test/samples/small/testu.bam.bai +0 -0
- data/test/svg +133 -0
- data/test/test_pileup.rb +84 -0
- data/test/test_sam.rb +331 -0
- data/test/test_vcf.rb +11 -0
- data/{doc → tutorial}/tutorial.html +0 -0
- data/{doc → tutorial}/tutorial.pdf +0 -0
- metadata +56 -114
- data/doc/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -159
- data/doc/Bio/DB/SAM/Tools/Bam1T.html +0 -220
- data/doc/Bio/DB/SAM/Tools/BamHeaderT.html +0 -249
- data/doc/Bio/DB/SAM/Tools/BamPileup1T.html +0 -159
- data/doc/Bio/DB/SAM/Tools/SamfileT.html +0 -171
- data/doc/Bio/DB/SAM/Tools/SamfileTX.html +0 -159
- data/doc/Bio/DB/SAMException.html +0 -205
- data/doc/LibC.html +0 -155
- data/doc/Pileup.html +0 -571
- data/doc/Vcf.html +0 -473
- data/doc/basic_styles.css +0 -31
- data/doc/classes/Bio.html +0 -139
- data/doc/classes/Bio/DB.html +0 -137
- data/doc/classes/Bio/DB/Alignment.html +0 -441
- data/doc/classes/Bio/DB/Alignment.src/M000012.html +0 -19
- data/doc/classes/Bio/DB/Alignment.src/M000013.html +0 -27
- data/doc/classes/Bio/DB/Alignment.src/M000014.html +0 -45
- data/doc/classes/Bio/DB/Alignment.src/M000015.html +0 -40
- data/doc/classes/Bio/DB/SAM.html +0 -510
- data/doc/classes/Bio/DB/SAM/Library.html +0 -135
- data/doc/classes/Bio/DB/SAM/Library.src/M000006.html +0 -28
- data/doc/classes/Bio/DB/SAM/Tools.html +0 -278
- data/doc/classes/Bio/DB/SAM/Tools.src/M000007.html +0 -20
- data/doc/classes/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -111
- data/doc/classes/Bio/DB/SAM/Tools/Bam1T.html +0 -150
- data/doc/classes/Bio/DB/SAM/Tools/Bam1T.src/M000010.html +0 -20
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.html +0 -169
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000008.html +0 -19
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000009.html +0 -18
- data/doc/classes/Bio/DB/SAM/Tools/BamPileup1T.html +0 -111
- data/doc/classes/Bio/DB/SAM/Tools/SamfileT.html +0 -129
- data/doc/classes/Bio/DB/SAM/Tools/SamfileTX.html +0 -111
- data/doc/classes/Bio/DB/SAMException.html +0 -140
- data/doc/classes/Bio/DB/SAMException.src/M000016.html +0 -18
- data/doc/classes/Bio/DB/Sam.src/M000017.html +0 -43
- data/doc/classes/Bio/DB/Sam.src/M000018.html +0 -42
- data/doc/classes/Bio/DB/Sam.src/M000019.html +0 -18
- data/doc/classes/Bio/DB/Sam.src/M000020.html +0 -22
- data/doc/classes/Bio/DB/Sam.src/M000021.html +0 -19
- data/doc/classes/Bio/DB/Sam.src/M000022.html +0 -25
- data/doc/classes/Bio/DB/Sam.src/M000023.html +0 -28
- data/doc/classes/Bio/DB/Sam.src/M000024.html +0 -28
- data/doc/classes/Bio/DB/Sam.src/M000025.html +0 -46
- data/doc/classes/Bio/DB/Sam.src/M000026.html +0 -24
- data/doc/classes/Bio/DB/Sam.src/M000027.html +0 -19
- data/doc/classes/Bio/DB/Sam.src/M000028.html +0 -24
- data/doc/classes/Bio/DB/Sam.src/M000029.html +0 -41
- data/doc/classes/Bio/DB/Sam.src/M000030.html +0 -31
- data/doc/classes/Bio/DB/Sam.src/M000031.html +0 -86
- data/doc/classes/Bio/DB/Sam.src/M000032.html +0 -34
- data/doc/classes/Bio/DB/Tag.html +0 -160
- data/doc/classes/Bio/DB/Tag.src/M000011.html +0 -21
- data/doc/classes/LibC.html +0 -105
- data/doc/classes/Pileup.html +0 -374
- data/doc/classes/Pileup.src/M000001.html +0 -34
- data/doc/classes/Pileup.src/M000002.html +0 -21
- data/doc/classes/Pileup.src/M000003.html +0 -21
- data/doc/classes/Pileup.src/M000004.html +0 -21
- data/doc/classes/Pileup.src/M000005.html +0 -31
- data/doc/files/lib/bio-samtools_rb.html +0 -109
- data/doc/files/lib/bio/db/sam/bam_rb.html +0 -108
- data/doc/files/lib/bio/db/sam/faidx_rb.html +0 -108
- data/doc/files/lib/bio/db/sam/library_rb.html +0 -101
- data/doc/files/lib/bio/db/sam/pileup_rb.html +0 -178
- data/doc/files/lib/bio/db/sam/sam_rb.html +0 -113
- data/doc/files/lib/bio/db/sam_rb.html +0 -111
- data/doc/fr_class_index.html +0 -43
- data/doc/fr_file_index.html +0 -33
- data/doc/fr_method_index.html +0 -58
- data/doc/lib/bio-samtools_rb.html +0 -115
- data/doc/lib/bio/db/pileup_rb.html +0 -171
- data/doc/lib/bio/db/sam/bam_rb.html +0 -121
- data/doc/lib/bio/db/sam/faidx_rb.html +0 -117
- data/doc/lib/bio/db/sam/library_rb.html +0 -115
- data/doc/lib/bio/db/sam/pileup_rb.html +0 -171
- data/doc/lib/bio/db/sam/sam_rb.html +0 -121
- data/doc/lib/bio/db/sam/vcf_rb.html +0 -124
- data/doc/lib/bio/db/sam_rb.html +0 -115
- data/doc/lib/bio/db/vcf_rb.html +0 -124
- data/doc/rdoc-style.css +0 -208
- data/lib/bio/db/sam/bam.rb +0 -210
- data/lib/bio/db/sam/sam.rb +0 -86
- data/test/samples/pipe_char/test.bam +0 -0
- data/test/samples/pipe_char/test.bam.bai +0 -0
- data/test/samples/pipe_char/test.tam +0 -10
- data/test/samples/pipe_char/test_chr.fasta +0 -1000
- data/test/samples/pipe_char/test_chr.fasta.fai +0 -1
- data/test/samples/small/test +0 -0
- data/test/samples/small/test.bam +0 -0
- data/test/samples/small/test.fa +0 -20
- data/test/samples/small/test.fai +0 -0
data/lib/bio-samtools.rb
CHANGED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#Extensions to Bio::NucleicAcid for working with IUPAC nucleotide ambiguity codes.
class Bio::NucleicAcid

  #Two-way lookup table: a single-letter code maps to the (sorted, lowercase)
  #bases it stands for, and a sorted lowercase base set maps back to its code.
  IUPAC_CODES = {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }.freeze

  #Tells whether +base+ occurs literally in "acgtACGT".
  #Returns a MatchData (truthy) when it does and nil otherwise.
  #The base is escaped before matching: String#match would otherwise compile
  #it as a regular expression, so "." matched every base and characters such
  #as "*" or "+" raised a RegexpError.
  def self.is_unambiguous(base)
    "acgtACGT".match(Regexp.escape(base))
  end

  #Collapses a set of bases (any case, any order, duplicates allowed) into the
  #matching upper-case IUPAC code.
  #When the set is not in IUPAC_CODES a warning naming the offending input is
  #written to stderr and "N" is returned.
  def self.to_IUAPC(bases)
    key = bases.to_s.downcase.chars.sort.uniq.join
    code = IUPAC_CODES[key]
    if code.nil?
      #Report the input that failed; the looked-up value is always nil here.
      warn "Invalid base! #{bases}"
      code = 'n' #This is a patch... as one of the scripts failed here.
    end
    code.upcase
  end

  #Returns true when +base+ is one of the characters IUPAC_CODES lists for
  #+code+ (comparison is case-insensitive). Codes missing from the table
  #are reported as not valid instead of raising NoMethodError.
  def self.is_valid(code, base)
    expansion = IUPAC_CODES[code.downcase]
    return false if expansion.nil?
    expansion.chars.include?(base.downcase)
  end

end
|
|
59
|
+
|
|
60
|
+
#Monkey patching to Bio::Sequence to find snps between sequences. It assumes the
|
|
61
|
+
#sequences are already aligned and doesn't check if a base on the first sequence is
|
|
62
|
+
#valid on the second.
|
|
63
|
+
class Bio::Sequence
  #Counts the positions, over the length of seq1, at which seq1 and seq2 hold
  #different values. Positions are compared index by index with no alignment
  #step, so both arguments are expected to be aligned already.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |idx| seq1[idx] != seq2[idx] }
  end
end
|
|
72
|
+
|
|
73
|
+
class String
  #Monkey patching to count how many ambiguity codes are present in the string, for Nucleic Acids.
  #A character counts as ambiguous when Bio::NucleicAcid.is_unambiguous returns a falsy value for it.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  #Counts how many bases are uppercase (ASCII A-Z), wherever they occur in the string.
  #The previous implementation returned the length of the leading run of
  #non-uppercase characters instead, e.g. 0 for "ACGT".
  def upper_case_count
    count('A-Z')
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
module Bio
  class DB

    #One optional SAM field, which SAM writes as TAG:VTYPE:VALUE
    class Tag
      attr_accessor :tag, :type, :value

      #Fills the three attributes from a raw "TAG:VTYPE:VALUE" string using
      #fixed offsets: characters 0-1, character 3 and everything from 5 on.
      def set(str)
        @tag = str[0, 2]
        @type = str[3]
        @value = str[5..-1]
      end
    end

    #Attributes from the flag field (see chapter 2.2.2 of the sam file documentation)
    #query_strand and mate_strand are true if they are forward. It is the opposite to
    #the definition in the BAM format for clarity.
    #primary is the negation of is_negative from the BAM format
    class Alignment
      attr_accessor :qname, :flag, :rname, :pos, :mapq, :cigar, :mrnm, :mpos,
                    :isize, :seq, :qual, :tags, :al, :samstr, :calend, :qlen

      attr_accessor :is_paired, :is_mapped, :query_unmapped, :mate_unmapped,
                    :query_strand, :mate_strand, :first_in_pair, :second_in_pair,
                    :primary, :failed_quality, :is_duplicate

      #Splits one tab-separated SAM line into the eleven leading columns, turns
      #every remaining column into a Tag keyed by its two-letter name, and
      #decodes the individual bits of the flag column into boolean attributes.
      def initialize(sam_string)
        columns = sam_string.chomp.split("\t")
        @qname, raw_flag, @rname, raw_pos, raw_mapq, @cigar,
          @mrnm, raw_mpos, raw_isize, @seq, @qual = columns

        @flag  = raw_flag.to_i
        @pos   = raw_pos.to_i
        @mapq  = raw_mapq.to_i
        @mpos  = raw_mpos.to_i
        @isize = raw_isize.to_i

        #Columns 12 and beyond are the optional TAG:VTYPE:VALUE fields
        @tags = {}
        columns.drop(11).each do |raw_tag|
          parsed = Bio::DB::Tag.new
          parsed.set(raw_tag)
          @tags[parsed.tag] = parsed
        end

        bit_set = lambda { |mask| (@flag & mask) > 0 }

        @is_paired      = bit_set.call(0x0001)
        @is_mapped      = bit_set.call(0x0002)
        @query_unmapped = bit_set.call(0x0004)
        @mate_unmapped  = bit_set.call(0x0008)
        #The strand and primary attributes are stored inverted with respect to their bits
        @query_strand   = !bit_set.call(0x0010)
        @mate_strand    = !bit_set.call(0x0020)
        @first_in_pair  = bit_set.call(0x0040)
        @second_in_pair = bit_set.call(0x0080)
        @primary        = !bit_set.call(0x0100)
        @failed_quality = bit_set.call(0x0200)
        @is_duplicate   = bit_set.call(0x0400)
      end
    end
  end
end
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
#Module to hold the information about the fasta file
|
|
2
|
+
|
|
3
|
+
module Bio::DB::Fasta
|
|
4
|
+
#This class contains the entries in a fasta, as generated by samtools faidx
|
|
5
|
+
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  #Appends an entry and registers it under its id for lookup by name.
  #This doesnt validate if you are adding the same entry twice; a repeated id
  #stays in the list twice and the name lookup keeps the last one added.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  #Yields every entry in insertion order. All Enumerable methods are built on
  #this; it previously called Array#entries, which ignores the block, so
  #nothing was ever yielded.
  def each(&block)
    @entries.each(&block)
  end

  #Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  #Returns a new Index just with the specified range, as if it was an Array.
  #The return object is of type Index. A single position gives an Index with
  #one entry and a position outside the list gives an empty Index.
  def [](args)
    picked = @entries[args]
    picked = [] if picked.nil?
    picked = [picked] unless picked.is_a?(Array)
    picked.each_with_object(Index.new) { |entry, subset| subset << entry }
  end

  #Returns the entry registered under the given sequence name, or nil when
  #the name is unknown.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
|
|
46
|
+
|
|
47
|
+
#A single sequence known to the index: its name and its length.
class Entry
  attr_reader :id, :length

  #The length is converted with to_i before it is stored.
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  #Builds a forward Region for this entry that starts at 1 and ends at the
  #entry length.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 1
      whole.end = @length
      whole.orientation = :forward
    end
  end

  alias_method :to_region, :get_full_region

end
|
|
67
|
+
|
|
68
|
+
#Class to wrap a region of a chromosome
|
|
69
|
+
class Region
  #Template of per-base counters; it is copied, never shared, when used as a default.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
  attr_accessor :entry, :start, :end, :orientation
  attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases, :total_cov, :called

  #Returns, for every position of the region, the ratio stored in base_ratios
  #for the given base. The list is computed once per base and cached.
  #(The loop bound used an undefined local `region`, which raised NameError.)
  def base_ratios_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0...self.size).map { |i| @base_ratios[i][base] }
  end

  #Calculates the concensus, base ratios, coverages and total coverages in the region
  #* min_cov minimum coverage to make a call (default 0); a position is called
  #  only when its coverage is strictly greater than this value
  #* min_per minimum representation to make make a call. If more than one base
  #  can be called, the IUAPC ambiguity code is returned
  #Requires reference and pileup to be set. Returns self.
  def calculate_stats_from_pile(opts={})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    reference = self.reference.downcase

    #Every position gets its own zero hash so that changing one position
    #can never leak into the others.
    self.base_ratios = Array.new(self.size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(self.size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(self.size, 0)
    self.total_cov = 0
    #Cached per-base ratio lists belong to the previous statistics
    @all_ratios = nil

    self.pileup.each do |pile|
      if pile.coverage > min_cov
        offset = pile.pos - self.start
        self.base_ratios[offset] = pile.base_ratios
        #Called positions are written in upper case over the lower-cased reference
        reference[offset] = pile.consensus_iuap(min_per).upcase
        self.coverages[offset] = pile.coverage.to_i
        self.bases[offset] = pile.bases
        self.called += 1
      end
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(reference)
    self.consensus.na
    self.consensus.reverse_complement! if self.orientation == :reverse
    self.average_coverage = self.total_cov.to_f / self.size.to_f
    self
  end

  #Formats the region as "name:start-end"
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  #Returns a region object from a string in form "name:start-end".
  #Single quotes in the string are dropped. The orientation is :reverse when
  #end is smaller than start and :forward otherwise.
  #Raises FastaDBException when the string does not have exactly one ":" part
  #and one "-" part. The check now happens before the coordinates are read,
  #so a string without ":" raises this error rather than NoMethodError.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    fields_2 = fields_1.length == 2 ? fields_1[1].split("-") : []
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2 || fields_2.length != 2

    reg = Region.new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  #Length of the region, computed as end minus start.
  #NOTE(review): with inclusive coordinates this is one less than the number
  #of bases covered, and it is negative for reverse regions - confirm what
  #callers expect before changing it.
  def size
    @end - @start
  end
  alias_method :length, :size

end
|
|
156
|
+
|
|
157
|
+
#Error raised by this module for an invalid region string and for a missing reference fasta path
class FastaDBException < StandardError; end
|
|
158
|
+
|
|
159
|
+
#Class that holds the fasta file. It is used as a database.
|
|
160
|
+
class FastaFile
  attr_reader :index, :fasta_path

  #Initialize the fasta file. If the fai file doesn't exists, it is generated at startup
  #* fasta path to the fasta file
  #* samtools path to samtools, if it is not provided, use the bundled version
  #Raises FastaDBException when no fasta path is given.
  def initialize(args)
    @fasta_path = args[:fasta]
    @samtools = args[:samtools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    unless File.file?(@fai_file)
      command = "#{@samtools} faidx '#{@fasta_path}'"
      @last_command = command
      system(command)
    end
  end

  #Loads the fai entries into the index (once) and returns how many there are.
  #The index is only stored after the whole file was read, and the file
  #handle is closed as soon as reading finishes.
  def load_fai_entries
    return @index.length if @index
    loaded = Index.new
    File.foreach(@fai_file) do |line|
      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1])
    end
    @index = loaded
    @index.length
  end

  #Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk. If a region is specified, the subsequence is retrieved and returned as a Bio::Sequence::NA.
  #Options - if a subsequence is required
  #* chr - [STRING] the reference name of the subsequence
  #* start - [INT] the start position for the subsequence
  #* stop - [INT] the stop position for the subsequence
  #Without a full region the return value is the result of Kernel#system.
  def faidx(opts={})
    if opts.has_key?(:chr) && opts.has_key?(:start) && opts.has_key?(:stop)
      #The option values are used to build the region; this branch used to
      #pass the literal symbols to a fetch_reference method this class lacks.
      region = Region.new
      region.entry = opts[:chr]
      region.start = opts[:start].to_i
      region.end = opts[:stop].to_i
      region.orientation = region.end < region.start ? :reverse : :forward
      fetch_sequence(region)
    else
      command = "#{@samtools} faidx '#{@fasta_path}'"
      @last_command = command
      system(command)
    end
  end

  #The region needs to have a method to_region or a method to_s that has the format "chromosome:start-end" as in samtools.
  #Returns the sequence as a Bio::Sequence::NA, reverse complemented when the
  #region reports a :reverse orientation. Objects without an orientation
  #(for example a plain String) are treated as forward.
  #NOTE(review): the query is interpolated into a shell command inside single
  #quotes; a region or path containing a single quote will break the command.
  def fetch_sequence(region)
    target = region.respond_to?(:to_region) ? region.to_region : region
    query = target.to_s
    command = "#{@samtools} faidx '#{@fasta_path}' '#{query}'"
    puts command if $VERBOSE
    @last_command = command

    chunks = []
    #Header lines (starting with ">") are dropped, the rest is the sequence
    yield_from_pipe(command, String, :text) { |line| chunks << line unless line =~ /^>/ }

    reference = Bio::Sequence::NA.new(chunks.join)
    if target.respond_to?(:orientation) && target.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private
  #Runs the command and yields its standard output to the block: one
  #klass.new(line) per chomped line for :text (lines starting with
  #comment_char are skipped when skip_comments is set), or the raw content
  #for :binary.
  #Returns Process::Status with the execution status. If run in a $VERBOSE environment, stderr of the process
  #is forwarded to the default stdout.
  #stderr is drained on a separate thread so a process that writes a lot to
  #it cannot block, and the block form of popen3 closes all pipes even when
  #the caller's block raises.
  def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (c = pipe.gets(nil))
          yield c
        end
      end
      captured = stderr_reader.value
      puts captured if $VERBOSE
      wait_thr.value # Process::Status object returned.
    end
  end
end
|
|
255
|
+
end
|
data/lib/bio/db/pileup.rb
CHANGED
|
@@ -26,185 +26,234 @@
|
|
|
26
26
|
#
|
|
27
27
|
module Bio
|
|
28
28
|
class DB
|
|
29
|
-
class Pileup
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
29
|
+
class Pileup
  attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
  #The reader for consensus is defined below, because it computes a value when none was parsed
  attr_writer :consensus

  #creates the Pileup object
  #    pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
  #    pile = Bio::DB::Pileup.new(pile_up_line)
  #Lines with 6 columns (mpileup) and with 10 to 13 columns (deprecated pileup
  #with -c) are understood; any other line leaves the text attributes unset.
  def initialize(pile_up_line)
    cols = pile_up_line.split(/\t/)
    if cols.length == 6 ##should only be able to get 6 lines from mpileup
      @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
    elsif (10..13).include?(cols.length) ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
      if cols[2] == '*' #indel
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
      else #snp / identity
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
      end
      @consensus_quality = @consensus_quality.to_f
      @snp_quality = @snp_quality.to_f
      @rms_mapq = @rms_mapq.to_f
    end
    #Unparseable lines are deliberately not rejected here (no exception is raised)

    @pos = @pos.to_i
    @coverage = @coverage.to_f
    @ref_count = nil
    @non_ref_count_hash = nil
    @non_ref_count = nil
  end

  #Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts
  #returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
  def non_refs
    if @non_ref_count_hash.nil?
      @non_ref_count_hash = {:A => self.read_bases.count("Aa"), :C => self.read_bases.count("Cc"), :G => self.read_bases.count("Gg"), :T => self.read_bases.count("Tt")}
    end
    @non_ref_count_hash
  end

  # returns the total non-reference bases in the reads at this position (as a Float)
  def non_ref_count
    if @non_ref_count.nil?
      @non_ref_count = @read_bases.count("ATGCatgc").to_f
    end
    @non_ref_count
  end

  # returns the count of reference-bases in the reads at this position
  def ref_count
    if @ref_count.nil?
      @ref_count = self.read_bases.count(".,")
    end
    @ref_count
  end

  # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
  # A consensus parsed from the input line (10-13 column format) is returned as is.
  def consensus
    if @consensus.nil?
      max = self.non_refs.values.max
      if (self.ref_count / self.coverage) > 0.5
        @consensus = self.ref_base
      elsif self.ref_count > max
        @consensus = self.ref_base
      else
        tied = self.non_refs.select { |k, v| v == max }.map { |pair| pair[0].to_s }
        tied << self.ref_base if self.ref_count == max
        @consensus = tied.sort.join
      end
    end
    @consensus
  end

  #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
  def to_vcf
    alt, g = self.genotype_list
    alt = self.consensus.split(//).join(',') unless self.ref_base == '*'
    alt = '.' if alt == self.ref_base
    [self.ref_name, self.pos, '.', self.ref_base, alt, self.snp_quality.to_i, "0", "DP=#{self.coverage.to_i}", "GT:GQ:DP", "#{g}:#{self.consensus_quality.to_i}:#{self.coverage.to_i}" ].join("\t")
  end

  #returns the VCF header lines matching to_vcf, joined with newlines.
  #(The previous body called join on a String and raised NoMethodError.)
  def Pileup.vcf_header
    [
      "##fileformat=VCFv3.3",
      "##INFO=DP,1,Integer,\"Total Depth\"",
      "##FORMAT=GT,1,String,\"Genotype\"",
      "##FORMAT=GQ,1,Integer,\"Genotype Quality\"",
      "##FORMAT=DP,1,Integer,\"Read Depth\"",
      "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA"
    ].join("\n")
  end

  #identifies the reference base and returns the indel or snp genotype as applicable
  #The result is an [alt, genotype] pair (indel_gt may also return nil or "undef").
  def genotype_list
    if self.ref_base == '*'
      indel_gt
    else
      snp_gt
    end
  end

  #returns the two bases for the corresponding iupac code; any other string
  #is split into its characters
  def Pileup.iupac_to_base(alt_base)
    case alt_base
    when 'K' then ['G','T']
    when 'M' then ['A','C']
    when 'S' then ['C','G']
    when 'R' then ['A','G']
    when 'W' then ['A','T']
    when 'Y' then ['C','T']
    else alt_base.split(//)
    end
  end

  #identifies if the indel is an insertion or a deletion
  #"-XYZ" gives "D" plus the number of deleted bases, "+XYZ" gives "I" plus
  #the inserted bases, anything else (including '*') gives nil
  def parse_indel(alt)
    case alt
    when /^-/ then "D#{Regexp.last_match.post_match.length}"
    when /^\+/ then "I#{Regexp.last_match.post_match}"
    end
  end

  #returns pileup format line
  def to_s
    if @read_quals and !@consensus_quality #6col
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
    elsif @indel_1 #13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
    else #10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
    end
  end

  #returns the per-base counts including the reference base: the non-reference
  #counts plus ref_count stored under the upper-cased reference base.
  #The counts are kept in a copy, so the hash returned by non_refs (which
  #consensus relies on) is left untouched.
  def bases
    return @bases if @bases
    @bases = self.non_refs.dup
    @bases[self.ref_base.upcase.to_sym] = self.ref_count
    @bases
  end

  #returns the sum of all per-base counts
  def base_coverage
    self.bases.values.inject(0) { |total, count| total + count }
  end

  #returns a hash with, for each base, its count divided by base_coverage
  def base_ratios
    return @base_ratios if @base_ratios
    total = self.base_coverage.to_f
    @base_ratios = Hash.new
    self.bases.each do |k, v|
      @base_ratios[k] = v.to_f / total
    end
    @base_ratios
  end

  # returns the IUPAC code for all the bases whose share of the coverage is
  # above minumum_ratio_for_iup_consensus; when no base qualifies the
  # lower-cased reference base is returned.
  # Results are cached per threshold, so asking again with a different ratio
  # gives the answer for that ratio.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    @consensus_iuap ||= {}
    cached = @consensus_iuap[minumum_ratio_for_iup_consensus]
    return cached unless cached.nil?

    called = String.new
    self.bases.each do |k, v|
      called << k[0].to_s if v / self.coverage.to_f > minumum_ratio_for_iup_consensus
    end
    result = called.length > 0 ? Bio::NucleicAcid.to_IUAPC(called) : self.ref_base.downcase
    @consensus_iuap[minumum_ratio_for_iup_consensus] = result
  end

  private

  #returns the genotype of the indel as an [alt, genotype] pair,
  #nil when neither allele is an indel
  def indel_gt
    return "undef" if self.consensus.instance_of?(Array)
    al1, al2 = self.consensus.split(/\//)
    if al1 == al2 && al1 == '*'
      al1 = self.indel_1
      al2 = self.indel_2
    end
    alt1 = parse_indel(al1)
    alt2 = parse_indel(al2)

    return nil if !alt1 and !alt2
    if !alt1
      [alt2, '0/1']
    elsif !alt2
      #this branch used to read "gt - '0/1'" and raised NoMethodError on nil
      [alt1, '0/1']
    elsif alt1 == alt2
      [alt1, '1/1']
    else
      ["#{alt1},#{alt2}", '1/2']
    end
  end

  #returns the genotype of the snp as an [alt, genotype] pair
  def snp_gt
    return ['.','0/0'] if self.ref_base == self.consensus
    bases = Pileup.iupac_to_base(self.consensus)
    #a single non-reference base is a homozygous alternate call
    return [bases[0], '1/1'] if bases.length == 1
    if bases[0] == self.ref_base
      [bases[1], '0/1']
    elsif bases[1] == self.ref_base
      [bases[0], '0/1']
    else
      ["#{bases[0]},#{bases[1]}", '1/2']
    end
  end

end
|
|
205
|
-
|
|
206
258
|
end
|
|
207
|
-
|
|
208
|
-
end
|
|
209
|
-
end
|
|
210
259
|
end
|