bio-samtools 0.6.2 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -2
- data/README.md +4 -7
- data/VERSION +1 -1
- data/bio-samtools.gemspec +47 -105
- data/doc/Bio.html +68 -131
- data/doc/Bio/DB.html +51 -111
- data/doc/Bio/DB/Alignment.html +135 -363
- data/doc/Bio/DB/Pileup.html +183 -170
- data/doc/Bio/DB/SAM.html +1396 -820
- data/doc/Bio/DB/SAM/Library.html +73 -123
- data/doc/Bio/DB/SAM/Tools.html +51 -273
- data/doc/Bio/DB/Tag.html +78 -124
- data/doc/Bio/DB/Vcf.html +111 -147
- data/doc/LICENSE_txt.html +113 -148
- data/doc/created.rid +9 -10
- data/doc/fonts.css +167 -0
- data/doc/fonts/Lato-Light.ttf +0 -0
- data/doc/fonts/Lato-LightItalic.ttf +0 -0
- data/doc/fonts/Lato-Regular.ttf +0 -0
- data/doc/fonts/Lato-RegularItalic.ttf +0 -0
- data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
- data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
- data/doc/images/add.png +0 -0
- data/doc/images/arrow_up.png +0 -0
- data/doc/images/delete.png +0 -0
- data/doc/images/tag_blue.png +0 -0
- data/doc/index.html +48 -54
- data/doc/js/darkfish.js +9 -22
- data/doc/js/search.js +20 -5
- data/doc/js/search_index.js +1 -1
- data/doc/rdoc.css +255 -218
- data/doc/table_of_contents.html +256 -137
- data/ext/Rakefile +57 -0
- data/lib/bio-samtools.rb +7 -2
- data/lib/bio/BIOExtensions.rb +89 -0
- data/lib/bio/db/alignment.rb +59 -0
- data/lib/bio/db/fastadb.rb +255 -0
- data/lib/bio/db/pileup.rb +221 -172
- data/lib/bio/db/sam.rb +639 -589
- data/lib/bio/db/sam/{faidx.rb → faidx_old.rb} +0 -0
- data/lib/bio/db/vcf.rb +69 -68
- data/test/.gitignore +1 -0
- data/test/{test_basic.rb → old_test_basic.rb} +33 -1
- data/test/samples/small/dupes.bam +0 -0
- data/test/samples/small/dupes.sam +274 -0
- data/test/samples/small/map_for_reheader.sam +8 -0
- data/test/samples/small/map_to_merge1.bam +0 -0
- data/test/samples/small/map_to_merge1.bam.bai +0 -0
- data/test/samples/small/map_to_merge1.sam +8 -0
- data/test/samples/small/map_to_merge2.bam +0 -0
- data/test/samples/small/map_to_merge2.bam.bai +0 -0
- data/test/samples/small/map_to_merge2.sam +8 -0
- data/test/samples/small/no_md.sam +8 -0
- data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
- data/test/samples/small/test_cov.svg +273 -0
- data/test/samples/small/testu.bam.bai +0 -0
- data/test/svg +133 -0
- data/test/test_pileup.rb +84 -0
- data/test/test_sam.rb +331 -0
- data/test/test_vcf.rb +11 -0
- data/{doc → tutorial}/tutorial.html +0 -0
- data/{doc → tutorial}/tutorial.pdf +0 -0
- metadata +56 -114
- data/doc/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -159
- data/doc/Bio/DB/SAM/Tools/Bam1T.html +0 -220
- data/doc/Bio/DB/SAM/Tools/BamHeaderT.html +0 -249
- data/doc/Bio/DB/SAM/Tools/BamPileup1T.html +0 -159
- data/doc/Bio/DB/SAM/Tools/SamfileT.html +0 -171
- data/doc/Bio/DB/SAM/Tools/SamfileTX.html +0 -159
- data/doc/Bio/DB/SAMException.html +0 -205
- data/doc/LibC.html +0 -155
- data/doc/Pileup.html +0 -571
- data/doc/Vcf.html +0 -473
- data/doc/basic_styles.css +0 -31
- data/doc/classes/Bio.html +0 -139
- data/doc/classes/Bio/DB.html +0 -137
- data/doc/classes/Bio/DB/Alignment.html +0 -441
- data/doc/classes/Bio/DB/Alignment.src/M000012.html +0 -19
- data/doc/classes/Bio/DB/Alignment.src/M000013.html +0 -27
- data/doc/classes/Bio/DB/Alignment.src/M000014.html +0 -45
- data/doc/classes/Bio/DB/Alignment.src/M000015.html +0 -40
- data/doc/classes/Bio/DB/SAM.html +0 -510
- data/doc/classes/Bio/DB/SAM/Library.html +0 -135
- data/doc/classes/Bio/DB/SAM/Library.src/M000006.html +0 -28
- data/doc/classes/Bio/DB/SAM/Tools.html +0 -278
- data/doc/classes/Bio/DB/SAM/Tools.src/M000007.html +0 -20
- data/doc/classes/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -111
- data/doc/classes/Bio/DB/SAM/Tools/Bam1T.html +0 -150
- data/doc/classes/Bio/DB/SAM/Tools/Bam1T.src/M000010.html +0 -20
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.html +0 -169
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000008.html +0 -19
- data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000009.html +0 -18
- data/doc/classes/Bio/DB/SAM/Tools/BamPileup1T.html +0 -111
- data/doc/classes/Bio/DB/SAM/Tools/SamfileT.html +0 -129
- data/doc/classes/Bio/DB/SAM/Tools/SamfileTX.html +0 -111
- data/doc/classes/Bio/DB/SAMException.html +0 -140
- data/doc/classes/Bio/DB/SAMException.src/M000016.html +0 -18
- data/doc/classes/Bio/DB/Sam.src/M000017.html +0 -43
- data/doc/classes/Bio/DB/Sam.src/M000018.html +0 -42
- data/doc/classes/Bio/DB/Sam.src/M000019.html +0 -18
- data/doc/classes/Bio/DB/Sam.src/M000020.html +0 -22
- data/doc/classes/Bio/DB/Sam.src/M000021.html +0 -19
- data/doc/classes/Bio/DB/Sam.src/M000022.html +0 -25
- data/doc/classes/Bio/DB/Sam.src/M000023.html +0 -28
- data/doc/classes/Bio/DB/Sam.src/M000024.html +0 -28
- data/doc/classes/Bio/DB/Sam.src/M000025.html +0 -46
- data/doc/classes/Bio/DB/Sam.src/M000026.html +0 -24
- data/doc/classes/Bio/DB/Sam.src/M000027.html +0 -19
- data/doc/classes/Bio/DB/Sam.src/M000028.html +0 -24
- data/doc/classes/Bio/DB/Sam.src/M000029.html +0 -41
- data/doc/classes/Bio/DB/Sam.src/M000030.html +0 -31
- data/doc/classes/Bio/DB/Sam.src/M000031.html +0 -86
- data/doc/classes/Bio/DB/Sam.src/M000032.html +0 -34
- data/doc/classes/Bio/DB/Tag.html +0 -160
- data/doc/classes/Bio/DB/Tag.src/M000011.html +0 -21
- data/doc/classes/LibC.html +0 -105
- data/doc/classes/Pileup.html +0 -374
- data/doc/classes/Pileup.src/M000001.html +0 -34
- data/doc/classes/Pileup.src/M000002.html +0 -21
- data/doc/classes/Pileup.src/M000003.html +0 -21
- data/doc/classes/Pileup.src/M000004.html +0 -21
- data/doc/classes/Pileup.src/M000005.html +0 -31
- data/doc/files/lib/bio-samtools_rb.html +0 -109
- data/doc/files/lib/bio/db/sam/bam_rb.html +0 -108
- data/doc/files/lib/bio/db/sam/faidx_rb.html +0 -108
- data/doc/files/lib/bio/db/sam/library_rb.html +0 -101
- data/doc/files/lib/bio/db/sam/pileup_rb.html +0 -178
- data/doc/files/lib/bio/db/sam/sam_rb.html +0 -113
- data/doc/files/lib/bio/db/sam_rb.html +0 -111
- data/doc/fr_class_index.html +0 -43
- data/doc/fr_file_index.html +0 -33
- data/doc/fr_method_index.html +0 -58
- data/doc/lib/bio-samtools_rb.html +0 -115
- data/doc/lib/bio/db/pileup_rb.html +0 -171
- data/doc/lib/bio/db/sam/bam_rb.html +0 -121
- data/doc/lib/bio/db/sam/faidx_rb.html +0 -117
- data/doc/lib/bio/db/sam/library_rb.html +0 -115
- data/doc/lib/bio/db/sam/pileup_rb.html +0 -171
- data/doc/lib/bio/db/sam/sam_rb.html +0 -121
- data/doc/lib/bio/db/sam/vcf_rb.html +0 -124
- data/doc/lib/bio/db/sam_rb.html +0 -115
- data/doc/lib/bio/db/vcf_rb.html +0 -124
- data/doc/rdoc-style.css +0 -208
- data/lib/bio/db/sam/bam.rb +0 -210
- data/lib/bio/db/sam/sam.rb +0 -86
- data/test/samples/pipe_char/test.bam +0 -0
- data/test/samples/pipe_char/test.bam.bai +0 -0
- data/test/samples/pipe_char/test.tam +0 -10
- data/test/samples/pipe_char/test_chr.fasta +0 -1000
- data/test/samples/pipe_char/test_chr.fasta.fai +0 -1
- data/test/samples/small/test +0 -0
- data/test/samples/small/test.bam +0 -0
- data/test/samples/small/test.fa +0 -20
- data/test/samples/small/test.fai +0 -0
data/lib/bio-samtools.rb
CHANGED
@@ -0,0 +1,89 @@
|
|
1
|
+
# Extensions to Bio::NucleicAcid for working with IUPAC ambiguity codes.
class Bio::NucleicAcid

  # Two-way lookup table (all lowercase):
  # * ambiguity code => the alphabetically sorted bases it stands for
  # * sorted base set => its ambiguity code
  IUPAC_CODES = {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  # Tells whether +base+ is one of a, c, g or t (either case).
  #
  # The check is a literal substring test. The previous implementation handed
  # +base+ to String#match, which compiles it as a regular expression: "."
  # was reported as unambiguous and "*" or "+" raised RegexpError.
  #
  # Returns true or false.
  def self.is_unambiguous(base)
    "acgtACGT".include?(base)
  end

  # Converts a set of bases (for example "AG" or "ga") into the matching
  # uppercase IUPAC code. The input is downcased, sorted and de-duplicated
  # before the lookup.
  #
  # When the set is not in IUPAC_CODES a warning naming the offending input is
  # written to stderr and "N" is returned instead of raising.
  def self.to_IUAPC(bases)
    code = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if code.nil?
      # Report the input that failed; the looked-up value is always nil here.
      warn "Invalid base! #{bases}"
      code = 'n' #This is a patch... as one of the scripts failed here.
    end
    code.upcase
  end

  # Tells whether +base+ is one of the bases represented by the IUPAC +code+.
  # Comparison is case-insensitive. An unknown +code+ yields false rather
  # than a NoMethodError on nil.
  def self.is_valid(code, base)
    expansion = IUPAC_CODES[code.downcase]
    return false if expansion.nil?
    expansion.chars.include? base.downcase
  end

end
|
59
|
+
|
60
|
+
# Extension to Bio::Sequence for counting differences between two sequences.
# The sequences are assumed to be aligned already; positions are compared
# one by one and no check is made that a base of the first sequence is a
# valid (IUPAC-compatible) match for the base of the second.
class Bio::Sequence
  # Returns the number of positions, over the length of +seq1+, at which
  # seq1[i] differs from seq2[i].
  def self.snps_between(seq1, seq2)
    (0..seq1.size - 1).count { |position| seq1[position] != seq2[position] }
  end
end
|
72
|
+
|
73
|
+
class String
  # Counts how many characters of the string are not plain a/c/g/t (either
  # case), as decided by Bio::NucleicAcid.is_unambiguous. Intended for
  # nucleic acid strings.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  # Counts how many characters are uppercase ASCII letters (A-Z).
  #
  # The previous implementation returned the length of the leading run of
  # non-uppercase characters, so "ACGT" gave 0 and "acgt" gave 4.
  def upper_case_count
    count("A-Z")
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Bio
  class DB

    # One optional SAM field, written in SAM as TAG:VTYPE:VALUE.
    class Tag
      attr_accessor :tag, :type, :value

      # Fills the tag from a "TAG:VTYPE:VALUE" string using fixed offsets:
      # characters 0-1 are the tag, character 3 the type and everything from
      # character 5 onwards the value.
      def set(str)
        @tag = str[0, 2]
        @type = str[3]
        @value = str[5..-1]
      end
    end

    # One SAM alignment line plus the attributes decoded from its flag field
    # (see chapter 2.2.2 of the SAM file documentation).
    # query_strand and mate_strand are true when the strand is forward, which
    # is the opposite of the bit definition in the BAM format, for clarity.
    # primary is the negation of the 0x0100 flag bit.
    class Alignment
      attr_accessor :qname, :flag, :rname, :pos, :mapq, :cigar, :mrnm, :mpos, :isize, :seq, :qual, :tags, :al, :samstr, :calend, :qlen

      attr_accessor :is_paired, :is_mapped, :query_unmapped, :mate_unmapped, :query_strand, :mate_strand, :first_in_pair, :second_in_pair, :primary, :failed_quality, :is_duplicate

      # Parses a tab-separated SAM line into its eleven mandatory fields, the
      # optional tags (a Hash of tag name => Bio::DB::Tag) and the flag bits.
      def initialize(sam_string)
        fields = sam_string.chomp.split("\t")

        @qname, raw_flag, @rname, raw_pos, raw_mapq, @cigar,
          @mrnm, raw_mpos, raw_isize, @seq, @qual = fields
        @flag  = raw_flag.to_i
        @pos   = raw_pos.to_i
        @mapq  = raw_mapq.to_i
        @mpos  = raw_mpos.to_i
        @isize = raw_isize.to_i

        # Everything after the eleventh column is an optional TAG:VTYPE:VALUE field.
        @tags = {}
        (fields[11..-1] || []).each do |field|
          parsed = Bio::DB::Tag.new
          parsed.set(field)
          tags[parsed.tag] = parsed
        end

        bit_set = lambda { |mask| (@flag & mask) > 0 }
        @is_paired      = bit_set.call(0x0001)
        @is_mapped      = bit_set.call(0x0002)
        @query_unmapped = bit_set.call(0x0004)
        @mate_unmapped  = bit_set.call(0x0008)
        @query_strand   = !bit_set.call(0x0010)
        @mate_strand    = !bit_set.call(0x0020)
        @first_in_pair  = bit_set.call(0x0040)
        @second_in_pair = bit_set.call(0x0080)
        @primary        = !bit_set.call(0x0100)
        @failed_quality = bit_set.call(0x0200)
        @is_duplicate   = bit_set.call(0x0400)
      end
    end
  end
end
|
@@ -0,0 +1,255 @@
|
|
1
|
+
#Module to hold the information about the fasta file
|
2
|
+
|
3
|
+
module Bio::DB::Fasta
|
4
|
+
#This class contains the entries in a fasta, as generated by samtools faidx
|
5
|
+
class Index
|
6
|
+
include Enumerable
|
7
|
+
attr_reader :entries
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@entries=[]
|
11
|
+
@entries_map = Hash.new
|
12
|
+
end
|
13
|
+
|
14
|
+
#This doesnt validate if you are adding the same entry twice. I may add
|
15
|
+
#a validation for that.
|
16
|
+
def << (entry)
|
17
|
+
@entries << entry
|
18
|
+
@entries_map[entry.id] = entry
|
19
|
+
end
|
20
|
+
|
21
|
+
def each(&block)
|
22
|
+
@entries.entries(&block)
|
23
|
+
end
|
24
|
+
#Total number of entries
|
25
|
+
def length
|
26
|
+
@entries.length
|
27
|
+
end
|
28
|
+
alias_method :size, :length
|
29
|
+
|
30
|
+
#Returns a new Index just with the specified range, as if it was an Array.
|
31
|
+
#The return object is of type Index.
|
32
|
+
def [](args)
|
33
|
+
tmp = @entries[args]
|
34
|
+
new_index = Index.new
|
35
|
+
tmp.each do | entry |
|
36
|
+
@new_index << entry
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
#Gets the Region object for the full length of the sequence
|
41
|
+
#name queried.
|
42
|
+
def region_for_entry(entry)
|
43
|
+
@entries_map[entry]
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# One sequence of the fasta index: its id and its length (stored as an Integer).
class Entry
  attr_reader :id, :length

  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  # Builds a forward Region spanning the whole entry, from position 1 to
  # the entry length.
  def get_full_region
    whole = Region.new
    whole.entry       = id
    whole.start       = 1
    whole.end         = @length
    whole.orientation = :forward
    whole
  end

  alias_method :to_region, :get_full_region

end
|
67
|
+
|
68
|
+
# Wraps a region of a chromosome (entry name, start, end, orientation)
# together with the statistics computed for it from a pileup.
class Region
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
  attr_accessor :entry, :start, :end, :orientation
  attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases, :total_cov, :called

  # Returns, for every position of the region, the ratio stored for +base+
  # in base_ratios. The resulting array is cached per base.
  #TODO: Debug, as it hasnt been tested in the actual code.
  def base_ratios_for_base(base)
    @all_ratios ||= Hash.new
    # The original looped over an undefined `region`; the region is self.
    @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
    @all_ratios[base]
  end

  # Calculates the consensus, base ratios, coverages and total coverage in
  # the region from the pileup entries stored in +pileup+.
  #* min_cov minimum coverage to make a call (default 0); a position is
  #  called only when its coverage is strictly greater than this value
  #* min_per minimum representation to make a call (default 0.20). If more
  #  than one base can be called, the IUAPC ambiguity code is returned
  # Returns self.
  def calculate_stats_from_pile(opts={})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    reference = self.reference.downcase

    # One fresh hash per position, so uncalled positions never share (and
    # can never jointly mutate) the BASE_COUNT_ZERO constant.
    self.base_ratios = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases       = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages   = Array.new(size, 0)
    self.total_cov   = 0

    pileup.each do |pile|
      if pile.coverage > min_cov
        offset = pile.pos - start
        base_ratios[offset] = pile.base_ratios
        # Called positions are written in uppercase over the lowercase reference.
        reference[offset]   = pile.consensus_iuap(min_per).upcase
        coverages[offset]   = pile.coverage.to_i
        bases[offset]       = pile.bases
        self.called += 1
      end
      # Total coverage counts every pileup line, called or not.
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(reference)
    consensus.na
    consensus.reverse_complement!() if orientation == :reverse
    self.average_coverage = total_cov.to_f / size.to_f
    self
  end

  # Formats the region as "name:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Returns a Region built from a string of the form "name:start-end"
  # (single quotes are stripped first). The orientation is :reverse when
  # end is smaller than start and :forward otherwise.
  #
  # Raises FastaDBException when the string does not have exactly one ":"
  # separated name and one "-" separated coordinate pair.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    # Split the coordinates only once the name/coordinates split is valid,
    # so malformed input raises FastaDBException rather than NoMethodError.
    fields_2 = fields_1.length == 2 ? fields_1[1].split("-") : []
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2 || fields_2.length != 2

    reg = new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Length of the region, computed as end - start.
  def size
    @end - @start
  end
  alias_method :length, :size

end
|
156
|
+
|
157
|
+
# Error raised by this module for an invalid region string or a missing
# reference fasta path.
class FastaDBException < StandardError
end
|
158
|
+
|
159
|
+
# Class that holds the fasta file. It is used as a database, with samtools
# faidx doing the indexing and the sequence extraction.
class FastaFile
  attr_reader :index, :fasta_path

  # Initialize the fasta file. If the fai file doesn't exist, it is generated at startup
  #* fasta path to the fasta file
  #* samtools path to samtools, if it is not provided, use the bundled version
  #
  # Raises FastaDBException when no fasta path is given.
  def initialize(args)
    @fasta_path = args[:fasta]
    @samtools = args[:samtools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    unless File.file?(@fai_file)
      command = "#{@samtools} faidx '#{@fasta_path}'"
      @last_command = command
      system(command)
    end
  end

  # Loads the fai entries into +index+ (only once) and returns how many
  # entries the index holds.
  def load_fai_entries()
    return @index.length if @index
    loaded = Index.new
    # File.foreach closes the file when done; the index is only published
    # after the whole file was read successfully.
    File.foreach(@fai_file) do |line|
      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1])
    end
    @index = loaded
    @index.length
  end

  #Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk.
  #Options - if a subsequence is required
  #* chr - [STRING] the reference name of the subsequence
  #* start - [INT] the start position for the subsequence
  #* stop - [INT] the stop position for the subsequence
  #
  # With all three options the subsequence is returned through
  # fetch_sequence; otherwise the result of the system call is returned.
  # NOTE(review): the original subsequence branch called an undefined
  # fetch_reference with literal symbols and always raised; routing it
  # through fetch_sequence is the presumed intent - confirm.
  def faidx(opts={})
    if opts.has_key?(:chr) && opts.has_key?(:start) && opts.has_key?(:stop)
      region = Region.new
      region.entry = opts[:chr]
      region.start = opts[:start]
      region.end = opts[:stop]
      region.orientation = :forward
      fetch_sequence(region)
    else
      command = "#{@samtools} faidx '#{@fasta_path}'"
      @last_command = command
      system(command)
    end
  end

  # Extracts a sequence with samtools faidx and returns it as a
  # Bio::Sequence::NA.
  #
  #The region needs to have a method to_region or a method to_s that has the format "chromosome:start-end" as in samtools
  # The sequence is reverse-complemented when the resolved region reports a
  # :reverse orientation; objects without an orientation are left forward.
  def fetch_sequence(region)
    region = region.to_region if region.respond_to?(:to_region)
    query = region.to_s
    command = "#{@samtools} faidx '#{@fasta_path}' '#{query}'"
    @last_command = command
    seq = String.new
    # Fasta header lines (">...") are dropped; the rest is concatenated.
    yield_from_pipe(command, String, :text) { |line| seq << line unless line =~ /^>/ }

    reference = Bio::Sequence::NA.new(seq)
    if region.respond_to?(:orientation) && region.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private
  # Runs +command+ and yields its standard output to the block:
  # * type :text - one klass.new(line.chomp) per line, skipping lines that
  #   start with comment_char when skip_comments is set
  # * type :binary - the whole output in one chunk
  #
  #Returns Process::Status with the execution status. If run in a $VERBOSE environment, stderr of the process
  #is forwarded to the default stdout
  def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
    # The block form closes all three pipes even when the caller's block raises.
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      # Drain stderr concurrently so a chatty child cannot fill the pipe
      # and block while we are still reading its stdout.
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (c = pipe.gets(nil))
          yield c
        end
      end
      captured = stderr_reader.value
      puts captured if $VERBOSE
      wait_thr.value # Process::Status object returned.
    end
  end
end
|
255
|
+
end
|
data/lib/bio/db/pileup.rb
CHANGED
@@ -26,185 +26,234 @@
|
|
26
26
|
#
|
27
27
|
module Bio
|
28
28
|
class DB
|
29
|
-
class Pileup
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
29
|
+
class Pileup
|
30
|
+
attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
|
31
|
+
|
32
|
+
# Builds a Pileup from one tab-separated pileup line.
#    pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
#    pile = Bio::DB::Pileup.new(pile_up_line)
# Six columns are read as mpileup output; ten to thirteen columns as the
# deprecated `pileup -c` output, where a '*' reference base marks an indel
# line. Any other column count leaves all fields unset (no error is raised).
def initialize(pile_up_line)
  cols = pile_up_line.split(/\t/)
  case cols.length
  when 6 ##should only be able to get 6 lines from mpileup
    @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
  when 10..13 ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
    if cols[2] == '*' #indel
      @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
    else #snp / identity
      @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
    end
    @consensus_quality = @consensus_quality.to_f
    @snp_quality = @snp_quality.to_f
    @rms_mapq = @rms_mapq.to_f
  end

  @pos = @pos.to_i
  @coverage = @coverage.to_f
  # Caches filled lazily by ref_count, non_refs and non_ref_count.
  @ref_count = nil
  @non_ref_count_hash = nil
  @non_ref_count = nil
end
|
45
|
-
@consensus_quality = @consensus_quality.to_f
|
46
|
-
@snp_quality = @snp_quality.to_f
|
47
|
-
@rms_mapq = @rms_mapq.to_f
|
48
|
-
else
|
49
|
-
#raise RuntimeError, "parsing line '#{pile_up_line.chomp}' failed"
|
50
|
-
end
|
51
58
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
59
|
+
# Counts, case-insensitively, the A, C, G and T characters in the read bases
# and returns them as a hash, e.g. {:A => 1, :C => 0, :G => 0, :T => 0}.
# The hash is computed once and cached.
def non_refs
  @non_ref_count_hash ||= {
    :A => read_bases.count("Aa"),
    :C => read_bases.count("Cc"),
    :G => read_bases.count("Gg"),
    :T => read_bases.count("Tt")
  }
end
|
67
|
+
|
68
|
+
# Returns, as a Float, how many of the read bases are A, C, G or T
# characters (either case). The value is cached after the first call.
def non_ref_count
  @non_ref_count ||= @read_bases.count("ATGCatgc").to_f
end
|
75
|
+
|
76
|
+
# Returns how many reads match the reference at this position, i.e. the
# number of "." and "," characters in the read bases. Cached after the
# first call.
def ref_count
  @ref_count ||= read_bases.count(".,")
end
|
83
|
+
|
84
|
+
# Returns the consensus for this position. A consensus parsed from the
# pileup line is returned as is. Otherwise it is derived from the counts:
# the reference base when it is seen in more than half of the coverage or
# more often than any other base, else all equally most frequent bases
# (including the reference base on a tie) joined in alphabetical order.
def consensus
  return @consensus unless @consensus.nil?

  top = non_refs.values.max
  @consensus =
    if (ref_count / coverage) > 0.5 || ref_count > top
      ref_base
    else
      tied = non_refs.select { |_base, count| count == top }.collect { |pair| pair[0].to_s }
      tied << ref_base if ref_count == top
      tied.sort.join
    end
end
|
101
|
+
|
102
|
+
# Returns a basic VCF line as per samtools/misc sam2vcf.pl, except that it
# scrimps on the ref for indels, returning a '*' instead of the reference
# allele. For non-indel lines ALT is the consensus split into comma
# separated bases, or '.' when it equals the reference base.
def to_vcf
  alt, genotype = genotype_list
  alt = consensus.split(//).join(',') unless ref_base == '*'
  alt = '.' if alt == ref_base

  columns = [
    ref_name,
    pos,
    '.',
    ref_base,
    alt,
    snp_quality.to_i,
    "0",
    "DP=#{coverage.to_i}",
    "GT:GQ:DP",
    "#{genotype}:#{consensus_quality.to_i}:#{coverage.to_i}"
  ]
  columns.join("\t")
end
|
109
|
+
|
110
|
+
private
|
111
|
+
# Returns the VCF v3.3 header block matching the lines produced by to_vcf:
# the meta-information lines followed by the tab-separated column header,
# joined with newlines (no trailing newline).
#
# The previous implementation called join on a String literal, which raises
# NoMethodError.
def Pileup.vcf_header
  [
    '##fileformat=VCFv3.3',
    '##INFO=DP,1,Integer,"Total Depth"',
    '##FORMAT=GT,1,String,"Genotype"',
    '##FORMAT=GQ,1,Integer,"Genotype Quality"',
    '##FORMAT=DP,1,Integer,"Read Depth"',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA"
  ].join("\n")
end
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
# Returns the genotype of the indel as an [alt, genotype] pair:
# * one allele is an indel, the other is not ('*')  => [indel, '0/1']
# * both alleles are the same indel                 => [indel, '1/1']
# * two different indels                            => ["indel1,indel2", '1/2']
# Returns nil when neither allele is an indel, and the string "undef" when
# the consensus is an Array. A "*/*" consensus falls back to the indel_1
# and indel_2 columns.
def indel_gt
  return "undef" if consensus.instance_of?(Array)
  al1, al2 = consensus.split(/\//)
  if al1 == al2 && al1 == '*'
    al1 = indel_1
    al2 = indel_2
  end
  alt1 = parse_indel(al1)
  alt2 = parse_indel(al2)

  return nil if !alt1 && !alt2

  if !alt1
    [alt2, '0/1']
  elsif !alt2
    # The original read `gt - '0/1'`, a typo for an assignment that raised
    # NoMethodError on nil.
    [alt1, '0/1']
  elsif alt1 == alt2
    [alt1, '1/1']
  else
    ["#{alt1},#{alt2}", '1/2']
  end
end
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
152
|
+
# Returns the genotype of the snp as an [alt, genotype] pair:
# * consensus equals the reference base              => ['.', '0/0']
# * consensus is a single non-reference base         => [base, '1/1']
# * consensus holds the reference and one other base => [other, '0/1']
# * consensus holds two non-reference bases          => ["b1,b2", '1/2']
# IUPAC two-base codes are expanded with Pileup.iupac_to_base first.
def snp_gt
  return ['.','0/0'] if ref_base == consensus
  alleles = Pileup.iupac_to_base(consensus)
  # A single non-reference base is a homozygous alternate call; without
  # this guard it was reported as "G," with genotype 1/2.
  return [alleles[0], '1/1'] if alleles.length == 1

  if alleles[0] == ref_base
    [alleles[1], '0/1']
  elsif alleles[1] == ref_base
    [alleles[0], '0/1']
  else
    ["#{alleles[0]},#{alleles[1]}", '1/2']
  end
end
|
164
|
+
|
165
|
+
#identifies the reference base and returns the indel or snp genotype as applicable
|
166
|
+
public
|
167
|
+
# Returns the indel genotype when the reference base is '*' and the snp
# genotype otherwise.
def genotype_list
  ref_base == '*' ? indel_gt : snp_gt
end
|
174
|
+
|
175
|
+
#returns the two bases for the corresponding iupac code
|
176
|
+
public
|
177
|
+
# Expands a two-base IUPAC code (K, M, S, R, W, Y) into its two bases.
# Any other value is split into its individual characters.
def Pileup.iupac_to_base(alt_base)
  two_base_codes = {
    'K' => ['G','T'],
    'M' => ['A','C'],
    'S' => ['C','G'],
    'R' => ['A','G'],
    'W' => ['A','T'],
    'Y' => ['C','T']
  }
  two_base_codes.fetch(alt_base) { alt_base.split(//) }
end
|
188
|
+
|
189
|
+
# Classifies an indel allele string:
# * "-SEQ" (deletion)  => "D" followed by the length of SEQ
# * "+SEQ" (insertion) => "I" followed by SEQ
# * anything else, including '*' => nil
def parse_indel(alt)
  if alt =~ /^-/
    "D#{Regexp.last_match.post_match.length}"
  elsif alt =~ /^\+/
    "I#{Regexp.last_match.post_match}"
  end
end
|
198
|
+
|
199
|
+
|
200
|
+
# Returns the pileup line in the layout it was parsed from: six columns
# when read qualities are present without a consensus quality, thirteen
# columns when an indel allele is set, and ten columns otherwise. Numeric
# columns are written as integers.
def to_s
  columns =
    if @read_quals && !@consensus_quality #6col
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals]
    elsif @indel_1 #13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3]
    else #10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals]
    end
  columns.join("\t")
end
|
211
|
+
|
212
|
+
|
213
|
+
# Returns a hash with the count of every base at this position: the A, C,
# G and T counts from non_refs plus the reference count stored under the
# uppercased reference base. The hash is computed once and cached.
def bases
  return @bases if @bases
  # Work on a copy: non_refs caches its hash, and writing the reference
  # count into that same object would corrupt later non_refs and consensus
  # results.
  @bases = non_refs.dup
  @bases[ref_base.upcase.to_sym] = ref_count
  @bases
end
|
104
220
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
def Pileup.vcf_header
|
113
|
-
%{##fileformat=VCFv3.3
|
114
|
-
##INFO=DP,1,Integer,"Total Depth"
|
115
|
-
##FORMAT=GT,1,String,"Genotype"
|
116
|
-
##FORMAT=GQ,1,Integer,"Genotype Quality"
|
117
|
-
##FORMAT=DP,1,Integer,"Read Depth"
|
118
|
-
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA
|
119
|
-
}.join("\n")
|
120
|
-
end
|
121
|
-
|
122
|
-
def parse_indel(alt)
|
221
|
+
# Returns the sum of all the counts reported by bases.
#
# Goes through the bases accessor rather than the raw instance variable,
# so it also works when bases has not been called yet.
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
|
123
228
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
gt = '1/1'
|
153
|
-
else
|
154
|
-
alt="#{alt1},#{alt2}"
|
155
|
-
gt= '1/2'
|
156
|
-
end
|
157
|
-
return [alt, gt]
|
158
|
-
|
159
|
-
end
|
160
|
-
|
161
|
-
def snp_gt
|
162
|
-
return ['.','0/0'] if self.ref_base == self.consensus
|
163
|
-
bases = Pileup.iupac_to_base(self.consensus)
|
164
|
-
if bases[0] == self.ref_base
|
165
|
-
return [bases[1],'0/1']
|
166
|
-
elsif bases[1] == self.ref_base
|
167
|
-
return [bases[0],'0/1']
|
168
|
-
else
|
169
|
-
return ["#{bases[0]},#{bases[1]}",'1/2']
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
public
|
174
|
-
def genotype_list
|
175
|
-
if self.ref_base == '*'
|
176
|
-
return indel_gt
|
177
|
-
else
|
178
|
-
return snp_gt
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
public
|
183
|
-
#returns
|
184
|
-
def Pileup.iupac_to_base(alt_base)
|
185
|
-
case alt_base
|
186
|
-
when 'K' then ['G','T']
|
187
|
-
when 'M' then ['A','C']
|
188
|
-
when 'S' then ['C','G']
|
189
|
-
when 'R' then ['A','G']
|
190
|
-
when 'W' then ['A','T']
|
191
|
-
when 'Y' then ['C','T']
|
192
|
-
else alt_base.split(//)
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
#returns pileup format line
|
197
|
-
def to_s
|
198
|
-
if @read_quals and !@consensus_quality #6col
|
199
|
-
[@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
|
200
|
-
elsif @indel_1 #13 cols
|
201
|
-
[@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
|
202
|
-
else #10 cols
|
203
|
-
[@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
|
229
|
+
# Returns a hash giving, for every key of bases, its count divided by
# base_coverage (as a Float). The hash is computed once and cached.
def base_ratios
  return @base_ratios if @base_ratios
  counts = bases
  @base_ratios = Hash.new
  counts.each_pair do |base, count|
    @base_ratios[base] = count.to_f / base_coverage.to_f
  end
  @base_ratios
end
|
238
|
+
|
239
|
+
# Returns the consensus as an IUPAC code: every base whose share of the
# coverage is strictly greater than +minumum_ratio_for_iup_consensus+ is
# collected and converted with Bio::NucleicAcid.to_IUAPC. When no base
# passes the threshold the lowercase reference base is returned.
#
# Results are cached per threshold. The previous single-value cache
# returned the answer of the first call whatever threshold was passed later.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= {}
  cached = @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus]
  return cached unless cached.nil?

  passing = String.new
  bases.each do |base, count|
    passing << base.to_s[0, 1] if count / coverage.to_f > minumum_ratio_for_iup_consensus
  end

  code = passing.empty? ? ref_base.downcase : Bio::NucleicAcid.to_IUAPC(passing)
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] = code
end
|
255
|
+
|
256
|
+
|
204
257
|
end
|
205
|
-
|
206
258
|
end
|
207
|
-
|
208
|
-
end
|
209
|
-
end
|
210
259
|
end
|