bio-samtools-wrapper 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +27 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +702 -0
- data/README.md +501 -0
- data/Rakefile +73 -0
- data/VERSION +1 -0
- data/bin/bam_consensus.rb +85 -0
- data/bio-samtools-wrapper.gemspec +181 -0
- data/doc/Bio/DB/Alignment.html +552 -0
- data/doc/Bio/DB/Pileup.html +711 -0
- data/doc/Bio/DB/SAM/Library.html +167 -0
- data/doc/Bio/DB/SAM/Tools.html +109 -0
- data/doc/Bio/DB/SAM.html +1853 -0
- data/doc/Bio/DB/Tag.html +208 -0
- data/doc/Bio/DB/Vcf.html +431 -0
- data/doc/Bio/DB.html +105 -0
- data/doc/Bio.html +175 -0
- data/doc/LICENSE_txt.html +846 -0
- data/doc/created.rid +9 -0
- data/doc/fonts/Lato-Light.ttf +0 -0
- data/doc/fonts/Lato-LightItalic.ttf +0 -0
- data/doc/fonts/Lato-Regular.ttf +0 -0
- data/doc/fonts/Lato-RegularItalic.ttf +0 -0
- data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
- data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
- data/doc/fonts.css +167 -0
- data/doc/images/add.png +0 -0
- data/doc/images/arrow_up.png +0 -0
- data/doc/images/brick.png +0 -0
- data/doc/images/brick_link.png +0 -0
- data/doc/images/bug.png +0 -0
- data/doc/images/bullet_black.png +0 -0
- data/doc/images/bullet_toggle_minus.png +0 -0
- data/doc/images/bullet_toggle_plus.png +0 -0
- data/doc/images/date.png +0 -0
- data/doc/images/delete.png +0 -0
- data/doc/images/find.png +0 -0
- data/doc/images/loadingAnimation.gif +0 -0
- data/doc/images/macFFBgHack.png +0 -0
- data/doc/images/package.png +0 -0
- data/doc/images/page_green.png +0 -0
- data/doc/images/page_white_text.png +0 -0
- data/doc/images/page_white_width.png +0 -0
- data/doc/images/plugin.png +0 -0
- data/doc/images/ruby.png +0 -0
- data/doc/images/tag_blue.png +0 -0
- data/doc/images/tag_green.png +0 -0
- data/doc/images/transparent.png +0 -0
- data/doc/images/wrench.png +0 -0
- data/doc/images/wrench_orange.png +0 -0
- data/doc/images/zoom.png +0 -0
- data/doc/index.html +106 -0
- data/doc/js/darkfish.js +140 -0
- data/doc/js/jquery.js +18 -0
- data/doc/js/navigation.js +142 -0
- data/doc/js/search.js +109 -0
- data/doc/js/search_index.js +1 -0
- data/doc/js/searcher.js +228 -0
- data/doc/rdoc.css +580 -0
- data/doc/table_of_contents.html +305 -0
- data/ext/Makefile-bioruby.patch +12 -0
- data/ext/Makefile-suse.patch +11 -0
- data/ext/mkrf_conf.rb +118 -0
- data/lib/bio/BIOExtensions.rb +89 -0
- data/lib/bio/db/alignment.rb +64 -0
- data/lib/bio/db/fastadb.rb +320 -0
- data/lib/bio/db/pileup.rb +273 -0
- data/lib/bio/db/sam/external/COPYING +21 -0
- data/lib/bio/db/sam/external/VERSION +1 -0
- data/lib/bio/db/sam/library.rb +32 -0
- data/lib/bio/db/sam.rb +778 -0
- data/lib/bio/db/vcf.rb +105 -0
- data/lib/bio-samtools-wrapper.rb +9 -0
- data/test/.gitignore +1 -0
- data/test/helper.rb +18 -0
- data/test/sample.vcf +24 -0
- data/test/samples/.gitignore +1 -0
- data/test/samples/LCI/NC_001988.ffn +2 -0
- data/test/samples/LCI/test.bam +0 -0
- data/test/samples/LCI/test.bam.bai +0 -0
- data/test/samples/small/dupes.bam +0 -0
- data/test/samples/small/dupes.sam +274 -0
- data/test/samples/small/ids2.txt +1 -0
- data/test/samples/small/map_for_reheader.sam +8 -0
- data/test/samples/small/map_to_merge1.bam +0 -0
- data/test/samples/small/map_to_merge1.bam.bai +0 -0
- data/test/samples/small/map_to_merge1.sam +8 -0
- data/test/samples/small/map_to_merge2.bam +0 -0
- data/test/samples/small/map_to_merge2.bam.bai +0 -0
- data/test/samples/small/map_to_merge2.sam +8 -0
- data/test/samples/small/no_md.sam +8 -0
- data/test/samples/small/sorted.bam +0 -0
- data/test/samples/small/sorted.bam.bai +0 -0
- data/test/samples/small/test.sai +0 -0
- data/test/samples/small/test.tam +10 -0
- data/test/samples/small/test_chr.fasta +1000 -0
- data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.amb +2 -0
- data/test/samples/small/test_chr.fasta.ann +3 -0
- data/test/samples/small/test_chr.fasta.bwt +0 -0
- data/test/samples/small/test_chr.fasta.pac +0 -0
- data/test/samples/small/test_chr.fasta.rbwt +0 -0
- data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rpac +0 -0
- data/test/samples/small/test_chr.fasta.rsa +0 -0
- data/test/samples/small/test_chr.fasta.sa +0 -0
- data/test/samples/small/test_cov.svg +273 -0
- data/test/samples/small/test_fastadb.fasta +34 -0
- data/test/samples/small/testu.bam +0 -0
- data/test/samples/small/testu.bed +2 -0
- data/test/test_bio-samtools-wrapper.rb +1 -0
- data/test/test_fastadb.rb +89 -0
- data/test/test_pileup.rb +90 -0
- data/test/test_sam.rb +421 -0
- data/test/test_vcf.rb +79 -0
- data/tutorial/tutorial.html +474 -0
- data/tutorial/tutorial.md +424 -0
- data/tutorial/tutorial.pdf +0 -0
- metadata +254 -0
@@ -0,0 +1,320 @@
|
|
1
|
+
#Module to hold the information about the fasta file
|
2
|
+
|
3
|
+
module Bio::DB::Fasta
|
4
|
+
# Ordered collection of the entries of a fasta file, as listed in the .fai
# index generated by samtools faidx. Entries keep their insertion order and
# can also be looked up by id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends an entry (any object responding to #id).
  # No duplicate check is done: adding the same id twice keeps both objects
  # in #entries, and the id lookup points to the one added last.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields every entry in insertion order; returns an Enumerator when no
  # block is given. This is what backs the Enumerable mixin.
  def each(&block)
    @entries.each(&block)
  end

  # Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  # Returns a new Index holding only the selected entries.
  # * args anything accepted by Array#[] with one argument (Integer or Range)
  # An Integer selects a single entry; a position outside the index gives an
  # empty Index.
  def [](args)
    selected = @entries[args]
    selected = [selected] unless selected.is_a?(Array)
    selected.compact.each_with_object(Index.new) { |entry, subset| subset << entry }
  end

  # Looks up the object stored for the given sequence id.
  # Returns nil when the id is unknown.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
|
46
|
+
|
47
|
+
# One record of a .fai index: a sequence name plus the numbers needed to
# locate any of its bases inside the fasta file.
class Entry
  attr_reader :id, :length, :line_bases, :line_length, :offset
  alias_method :size, :length

  # * id          name of the sequence
  # * length      number of bases in the sequence
  # * offset      byte position of the first base in the fasta file
  # * line_bases  number of bases on each sequence line
  # * line_length number of bytes on each sequence line (bases + line ending)
  # Numeric arguments may be strings (as read from the .fai); they are
  # converted with to_i.
  def initialize(id, length, offset = 0, line_bases = 0, line_length = 0)
    @id = id
    @length = length.to_i
    @offset = offset.to_i
    @line_bases = line_bases.to_i
    @line_length = line_length.to_i
  end

  # Byte position in the fasta file of the base at the given 1-based
  # coordinate.
  # The arithmetic is done on the 0-based position so that the last base of
  # a line (a coordinate that is a multiple of line_bases) maps to that base
  # and not to the line ending that follows it.
  # Raises ZeroDivisionError when line_bases is 0.
  def get_base_coordinate(coordinate)
    zero_based = coordinate - 1
    full_lines = zero_based / line_bases
    within_line = zero_based % line_bases
    offset + (line_length * full_lines) + within_line
  end

  # Region covering the whole sequence (1..length) in forward orientation.
  def get_full_region
    reg = Region.new
    reg.entry = id
    reg.start = 1
    reg.end = @length
    reg.orientation = :forward
    reg
  end

  alias_method :to_region, :get_full_region
end
|
82
|
+
|
83
|
+
# Class to wrap a region of a chromosome: a sequence name, a start and an
# end coordinate and an orientation, plus the statistics computed from a
# pileup over that region.
class Region
  BASE_COUNT_ZERO = { :A => 0, :C => 0, :G => 0, :T => 0 }.freeze
  attr_accessor :entry, :start, :end, :orientation

  attr_accessor :pileup, :average_coverage, :snps, :reference, :allele_freq, :consensus, :coverages, :bases, :total_cov, :called

  # * args hash with the optional keys :entry, :start, :end and :orientation
  def initialize(args = {})
    @entry = args[:entry]
    @start = args[:start]
    @end = args[:end]
    @orientation = args[:orientation]
  end

  # Returns an array with the value stored under +base+ in the allele_freq
  # hash of every position of the region. The array is cached per base.
  # allele_freq must have been set before calling this.
  def allele_freq_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0...size).map { |i| @allele_freq[i][base] }
  end

  alias_method :base_ratios_for_base, :allele_freq_for_base
  alias_method :base_ratios, :allele_freq

  # Calculates the consensus, base ratios, coverages and total coverages in the region
  # * min_cov minimum coverage to make a call (default 0). A position is
  #   called only when its coverage is strictly greater than this value.
  # * min_per minimum representation to make a call (default 0.20). If more than one base
  #   can be called, the IUPAC ambiguity code is returned
  # Requires reference and pileup to be set. Returns self.
  def calculate_stats_from_pile(opts = {})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    # Uncalled positions stay in lower case; called ones are overwritten in upper case.
    sequence = self.reference.downcase

    # Every position gets its own zeroed hash so that changing one position
    # can never leak into the others.
    self.allele_freq = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    self.pileup.each do |pile|
      if pile.coverage > min_cov
        position = pile.pos - self.start
        self.allele_freq[position] = pile.allele_freq
        sequence[position] = pile.consensus_iuap(min_per).upcase
        self.coverages[position] = pile.coverage.to_i
        self.bases[position] = pile.bases
        self.called += 1
      end
      # Coverage is accumulated for every pile, called or not.
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(sequence)
    self.consensus.na
    self.consensus.reverse_complement! if self.orientation == :reverse
    self.average_coverage = self.total_cov.to_f / size.to_f
    self
  end

  # Returns the region in the form "name:start-end"
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Returns a region object from a string in form "name:start-end".
  # Single quotes in the string are ignored. The orientation is :reverse
  # when end is smaller than start and :forward otherwise.
  # Raises FastaDBException when the string does not have exactly one ':'
  # separated name and one '-' separated pair of coordinates.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    name_and_range = string.split(":")
    raise FastaDBException, "Invalid region. #{string}" if name_and_range.length != 2
    coordinates = name_and_range[1].split("-")
    raise FastaDBException, "Invalid region. #{string}" if coordinates.length != 2

    reg = Region.new(:entry => name_and_range[0], :start => coordinates[0].to_i, :end => coordinates[1].to_i)
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Length of the region, computed as end - start.
  # NOTE(review): for inclusive 1-based coordinates this is one less than
  # the number of bases, and it is negative for reverse regions - confirm
  # with the callers before changing it.
  def size
    @end - @start
  end
  alias_method :length, :size
end
|
180
|
+
|
181
|
+
#Error raised by the fasta database classes in this file: invalid region strings, a missing fasta path, unknown entries and out of range regions.
class FastaDBException < StandardError; end
|
182
|
+
|
183
|
+
# Class that holds the fasta file. It is used as a database: sequences are
# fetched by region, either through samtools or by seeking in the file with
# the offsets stored in the .fai index.
class FastaFile
  attr_reader :fasta_path

  # Initialize the fasta file. If the fai file doesn't exist, it is generated at startup
  # * fasta path to the fasta file
  # * samtools path to samtools. true selects the bundled binary. When it is
  #   false (the default) sequences are read natively from the fasta file and
  #   the bundled binary is only used to build the index.
  # Raises FastaDBException when no fasta path is given.
  def initialize(fasta: nil, samtools: false)
    @fasta_path = fasta
    @samtools = samtools
    @index = nil
    @fasta_file = nil
    @samtools = bundled_samtools if samtools == true
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    run_faidx unless File.file?(@fai_file)
  end

  # Loads the fai entries. The index is parsed only once; later calls reuse it.
  # Returns the number of entries.
  def load_fai_entries
    return @index.length if @index
    # Build into a local first: if reading fails no half-built index is cached.
    loaded = Index.new
    File.foreach(@fai_file) do |line|
      fields = line.chomp.split("\t")
      loaded << Entry.new(fields[0], fields[1], fields[2], fields[3], fields[4])
    end
    @index = loaded
    @index.length
  end

  # Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence.
  # If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk
  # and the result of Kernel#system is returned. If a region is specified, the subsequence is
  # returned as by fetch_sequence.
  # Options - if a subsequence is required
  # * chr - [STRING] the reference name of the subsequence
  # * start - [INT] the start position for the subsequence
  # * stop - [INT] the stop position for the subsequence
  def faidx(opts = {})
    if opts.key?(:chr) && opts.key?(:start) && opts.key?(:stop)
      first = opts[:start].to_i
      last = opts[:stop].to_i
      orientation = last < first ? :reverse : :forward
      fetch_sequence(Region.new(:entry => opts[:chr], :start => first, :end => last, :orientation => orientation))
    else
      run_faidx
    end
  end

  # Returns the Index of the fasta file. The first call runs samtools faidx
  # and parses the resulting .fai file.
  def index
    return @index if @index
    #TODO: make a ruby implementation
    run_faidx
    load_fai_entries
    @index
  end

  # Fetches the sequence of a region by running samtools faidx.
  # Header lines (starting with '>') are dropped and the remaining lines are
  # joined. Returns a String.
  def fetch_sequence_samtools(region)
    command = "#{@samtools} faidx '#{@fasta_path}' '#{samtools_query(region)}'"
    puts "Running: #{command}" if $DEBUG
    @last_command = command
    seq = String.new
    yield_from_pipe(command, String, :text) { |line| seq << line unless line.start_with?(">") }
    seq
  end

  # Fetches the sequence of a region by seeking in the fasta file.
  # * region a Region, or a string in the form "name:start-end"
  # The bases between the two coordinates are returned in file order, also
  # when end is smaller than start. Returns a String without whitespace.
  # The file handle is kept open and shared by the object, so only one thread
  # can be associated with each fastadb object.
  def fetch_sequence_native(region)
    query = region.is_a?(Region) ? region : Region.parse_region(region)
    @fasta_file ||= File.open(@fasta_path)
    entry = index.region_for_entry(query.entry)
    raise FastaDBException, "Entry (#{query.entry})not found in reference" unless entry

    first, last = [query.start, query.end].minmax
    start_pointer = entry.get_base_coordinate(first)
    end_pointer = entry.get_base_coordinate(last)
    @fasta_file.seek(start_pointer, IO::SEEK_SET)
    # read returns nil at end of file; to_s turns that into an empty sequence.
    @fasta_file.read(end_pointer - start_pointer + 1).to_s.gsub(/\s+/, '')
  end

  # The region needs to be a Region or have a method to_s that has the format "chromosome:start-end" as in samtools.
  # Returns a Bio::Sequence::NA, reverse complemented when the region has the :reverse orientation.
  # Raises FastaDBException when the entry is unknown or a coordinate is outside of the entry.
  def fetch_sequence(region)
    load_fai_entries
    region = Region.parse_region(region.to_s) unless region.is_a?(Region)
    entry = index.region_for_entry(region.entry)
    raise FastaDBException, "Entry (#{region.entry})not found in reference" unless entry
    # Check both ends: in a reverse region the larger coordinate is the start.
    first, last = [region.start, region.end].minmax
    if last > entry.size || first < 1
      raise FastaDBException, "Region in invalid range (#{region}): Valid range: #{entry.to_region.to_s} has a size of #{entry.size}."
    end
    seq = @samtools ? fetch_sequence_samtools(region) : fetch_sequence_native(region)
    reference = Bio::Sequence::NA.new(seq)
    reference.reverse_complement! if region.respond_to?(:orientation) && region.orientation == :reverse
    reference
  end

  private

  # Path of the samtools binary shipped next to this file.
  def bundled_samtools
    File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external', 'samtools')
  end

  # Runs samtools faidx on the fasta file, with the configured binary or the
  # bundled one when none was configured. Returns the result of Kernel#system.
  def run_faidx
    command = "#{@samtools || bundled_samtools} faidx '#{@fasta_path}'"
    @last_command = command
    system(command)
  end

  # Text of the region as passed to samtools faidx. Coordinates of a Region
  # are written in ascending order.
  def samtools_query(region)
    return region.to_region.to_s if region.respond_to?(:to_region)
    return region.to_s unless region.is_a?(Region) && region.start && region.end
    first, last = [region.start, region.end].minmax
    "#{region.entry}:#{first}-#{last}"
  end

  # Runs the command and yields its standard output to the block.
  # * type :text yields klass.new(line) for every chomped line, :binary yields the raw output
  # * skip_comments when true, text lines starting with comment_char are not yielded
  # Returns Process::Status with the execution status. If run in a $DEBUG environment, stderr of the process
  # is forwarded to the default stdout
  def yield_from_pipe(command, klass, type = :text, skip_comments = true, comment_char = "#", &block)
    # The block form closes the three pipes even when the caller's block raises.
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (chunk = pipe.gets(nil))
          yield chunk
        end
      end
      # Always drain stderr before waiting, so the process cannot stay
      # blocked writing diagnostics nobody reads.
      diagnostics = stderr.read
      puts diagnostics if $DEBUG
      wait_thr.value # Process::Status object returned.
    end
  end
end
|
320
|
+
end
|
@@ -0,0 +1,273 @@
|
|
1
|
+
# :title:Pileup
|
2
|
+
# = Bio::DB::Pileup
|
3
|
+
# A class representing information in SAMTools pileup format
|
4
|
+
# Author:: Dan MacLean (dan.maclean@tsl.ac.uk)
|
5
|
+
# Pileup is described at http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ#I_do_not_understand_the_columns_in_the_pileup_output.
|
6
|
+
# Briefly (when you invoke pileup with the -c option):
|
7
|
+
# * 1 reference sequence name
|
8
|
+
# * 2 reference coordinate
|
9
|
+
# * (3) reference base, or `*' for an indel line
|
10
|
+
# * (4) genotype where heterozygotes are encoded in the IUB code: M=A/C, R=A/G, W=A/T, S=C/G, Y=C/T and K=G/T; indels are indicated by, for example, */+A, -A/* or +CC/-C. There is no difference between */+A or +A/*.
|
11
|
+
# * (5) Phred-scaled likelihood that the genotype is wrong, which is also called `consensus quality'.
|
12
|
+
# * (6) Phred-scaled likelihood that the genotype is identical to the reference, which is also called `SNP quality'. Suppose the reference base is A and in alignment we see 17 G and 3 A. We will get a low consensus quality because it is difficult to distinguish an A/G heterozygote from a G/G homozygote. We will get a high SNP quality, though, because the evidence of a SNP is very strong.
|
13
|
+
# * (7) root mean square (RMS) mapping quality
|
14
|
+
# * 8 # reads covering the position
|
15
|
+
# * 9 read bases at a SNP line (check the manual page for more information); the 1st indel allele otherwise
|
16
|
+
# * 10 base quality at a SNP line; the 2nd indel allele otherwise
|
17
|
+
# * (11) indel line only: # reads directly supporting the 1st indel allele
|
18
|
+
# * (12) indel line only: # reads directly supporting the 2nd indel allele
|
19
|
+
# * (13) indel line only: # reads supporting a third indel allele
|
20
|
+
# If pileup is invoked without `-c', indel lines and columns between 3 and 7 inclusive will not be outputted.
|
21
|
+
#
|
22
|
+
# NB mpileup uses the 6 column output format eg
|
23
|
+
# "seq2\t151\tG\t12\t...........A\t:9<;;7=<<<<<"
|
24
|
+
# Pileup provides accessors for all columns (6 or 10 column format) and a few other useful methods
|
25
|
+
#
|
26
|
+
#
|
27
|
+
module Bio
|
28
|
+
class DB
|
29
|
+
# One line of SAMtools pileup output. The 6 column format and the 10 to 13
# column format (pileup -c) are understood; any other number of columns
# leaves the object with empty values.
class Pileup
  attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2

  #creates the Pileup object
  #    pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
  #    pile = Bio::DB::Pileup.new(pile_up_line)
  def initialize(pile_up_line)
    cols = pile_up_line.split(/\t/)
    @consensus = nil
    @consensus_quality = nil
    @read_quals = nil
    @bases = nil
    @allele_frequency = nil
    @consensus_iuap = {} # IUPAC consensus cached per threshold
    if cols.length == 6 ##should only be able to get 6 columns from mpileup
      @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
    elsif (10..13).include?(cols.length) ##in case anyone tries to use deprecated pileup with -c flag we get up to 13 cols...
      if cols[2] == '*' #indel
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
      else #snp / identity
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
      end
      @consensus_quality = @consensus_quality.to_f
      @snp_quality = @snp_quality.to_f
      @rms_mapq = @rms_mapq.to_f
    end
    # Lines with another number of columns are not rejected: every field stays nil.

    @pos = @pos.to_i
    @coverage = @coverage.to_f
    @ref_count = nil
    @non_ref_count_hash = nil
    @non_ref_count = nil
  end

  #Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts
  #returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
  def non_refs
    @non_ref_count_hash ||= {
      :A => read_bases.count("Aa"),
      :C => read_bases.count("Cc"),
      :G => read_bases.count("Gg"),
      :T => read_bases.count("Tt")
    }
  end

  # returns the total non-reference bases in the reads at this position
  def non_ref_count
    @non_ref_count = @read_bases.count("ATGCatgc").to_f if @non_ref_count.nil?
    @non_ref_count
  end

  # returns the count of reference-bases in the reads at this position
  def ref_count
    @ref_count = read_bases.count(".,") if @ref_count.nil?
    @ref_count
  end

  # returns the consensus from the pileup. Lines in the 10 to 13 column format
  # return the consensus column. Otherwise the reference base is returned when it
  # is seen in more than half of the coverage, else the most frequent non-reference
  # base; if there are equally represented bases returns a string of all equally
  # represented bases in alphabetical order
  def consensus
    return @consensus unless @consensus.nil?
    @consensus =
      if (ref_count / coverage) > 0.5
        ref_base
      else
        top = non_refs.values.max
        non_refs.select { |_base, count| count == top }.keys.map(&:to_s).sort.join
      end
  end

  #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
  def to_vcf
    alt, g = genotype_list
    alt = consensus.split(//).join(',') unless ref_base == '*'
    alt = '.' if alt == ref_base
    alt = alt.split(',')
    #if the reference base is in alt, remove it
    alt.delete(ref_base.to_s)
    alt = alt.join(',')
    [ref_name, pos, '.', ref_base, alt, snp_quality.to_i, "0", "DP=#{coverage.to_i}", "GT:GQ:DP", "#{g}:#{consensus_quality.to_i}:#{coverage.to_i}"].join("\t")
  end

  #returns the header lines matching the output of to_vcf
  def Pileup.vcf_header
    %{##fileformat=VCFv3.3\n##INFO=DP,1,Integer,"Total Depth"\n##FORMAT=GT,1,String,"Genotype"\n##FORMAT=GQ,1,Integer,"Genotype Quality"\n##FORMAT=DP,1,Integer,"Read Depth"\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA\n}
  end

  #identifies the reference base and returns the indel or snp genotype as applicable
  def genotype_list
    ref_base == '*' ? indel_gt : snp_gt
  end

  #returns the two bases for the corresponding iupac code
  def Pileup.iupac_to_base(alt_base)
    case alt_base
    when 'K' then ['G', 'T']
    when 'M' then ['A', 'C']
    when 'S' then ['C', 'G']
    when 'R' then ['A', 'G']
    when 'W' then ['A', 'T']
    when 'Y' then ['C', 'T']
    else alt_base.split(//)
    end
  end

  #identifies if the indel is an insertion or a deletion
  #returns "D<length>" for "-SEQ", "I<SEQ>" for "+SEQ" and nil for anything else
  def parse_indel(alt)
    return nil if alt.nil?
    return "D#{alt.length - 1}" if alt.start_with?('-')
    return "I#{alt[1..-1]}" if alt.start_with?('+')
    nil
  end

  #returns pileup format line
  def to_s
    if @read_quals and !@consensus_quality #6col
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
    elsif @indel_1 #13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
    else #10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
    end
  end

  #returns a hash with the count of every base, including the reference base
  def bases
    return @bases if @bases
    # Work on a copy: the hash cached by non_refs must keep holding only the
    # non-reference counts, whatever the order of the calls.
    @bases = non_refs.dup
    @bases[ref_base.upcase.to_sym] = ref_count
    @bases
  end

  #returns the sum of the counts of all the bases
  def base_coverage
    bases.values.inject(0) { |total, count| total + count }
  end

  #returns the frequency of all bases in pileup position
  def allele_freq
    return @allele_frequency if @allele_frequency
    total = base_coverage.to_f
    @allele_frequency = bases.each_with_object({}) do |(base, count), frequencies|
      frequencies[base] = count.to_f / total
    end
  end

  # returns the IUPAC consensus of all the bases with a ratio over the coverage greater
  # than the given threshold. When no base passes the threshold, the reference
  # base is returned in lower case. The result is cached for each threshold.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    ratio = minumum_ratio_for_iup_consensus
    return @consensus_iuap[ratio] if @consensus_iuap.key?(ratio)

    called = bases.select { |_base, count| count / coverage.to_f > ratio }.keys.map { |base| base[0].to_s }
    @consensus_iuap[ratio] =
      if called.empty?
        ref_base.downcase
      else
        # the alignment object gives the IUPAC consensus of the called bases
        Bio::Alignment.new(called.map { |base| Bio::Sequence::NA.new(base) }).consensus_iupac
      end
  end

  private

  #returns the genotype of the indel as [alt, genotype], or nil when none of the
  #alleles is an insertion or a deletion
  def indel_gt
    return "undef" if consensus.instance_of?(Array)
    al1, al2 = consensus.split(/\//)
    if al1 == al2 && al1 == '*'
      al1 = indel_1
      al2 = indel_2
    end
    alt1 = parse_indel(al1)
    alt2 = parse_indel(al2)

    return nil if !alt1 and !alt2
    if !alt1
      [alt2, '0/1']
    elsif !alt2
      [alt1, '0/1']
    elsif alt1 == alt2
      [alt1, '1/1']
    else
      ["#{alt1},#{alt2}", '1/2']
    end
  end

  #returns the genotype of the snp as [alt, genotype]
  def snp_gt
    return ['.', '0/0'] if ref_base == consensus
    first, second = Pileup.iupac_to_base(consensus)
    if first == ref_base
      [second, '0/1']
    elsif second == ref_base
      [first, '0/1']
    else
      ["#{first},#{second}", '1/1']
    end
  end
end
|
272
|
+
end
|
273
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008-2009 Genome Research Ltd.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
1.6
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Bio
|
2
|
+
class DB
|
3
|
+
module SAM
|
4
|
+
module Library
  #IMPORTANT NOTE: Windows library is missing in this distribution

  # Return the path with the file name of the library for the specific operating system.
  # The extension is chosen from RUBY_PLATFORM, falling back to RUBY_DESCRIPTION
  # for JRuby. On a platform that is not recognised the extension is empty.
  def filename
    #TODO refactor this piece of code in all the files
    lib_os = case RUBY_PLATFORM
             when /linux/
               'so.1'
             when /darwin/
               '1.dylib'
             # Windows builds of Ruby report mswin, mingw or cygwin
             when /mswin|mingw|cygwin|windows/
               'dll'
             else
               case RUBY_DESCRIPTION
               when /jruby.*darwin/
                 '1.dylib'
               when /jruby.*linux/
                 'so.1'
               end
             end

    File.join(File.expand_path(File.dirname(__FILE__)), 'external', "libbam.#{lib_os}")
  end #filename
  module_function :filename
end #Library
|
30
|
+
end #Sam
|
31
|
+
end #DB
|
32
|
+
end #Bio
|