bio-samtools-wrapper 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +27 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +702 -0
- data/README.md +501 -0
- data/Rakefile +73 -0
- data/VERSION +1 -0
- data/bin/bam_consensus.rb +85 -0
- data/bio-samtools-wrapper.gemspec +181 -0
- data/doc/Bio/DB/Alignment.html +552 -0
- data/doc/Bio/DB/Pileup.html +711 -0
- data/doc/Bio/DB/SAM/Library.html +167 -0
- data/doc/Bio/DB/SAM/Tools.html +109 -0
- data/doc/Bio/DB/SAM.html +1853 -0
- data/doc/Bio/DB/Tag.html +208 -0
- data/doc/Bio/DB/Vcf.html +431 -0
- data/doc/Bio/DB.html +105 -0
- data/doc/Bio.html +175 -0
- data/doc/LICENSE_txt.html +846 -0
- data/doc/created.rid +9 -0
- data/doc/fonts/Lato-Light.ttf +0 -0
- data/doc/fonts/Lato-LightItalic.ttf +0 -0
- data/doc/fonts/Lato-Regular.ttf +0 -0
- data/doc/fonts/Lato-RegularItalic.ttf +0 -0
- data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
- data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
- data/doc/fonts.css +167 -0
- data/doc/images/add.png +0 -0
- data/doc/images/arrow_up.png +0 -0
- data/doc/images/brick.png +0 -0
- data/doc/images/brick_link.png +0 -0
- data/doc/images/bug.png +0 -0
- data/doc/images/bullet_black.png +0 -0
- data/doc/images/bullet_toggle_minus.png +0 -0
- data/doc/images/bullet_toggle_plus.png +0 -0
- data/doc/images/date.png +0 -0
- data/doc/images/delete.png +0 -0
- data/doc/images/find.png +0 -0
- data/doc/images/loadingAnimation.gif +0 -0
- data/doc/images/macFFBgHack.png +0 -0
- data/doc/images/package.png +0 -0
- data/doc/images/page_green.png +0 -0
- data/doc/images/page_white_text.png +0 -0
- data/doc/images/page_white_width.png +0 -0
- data/doc/images/plugin.png +0 -0
- data/doc/images/ruby.png +0 -0
- data/doc/images/tag_blue.png +0 -0
- data/doc/images/tag_green.png +0 -0
- data/doc/images/transparent.png +0 -0
- data/doc/images/wrench.png +0 -0
- data/doc/images/wrench_orange.png +0 -0
- data/doc/images/zoom.png +0 -0
- data/doc/index.html +106 -0
- data/doc/js/darkfish.js +140 -0
- data/doc/js/jquery.js +18 -0
- data/doc/js/navigation.js +142 -0
- data/doc/js/search.js +109 -0
- data/doc/js/search_index.js +1 -0
- data/doc/js/searcher.js +228 -0
- data/doc/rdoc.css +580 -0
- data/doc/table_of_contents.html +305 -0
- data/ext/Makefile-bioruby.patch +12 -0
- data/ext/Makefile-suse.patch +11 -0
- data/ext/mkrf_conf.rb +118 -0
- data/lib/bio/BIOExtensions.rb +89 -0
- data/lib/bio/db/alignment.rb +64 -0
- data/lib/bio/db/fastadb.rb +320 -0
- data/lib/bio/db/pileup.rb +273 -0
- data/lib/bio/db/sam/external/COPYING +21 -0
- data/lib/bio/db/sam/external/VERSION +1 -0
- data/lib/bio/db/sam/library.rb +32 -0
- data/lib/bio/db/sam.rb +778 -0
- data/lib/bio/db/vcf.rb +105 -0
- data/lib/bio-samtools-wrapper.rb +9 -0
- data/test/.gitignore +1 -0
- data/test/helper.rb +18 -0
- data/test/sample.vcf +24 -0
- data/test/samples/.gitignore +1 -0
- data/test/samples/LCI/NC_001988.ffn +2 -0
- data/test/samples/LCI/test.bam +0 -0
- data/test/samples/LCI/test.bam.bai +0 -0
- data/test/samples/small/dupes.bam +0 -0
- data/test/samples/small/dupes.sam +274 -0
- data/test/samples/small/ids2.txt +1 -0
- data/test/samples/small/map_for_reheader.sam +8 -0
- data/test/samples/small/map_to_merge1.bam +0 -0
- data/test/samples/small/map_to_merge1.bam.bai +0 -0
- data/test/samples/small/map_to_merge1.sam +8 -0
- data/test/samples/small/map_to_merge2.bam +0 -0
- data/test/samples/small/map_to_merge2.bam.bai +0 -0
- data/test/samples/small/map_to_merge2.sam +8 -0
- data/test/samples/small/no_md.sam +8 -0
- data/test/samples/small/sorted.bam +0 -0
- data/test/samples/small/sorted.bam.bai +0 -0
- data/test/samples/small/test.sai +0 -0
- data/test/samples/small/test.tam +10 -0
- data/test/samples/small/test_chr.fasta +1000 -0
- data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.amb +2 -0
- data/test/samples/small/test_chr.fasta.ann +3 -0
- data/test/samples/small/test_chr.fasta.bwt +0 -0
- data/test/samples/small/test_chr.fasta.pac +0 -0
- data/test/samples/small/test_chr.fasta.rbwt +0 -0
- data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rpac +0 -0
- data/test/samples/small/test_chr.fasta.rsa +0 -0
- data/test/samples/small/test_chr.fasta.sa +0 -0
- data/test/samples/small/test_cov.svg +273 -0
- data/test/samples/small/test_fastadb.fasta +34 -0
- data/test/samples/small/testu.bam +0 -0
- data/test/samples/small/testu.bed +2 -0
- data/test/test_bio-samtools-wrapper.rb +1 -0
- data/test/test_fastadb.rb +89 -0
- data/test/test_pileup.rb +90 -0
- data/test/test_sam.rb +421 -0
- data/test/test_vcf.rb +79 -0
- data/tutorial/tutorial.html +474 -0
- data/tutorial/tutorial.md +424 -0
- data/tutorial/tutorial.pdf +0 -0
- metadata +254 -0
@@ -0,0 +1,320 @@
|
|
1
|
+
#Module to hold the information about the fasta file
|
2
|
+
|
3
|
+
module Bio::DB::Fasta
|
4
|
+
# This class contains the entries in a fasta, as generated by samtools faidx.
# Entries are kept in insertion order and are also reachable by their id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends an entry and registers it under entry.id.
  # No duplicate check is made: adding two entries with the same id keeps
  # both in the ordered list, but lookups by id return the last one added.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields each entry in insertion order. Enumerable is built on top of this,
  # so it must really iterate (Array#entries would ignore the block).
  def each(&block)
    @entries.each(&block)
  end

  # Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  # Returns a new Index just with the specified range, as if it was an Array.
  # The return object is of type Index. A single integer position yields an
  # Index with one entry; an out-of-range position yields an empty Index.
  def [](args)
    picked = @entries[args]
    picked = [picked].compact unless picked.is_a?(Array)
    subset = self.class.new
    picked.each { |entry| subset << entry }
    subset
  end

  # Looks up the object registered under the given id (nil when unknown).
  # Despite the name, this returns the stored entry itself, not a Region.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
|
46
|
+
|
47
|
+
# One record of a fasta index. FastaFile builds these from the five
# tab-separated columns of a .fai line, in this order:
# id, length, offset, line_bases, line_length.
class Entry
  attr_reader :id, :length, :line_bases, :line_length, :offset
  alias_method :size, :length

  # All numeric fields are coerced with to_i, so the raw strings read from
  # the .fai file can be passed in directly.
  def initialize(id, length, offset = 0, line_bases = 0, line_length = 0)
    @id = id
    @length = length.to_i
    @offset = offset.to_i
    @line_bases = line_bases.to_i
    @line_length = line_length.to_i
  end

  # Translates a 1-based base coordinate into a position in the fasta file.
  #
  # The arithmetic is done on the zero-based coordinate so that the last base
  # of a line (coordinate an exact multiple of line_bases) maps to that base
  # and not to the line terminator that follows it.
  # Raises ZeroDivisionError when line_bases is 0 (the constructor default).
  def get_base_coordinate(coordinate)
    full_lines, column = (coordinate - 1).divmod(line_bases)
    offset + (line_length * full_lines) + column
  end

  # Builds a forward Region spanning the entry from 1 to its length.
  def get_full_region
    Region.new(:entry => id, :start => 1, :end => @length, :orientation => :forward)
  end

  alias_method :to_region, :get_full_region
end
|
82
|
+
|
83
|
+
# Class to wrap a region of a chromosome, together with the statistics
# that calculate_stats_from_pile derives for it.
class Region
  # Template for a position without any counted base. It is frozen and is
  # always copied before use, so positions never share one hash.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}.freeze
  attr_accessor :entry, :start, :end, :orientation

  attr_accessor :pileup, :average_coverage, :snps, :reference, :allele_freq, :consensus, :coverages, :bases, :total_cov, :called

  # args may carry :entry, :start, :end and :orientation; missing keys
  # leave the corresponding attribute nil.
  def initialize(args = {})
    @entry = args[:entry]
    @start = args[:start]
    @end = args[:end]
    @orientation = args[:orientation]
  end

  # Returns, for the given base, the value stored in allele_freq at every
  # position of the region (indexes 0...size). The list is computed once per
  # base and cached.
  def allele_freq_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0...size).map { |i| @allele_freq[i][base] }
  end

  alias_method :base_ratios_for_base, :allele_freq_for_base
  alias_method :base_ratios, :allele_freq

  # Calculates the consensus, base ratios, coverages and total coverage in the region
  # * min_cov minimum coverage to make a call (default 0); a position is
  #   only called when its coverage is strictly greater than this value
  # * min_per minimum representation to make a call (default 0.20). If more
  #   than one base can be called, the IUPAC ambiguity code is returned
  # Expects pileup and reference to be set beforehand. Returns self.
  def calculate_stats_from_pile(opts = {})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    # Work on a lower-case copy; called positions are overwritten in upper case.
    reference = self.reference.downcase

    self.allele_freq = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    self.pileup.each do |pile|
      if pile.coverage > min_cov
        idx = pile.pos - self.start
        self.allele_freq[idx] = pile.allele_freq
        reference[idx] = pile.consensus_iuap(min_per).upcase
        self.coverages[idx] = pile.coverage.to_i
        self.bases[idx] = pile.bases
        self.called += 1
      end
      # Every pile counts towards the total, called or not.
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(reference)
    self.consensus.na
    self.consensus.reverse_complement! if self.orientation == :reverse
    self.average_coverage = self.total_cov.to_f / size.to_f
    self
  end

  # Formats the region as "entry:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Returns a region object from a string in form "name:start-end".
  # Single quotes are stripped first. The orientation is :reverse when end is
  # smaller than start and :forward otherwise.
  # Raises FastaDBException unless the string has exactly one ':' separated
  # pair whose second part has exactly one '-' separated pair.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2
    fields_2 = fields_1[1].split("-")
    raise FastaDBException, "Invalid region. #{string}" if fields_2.length != 2

    reg = new(:entry => fields_1[0], :start => fields_2[0].to_i, :end => fields_2[1].to_i)
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Length of the region, computed as end - start.
  # NOTE(review): for inclusive coordinates this is one less than the number
  # of bases, and it is negative for reverse regions - confirm what callers
  # expect before changing it.
  def size
    @end - @start
  end
  alias_method :length, :size
end
|
180
|
+
|
181
|
+
# Error raised by the classes in this module for invalid region strings,
# a missing fasta path, unknown entries and out-of-range regions.
class FastaDBException < StandardError; end
|
182
|
+
|
183
|
+
# Class that holds the fasta file. It is used as a database.
class FastaFile
  attr_reader :fasta_path

  # Initialize the fasta file. If the fai file doesn't exist, it is generated at startup
  # * fasta path to the fasta file
  # * samtools path to samtools; true selects the bundled version. When it is
  #   left false, sequences are read natively and the bundled samtools is only
  #   used to build a missing index.
  # Raises FastaDBException when no fasta path is given.
  def initialize(fasta: nil, samtools: false)
    @fasta_path = fasta
    @samtools = samtools
    @index = nil
    @fasta_file = nil
    @samtools = bundled_samtools if samtools == true
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    run_faidx unless File.file?(@fai_file)
  end

  # Loads the fai entries (once) and returns how many there are.
  def load_fai_entries
    return @index.length if @index
    loaded = Index.new
    # File.foreach closes the file when done, unlike File.open(...).each
    File.foreach(@fai_file) do |line|
      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1], fields[2], fields[3], fields[4])
    end
    # Only publish the index once it has been read completely.
    @index = loaded
    @index.length
  end

  # Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence.
  # If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk
  # and the result of the system call is returned. If a region is specified, the
  # subsequence is fetched through fetch_sequence and returned as a plain String.
  # Options - if a subsequence is required
  # * chr - [STRING] the reference name of the subsequence
  # * start - [INT] the start position for the subsequence
  # * stop - [INT] the stop position for the subsequence
  def faidx(opts = {})
    if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
      region = Region.new(:entry => opts[:chr], :start => opts[:start], :end => opts[:stop])
      fetch_sequence(region).to_s
    else
      run_faidx
    end
  end

  # Returns the Index of the fasta file. On first use the fai file is
  # regenerated with samtools and then loaded.
  def index
    return @index if @index
    #TODO: make a ruby implementation
    run_faidx
    load_fai_entries
    @index
  end

  # Fetches the region by running "samtools faidx" and concatenating every
  # output line that is not a fasta header.
  def fetch_sequence_samtools(region)
    query = region.respond_to?(:to_region) ? region.to_region.to_s : region.to_s
    # An argument vector is used so that paths and region names never pass
    # through a shell.
    argv = [samtools_executable, "faidx", @fasta_path, query]
    @last_command = argv.join(" ")
    puts "Running: #{@last_command}" if $DEBUG
    lines = []
    yield_from_pipe(argv, String, :text) { |line| lines << line unless line.start_with?(">") }
    lines.join
  end

  # Fetches the region by seeking directly in the fasta file, using the
  # coordinates from the index, and strips all whitespace from what is read.
  def fetch_sequence_native(region)
    query = region.is_a?(Region) ? region : Region.parse_region(region)
    #In order to make this reentrant, if we want to make a multithreaded
    #version of this function, we need to get a lock. Currently, only one thread
    #can be associated with each fastadb object
    @fasta_file ||= File.open(@fasta_path)
    entry = index.region_for_entry(query.entry)

    start_pointer = entry.get_base_coordinate(query.start)
    end_pointer = entry.get_base_coordinate(query.end)
    @fasta_file.seek(start_pointer, IO::SEEK_SET)
    raw = @fasta_file.read(end_pointer - start_pointer + 1)
    # read returns nil at end of file
    raw.to_s.gsub(/\s+/, '')
  end

  # The region needs to be a Region, to have a method to_region, or a method to_s
  # that has the format "chromosome:start-end" as in samtools.
  # Returns a Bio::Sequence::NA, reverse complemented when the region
  # orientation is :reverse.
  # Raises FastaDBException when the entry is unknown or the region does not
  # fit inside it.
  def fetch_sequence(region)
    load_fai_entries
    region = region.to_region if !region.is_a?(Region) && region.respond_to?(:to_region)
    region = Region.parse_region(region.to_s) unless region.is_a?(Region)
    entry = index.region_for_entry(region.entry)
    raise FastaDBException.new "Entry (#{region.entry})not found in reference" unless entry
    # Reverse regions have end < start, so validate and fetch on the ordered pair.
    low, high = [region.start, region.end].minmax
    raise FastaDBException.new "Region in invalid range (#{region}): Valid range: #{entry.to_region.to_s} has a size of #{entry.size}." if high > entry.size or low < 1
    span = region
    if low != region.start
      span = Region.new(:entry => region.entry, :start => low, :end => high, :orientation => :forward)
    end
    seq = @samtools ? fetch_sequence_samtools(span) : fetch_sequence_native(span)
    reference = Bio::Sequence::NA.new(seq)
    if region.respond_to?(:orientation) && region.orientation == :reverse
      reference.reverse_complement!
    end
    reference
  end

  private

  # Path of the samtools binary shipped next to this file.
  def bundled_samtools
    File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external', 'samtools')
  end

  # The samtools given at construction, or the bundled one when none was given.
  def samtools_executable
    (@samtools || bundled_samtools).to_s
  end

  # Runs "samtools faidx <fasta>" without a shell and returns what system returns.
  def run_faidx
    argv = [samtools_executable, "faidx", @fasta_path]
    @last_command = argv.join(" ")
    system(*argv)
  end

  # Runs the command (a String, or an Array of program and arguments) and
  # yields its standard output: one klass.new(line) per chomped line for
  # :text, skipping lines that start with comment_char when skip_comments is
  # set, or the raw content for :binary.
  # Returns Process::Status with the execution status. If run in a $DEBUG environment, stderr of the process
  # is forwarded to the default stdout
  def yield_from_pipe(command, klass, type = :text, skip_comments = true, comment_char = "#")
    # The block form closes all pipes even when the caller's block raises.
    Open3.popen3(*Array(command)) do |stdin, stdout, stderr, wait_thr|
      stdin.close
      # Drain stderr in parallel so the child cannot stall on a full pipe.
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = stdout.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (chunk = stdout.gets(nil))
          yield chunk
        end
      end
      exit_status = wait_thr.value # Process::Status object returned.
      stderr_text = stderr_reader.value
      puts stderr_text if $DEBUG
      exit_status
    end
  end
end
|
320
|
+
end
|
@@ -0,0 +1,273 @@
|
|
1
|
+
# :title:Pileup
|
2
|
+
# = Bio::DB::Pileup
|
3
|
+
# A class representing information in SAMTools pileup format
|
4
|
+
# Author:: Dan MacLean (dan.maclean@tsl.ac.uk)
|
5
|
+
# Pileup is described at http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ#I_do_not_understand_the_columns_in_the_pileup_output.
|
6
|
+
# Briefly (when you invoke pileup with the -c option):
|
7
|
+
# * 1 reference sequence name
|
8
|
+
# * 2 reference coordinate
|
9
|
+
# * (3) reference base, or `*' for an indel line
|
10
|
+
# * (4) genotype where heterozygotes are encoded in the IUB code: M=A/C, R=A/G, W=A/T, S=C/G, Y=C/T and K=G/T; indels are indicated by, for example, */+A, -A/* or +CC/-C. There is no difference between */+A or +A/*.
|
11
|
+
# * (5) Phred-scaled likelihood that the genotype is wrong, which is also called `consensus quality'.
|
12
|
+
# * (6) Phred-scaled likelihood that the genotype is identical to the reference, which is also called `SNP quality'. Suppose the reference base is A and in alignment we see 17 G and 3 A. We will get a low consensus quality because it is difficult to distinguish an A/G heterozygote from a G/G homozygote. We will get a high SNP quality, though, because the evidence of a SNP is very strong.
|
13
|
+
# * (7) root mean square (RMS) mapping quality
|
14
|
+
# * 8 # reads covering the position
|
15
|
+
# * 9 read bases at a SNP line (check the manual page for more information); the 1st indel allele otherwise
|
16
|
+
# * 10 base quality at a SNP line; the 2nd indel allele otherwise
|
17
|
+
# * (11) indel line only: # reads directly supporting the 1st indel allele
|
18
|
+
# * (12) indel line only: # reads directly supporting the 2nd indel allele
|
19
|
+
# * (13) indel line only: # reads supporting a third indel allele
|
20
|
+
# If pileup is invoked without `-c', indel lines and columns between 3 and 7 inclusive will not be outputted.
|
21
|
+
#
|
22
|
+
# NB mpileup uses the 6 column output format; the example below is a 10 column line from the deprecated pileup -c format, eg
|
23
|
+
# "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
|
24
|
+
# Pileup provides accessors for all columns (6 or 10 column format) and a few other useful methods
|
25
|
+
#
|
26
|
+
#
|
27
|
+
module Bio
|
28
|
+
class DB
|
29
|
+
class Pileup
  attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2

  #creates the Pileup object
  #    pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
  #    pile = Bio::DB::Pileup.new(pile_up_line)
  # Lines with 6 columns (mpileup) or 10 to 13 columns (deprecated pileup -c)
  # are parsed; any other column count leaves the fields unset rather than
  # raising.
  def initialize(pile_up_line)
    cols = pile_up_line.split(/\t/)
    @consensus = nil
    @consensus_quality = nil
    @read_quals = nil
    @bases = nil
    @allele_frequency = nil
    @consensus_iuap = {} # one cached consensus per requested ratio
    if cols.length == 6 ##should only be able to get 6 lines from mpileup
      @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
    elsif (10..13).include?(cols.length) ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
      if cols[2] == '*' #indel
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
      else #snp / identity
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
      end
      @consensus_quality = @consensus_quality.to_f
      @snp_quality = @snp_quality.to_f
      @rms_mapq = @rms_mapq.to_f
    end

    @pos = @pos.to_i
    @coverage = @coverage.to_f
    @ref_count = nil
    @non_ref_count_hash = nil
    @non_ref_count = nil
  end

  #Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts
  #returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
  def non_refs
    @non_ref_count_hash ||= {
      :A => read_bases.count("Aa"),
      :C => read_bases.count("Cc"),
      :G => read_bases.count("Gg"),
      :T => read_bases.count("Tt")
    }
  end

  # returns the total non-reference bases in the reads at this position
  def non_ref_count
    @non_ref_count ||= @read_bases.count("ATGCatgc").to_f
  end

  # returns the count of reference-bases in the reads at this position
  def ref_count
    @ref_count ||= read_bases.count(".,")
  end

  # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
  # When the line carried a consensus column, that value is returned as is.
  def consensus
    return @consensus unless @consensus.nil?
    @consensus =
      if (ref_count / coverage) > 0.5
        #the ref base is in more than half the coverage, so it is the consensus
        ref_base
      else
        counts = non_refs
        top = counts.values.max
        counts.select { |_base, count| count == top }.keys.map(&:to_s).sort.join
      end
  end

  #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
  def to_vcf
    alt, g = genotype_list
    alt = consensus.split(//).join(',') unless ref_base == '*'
    alt = '.' if alt == ref_base
    # alt is nil when no indel genotype could be derived
    alleles = alt.to_s.split(',')
    #if the reference base is in alt, remove it
    alleles.delete(ref_base.to_s)
    alt = alleles.empty? ? '.' : alleles.join(',')
    [ref_name, pos, '.', ref_base, alt, snp_quality.to_i, "0", "DP=#{coverage.to_i}", "GT:GQ:DP", "#{g}:#{consensus_quality.to_i}:#{coverage.to_i}"].join("\t")
  end

  # Header matching the lines produced by to_vcf.
  def self.vcf_header
    %{##fileformat=VCFv3.3\n##INFO=DP,1,Integer,"Total Depth"\n##FORMAT=GT,1,String,"Genotype"\n##FORMAT=GQ,1,Integer,"Genotype Quality"\n##FORMAT=DP,1,Integer,"Read Depth"\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA\n}
  end

  #identifies the reference base and returns the indel or snp genotype as applicable
  def genotype_list
    ref_base == '*' ? indel_gt : snp_gt
  end

  #returns the two bases for the corresponding iupac code
  def self.iupac_to_base(alt_base)
    case alt_base
    when 'K' then ['G', 'T']
    when 'M' then ['A', 'C']
    when 'S' then ['C', 'G']
    when 'R' then ['A', 'G']
    when 'W' then ['A', 'T']
    when 'Y' then ['C', 'T']
    else alt_base.split(//)
    end
  end

  #identifies if the indel is an insertion or a deletion
  # "-XYZ" gives "D" plus the number of deleted bases, "+XYZ" gives "I" plus
  # the inserted bases, anything else (including '*' and nil) gives nil.
  def parse_indel(alt)
    allele = alt.to_s
    return "D#{allele.length - 1}" if allele.start_with?("-")
    return "I#{allele[1..-1]}" if allele.start_with?("+")
    nil
  end

  #returns pileup format line
  def to_s
    if @read_quals and !@consensus_quality #6col
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
    elsif @indel_1 #13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
    else #10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
    end
  end

  # Counts per base including the reference base. Works on a copy of
  # non_refs so that the non-reference counts are not altered.
  def bases
    return @bases if @bases
    @bases = non_refs.dup
    @bases[ref_base.upcase.to_sym] = ref_count
    @bases
  end

  # Sum of the counts returned by bases.
  def base_coverage
    bases.values.inject(0) { |total, count| total + count }
  end

  #returns the frequency of all bases in pileup position
  def allele_freq
    return @allele_frequency if @allele_frequency
    total = base_coverage.to_f
    @allele_frequency = {}
    bases.each { |base, count| @allele_frequency[base] = count.to_f / total }
    @allele_frequency
  end

  # returns the IUPAC consensus of every base whose share of the coverage is
  # above the given ratio; when no base qualifies, the lower-case reference
  # base is returned. Results are cached per ratio.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    @consensus_iuap[minumum_ratio_for_iup_consensus] ||= begin
      called = []
      bases.each do |base, count|
        called << base[0].to_s if count / coverage.to_f > minumum_ratio_for_iup_consensus
      end
      if called.empty?
        ref_base.downcase
      else
        # creates alignment object and takes its IUPAC consensus
        Bio::Alignment.new(called.map { |b| Bio::Sequence::NA.new(b) }).consensus_iupac
      end
    end
  end

  private

  #returns the genotype of the indel as [alt, gt], nil when neither allele
  #is an indel, or "undef" when the consensus is an Array
  def indel_gt
    return "undef" if consensus.instance_of?(Array)
    al1, al2 = consensus.split(/\//)
    if al1 == al2 && al1 == '*'
      al1 = indel_1
      al2 = indel_2
    end
    alt1 = parse_indel(al1)
    alt2 = parse_indel(al2)

    return nil if !alt1 and !alt2
    if !alt1
      [alt2, '0/1']
    elsif !alt2
      [alt1, '0/1']
    elsif alt1 == alt2
      [alt1, '1/1']
    else
      ["#{alt1},#{alt2}", '1/2']
    end
  end

  #returns the genotype of the snp as [alt, gt]
  def snp_gt
    return ['.', '0/0'] if ref_base == consensus
    first, second = Pileup.iupac_to_base(consensus)
    if first == ref_base
      [second, '0/1']
    elsif second == ref_base
      [first, '0/1']
    else
      ["#{first},#{second}", '1/1']
    end
  end
end
|
272
|
+
end
|
273
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008-2009 Genome Research Ltd.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
1.6
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Bio
|
2
|
+
class DB
|
3
|
+
module SAM
|
4
|
+
module Library
  #IMPORTANT NOTE: Windows library is missing in this distribution

  # Return the path with the file name of the library for the specific operating system.
  # The path points into the 'external' directory next to this file. The
  # suffix is chosen from RUBY_PLATFORM, falling back to RUBY_DESCRIPTION for
  # JRuby; on an unrecognised platform the suffix is empty.
  def filename
    #TODO refactor this piece of code in all the files
    lib_os = case RUBY_PLATFORM
             when /linux/ then 'so.1'
             when /darwin/ then '1.dylib'
             # Windows rubies report mswin/mingw, never the word "windows"
             when /mswin|mingw|windows/ then 'dll'
             else
               case RUBY_DESCRIPTION
               when /jruby.*darwin/ then '1.dylib'
               when /jruby.*linux/ then 'so.1'
               end
             end

    File.join(File.expand_path(File.dirname(__FILE__)), 'external', "libbam.#{lib_os}")
  end #filename
  module_function :filename
end #Library
|
30
|
+
end #Sam
|
31
|
+
end #DB
|
32
|
+
end #Bio
|