bio-samtools-wrapper 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +27 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +702 -0
- data/README.md +501 -0
- data/Rakefile +73 -0
- data/VERSION +1 -0
- data/bin/bam_consensus.rb +85 -0
- data/bio-samtools-wrapper.gemspec +181 -0
- data/doc/Bio/DB/Alignment.html +552 -0
- data/doc/Bio/DB/Pileup.html +711 -0
- data/doc/Bio/DB/SAM/Library.html +167 -0
- data/doc/Bio/DB/SAM/Tools.html +109 -0
- data/doc/Bio/DB/SAM.html +1853 -0
- data/doc/Bio/DB/Tag.html +208 -0
- data/doc/Bio/DB/Vcf.html +431 -0
- data/doc/Bio/DB.html +105 -0
- data/doc/Bio.html +175 -0
- data/doc/LICENSE_txt.html +846 -0
- data/doc/created.rid +9 -0
- data/doc/fonts/Lato-Light.ttf +0 -0
- data/doc/fonts/Lato-LightItalic.ttf +0 -0
- data/doc/fonts/Lato-Regular.ttf +0 -0
- data/doc/fonts/Lato-RegularItalic.ttf +0 -0
- data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
- data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
- data/doc/fonts.css +167 -0
- data/doc/images/add.png +0 -0
- data/doc/images/arrow_up.png +0 -0
- data/doc/images/brick.png +0 -0
- data/doc/images/brick_link.png +0 -0
- data/doc/images/bug.png +0 -0
- data/doc/images/bullet_black.png +0 -0
- data/doc/images/bullet_toggle_minus.png +0 -0
- data/doc/images/bullet_toggle_plus.png +0 -0
- data/doc/images/date.png +0 -0
- data/doc/images/delete.png +0 -0
- data/doc/images/find.png +0 -0
- data/doc/images/loadingAnimation.gif +0 -0
- data/doc/images/macFFBgHack.png +0 -0
- data/doc/images/package.png +0 -0
- data/doc/images/page_green.png +0 -0
- data/doc/images/page_white_text.png +0 -0
- data/doc/images/page_white_width.png +0 -0
- data/doc/images/plugin.png +0 -0
- data/doc/images/ruby.png +0 -0
- data/doc/images/tag_blue.png +0 -0
- data/doc/images/tag_green.png +0 -0
- data/doc/images/transparent.png +0 -0
- data/doc/images/wrench.png +0 -0
- data/doc/images/wrench_orange.png +0 -0
- data/doc/images/zoom.png +0 -0
- data/doc/index.html +106 -0
- data/doc/js/darkfish.js +140 -0
- data/doc/js/jquery.js +18 -0
- data/doc/js/navigation.js +142 -0
- data/doc/js/search.js +109 -0
- data/doc/js/search_index.js +1 -0
- data/doc/js/searcher.js +228 -0
- data/doc/rdoc.css +580 -0
- data/doc/table_of_contents.html +305 -0
- data/ext/Makefile-bioruby.patch +12 -0
- data/ext/Makefile-suse.patch +11 -0
- data/ext/mkrf_conf.rb +118 -0
- data/lib/bio/BIOExtensions.rb +89 -0
- data/lib/bio/db/alignment.rb +64 -0
- data/lib/bio/db/fastadb.rb +320 -0
- data/lib/bio/db/pileup.rb +273 -0
- data/lib/bio/db/sam/external/COPYING +21 -0
- data/lib/bio/db/sam/external/VERSION +1 -0
- data/lib/bio/db/sam/library.rb +32 -0
- data/lib/bio/db/sam.rb +778 -0
- data/lib/bio/db/vcf.rb +105 -0
- data/lib/bio-samtools-wrapper.rb +9 -0
- data/test/.gitignore +1 -0
- data/test/helper.rb +18 -0
- data/test/sample.vcf +24 -0
- data/test/samples/.gitignore +1 -0
- data/test/samples/LCI/NC_001988.ffn +2 -0
- data/test/samples/LCI/test.bam +0 -0
- data/test/samples/LCI/test.bam.bai +0 -0
- data/test/samples/small/dupes.bam +0 -0
- data/test/samples/small/dupes.sam +274 -0
- data/test/samples/small/ids2.txt +1 -0
- data/test/samples/small/map_for_reheader.sam +8 -0
- data/test/samples/small/map_to_merge1.bam +0 -0
- data/test/samples/small/map_to_merge1.bam.bai +0 -0
- data/test/samples/small/map_to_merge1.sam +8 -0
- data/test/samples/small/map_to_merge2.bam +0 -0
- data/test/samples/small/map_to_merge2.bam.bai +0 -0
- data/test/samples/small/map_to_merge2.sam +8 -0
- data/test/samples/small/no_md.sam +8 -0
- data/test/samples/small/sorted.bam +0 -0
- data/test/samples/small/sorted.bam.bai +0 -0
- data/test/samples/small/test.sai +0 -0
- data/test/samples/small/test.tam +10 -0
- data/test/samples/small/test_chr.fasta +1000 -0
- data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.amb +2 -0
- data/test/samples/small/test_chr.fasta.ann +3 -0
- data/test/samples/small/test_chr.fasta.bwt +0 -0
- data/test/samples/small/test_chr.fasta.pac +0 -0
- data/test/samples/small/test_chr.fasta.rbwt +0 -0
- data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
- data/test/samples/small/test_chr.fasta.rpac +0 -0
- data/test/samples/small/test_chr.fasta.rsa +0 -0
- data/test/samples/small/test_chr.fasta.sa +0 -0
- data/test/samples/small/test_cov.svg +273 -0
- data/test/samples/small/test_fastadb.fasta +34 -0
- data/test/samples/small/testu.bam +0 -0
- data/test/samples/small/testu.bed +2 -0
- data/test/test_bio-samtools-wrapper.rb +1 -0
- data/test/test_fastadb.rb +89 -0
- data/test/test_pileup.rb +90 -0
- data/test/test_sam.rb +421 -0
- data/test/test_vcf.rb +79 -0
- data/tutorial/tutorial.html +474 -0
- data/tutorial/tutorial.md +424 -0
- data/tutorial/tutorial.pdf +0 -0
- metadata +254 -0
data/lib/bio/db/sam.rb
ADDED
@@ -0,0 +1,778 @@
|
|
1
|
+
module Bio
|
2
|
+
class DB
|
3
|
+
class Sam
|
4
|
+
|
5
|
+
class SamException < StandardError; end
|
6
|
+
|
7
|
+
attr_accessor :bam, :fasta, :samtools, :bcftools, :last_command
|
8
|
+
attr_accessor :minumum_ratio_for_iup_consensus
|
9
|
+
attr_reader :cached_regions
|
10
|
+
#attr_accessor :pileup_cache
|
11
|
+
@minumum_ratio_for_iup_consensus = 0.20
|
12
|
+
BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
|
13
|
+
|
14
|
+
#Creates a new Bio::DB::Sam object
|
15
|
+
#* fasta [String] - the path to the Fasta reference sequence
|
16
|
+
#* bam [String] - path to bam files
|
17
|
+
#* samtools [String] - path to alternative installation of samtools
|
18
|
+
#* bcftools [String] - path to alternative installation of bcftools
|
19
|
+
#* returns [Bio::DB::Sam] a new `Bio::DB::Sam` object
|
20
|
+
# Stores the reference/BAM paths and the locations of the samtools and
# bcftools executables (defaulting to the copies under sam/external next to
# this file), then validates the inputs.
# Raises ArgumentError when :fasta or :bam is missing and IOError when
# files_ok? (defined outside this view) reports a problem.
def initialize(args)
  @fasta = args[:fasta]
  @bam = args[:bam]
  @bams = nil
  @sam = nil
  @files = nil
  @cached_regions = nil
  @stats = nil
  external_dir = File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external')
  @samtools = args[:samtools] || File.join(external_dir, 'samtools')
  @bcftools = args[:bcftools] || File.join(external_dir, 'bcftools')
  @last_command = nil

  raise ArgumentError, "Need Fasta and at least one BAM or SAM" if not @fasta or not @bam
  # @files is never populated, so name the paths that were actually supplied
  # instead of interpolating nil into the message.
  raise IOError, "File not found #{[@fasta, @bam].flatten.inspect}" unless files_ok?
end
|
40
|
+
|
41
|
+
#backward compatibility method, returns true if file exists otherwise, complains and quits.
|
42
|
+
# Backward-compatibility entry point: hands back whatever files_ok? reports.
def open
  return files_ok?
end
|
45
|
+
|
46
|
+
#runs the samtools view command
|
47
|
+
#* b - output BAM
|
48
|
+
#* h - print header for the SAM output
|
49
|
+
#* H - print header only (no alignments)
|
50
|
+
#* S - input is SAM
|
51
|
+
#* u - uncompressed BAM output (force -b)
|
52
|
+
#* one - fast compression (force -b)
|
53
|
+
#* x - output FLAG in HEX (samtools-C specific)
|
54
|
+
#* X - output FLAG in string (samtools-C specific)
|
55
|
+
#* c - print only the count of matching records
|
56
|
+
#* B - collapse the backward CIGAR operation
|
57
|
+
#* at - INT number of BAM compression threads [0]
|
58
|
+
#* L - FILE output alignments overlapping the input BED FILE [null]
|
59
|
+
#* t - FILE list of reference names and lengths (force -S) [null]
|
60
|
+
#* T - FILE reference sequence file (force -S) [null]
|
61
|
+
#* o - FILE output file name [stdout]
|
62
|
+
#* R - FILE list of read groups to be outputted [null]
|
63
|
+
#* f - INT required flag 0 for unset [0]
|
64
|
+
#* F - INT filtering flag 0 for unset [0]
|
65
|
+
#* q - INT minimum mapping quality [0]
|
66
|
+
#* l - STR only output reads in library STR [null]
|
67
|
+
#* r - STR only output reads in read group STR [null]
|
68
|
+
#* s - FLOAT fraction of templates to subsample; integer part as seed [-1]
|
69
|
+
#* chr - name of reference sequence to get alignments from
|
70
|
+
#* start - start position on reference sequence
|
71
|
+
#* stop - end position on reference sequence
|
72
|
+
# Runs `samtools view` and streams the result through yield_from_pipe.
# * opts  - samtools view options; :chr, :start and :stop together select a
#           region, :at maps to -@ and :one maps to -1
# * block - receives each record (raw String chunks for binary output,
#           Bio::DB::Alignment otherwise)
# Raises SamException when the requested :chr is not in the BAM index.
# Works on a private copy of opts so the caller's hash is never modified.
def view(opts={}, &block)
  opts = opts.dup
  region = ""
  if opts[:chr] and opts[:start] and opts[:stop]
    unless has_entry?(opts[:chr])
      raise SamException.new, "[view] The sequence #{opts[:chr]} is not in the bam file"
    end
    region = "#{opts[:chr]}:#{opts[:start]}-#{opts[:stop]}"
    [:chr, :start, :stop].each { |key| opts.delete(key) }
  end

  # -@ and -1 cannot be written as bare symbols, so they travel as :at / :one
  opts['@'] = opts.delete(:at) if opts[:at]
  opts['1'] = opts.delete(:one) if opts[:one]

  command = form_opt_string(@samtools, 'view', opts, [:b, :h, :H, :S, :u, '1', :x, :X, :c, :B])
  command = command + " \"#{region}\"" unless region.empty?
  @last_command = command

  # BAM output (-u / -b) is binary and must not be parsed as alignments
  type = (opts[:u] or opts[:b]) ? :binary : :text
  klass = (type == :binary) ? String : Bio::DB::Alignment
  yield_from_pipe(command, klass, type, &block)
end
|
97
|
+
|
98
|
+
#fetches a subsequence and calls code block
|
99
|
+
#* chr - the reference name for the subsequence
|
100
|
+
#* start - the start position for the subsequence
|
101
|
+
#* stop - the stop position for the subsequence
|
102
|
+
#* &block - the block of code to execute
|
103
|
+
# Convenience wrapper: runs view restricted to chr:start-stop and forwards the block.
def fetch(chr, start, stop, &block)
  window = { :chr => chr, :start => start, :stop => stop }
  view(window, &block)
end
|
112
|
+
|
113
|
+
alias_method :fetch_with_function, :fetch
|
114
|
+
|
115
|
+
#returns an array of coverage for each location for which there are mapped reads
|
116
|
+
#* chr - the reference name
|
117
|
+
#* start - the start position
|
118
|
+
#* length - the length of the region queried
|
119
|
+
# Collects the coverage value of every pileup record that mpileup yields
# for the region chr:start-(start+length).
def chromosome_coverage(chr, start, length)
  last = start + length
  depths = []
  self.mpileup(:r => "#{chr}:#{start}-#{last}") { |pile| depths << pile.coverage }
  depths
end
|
127
|
+
|
128
|
+
|
129
|
+
#returns an svg file or object, plotting coverage for each location for which there are mapped reads
|
130
|
+
#* chr - the reference name
|
131
|
+
#* start - the start position
|
132
|
+
#* length - the length of the region queried
|
133
|
+
#OPTIONS
|
134
|
+
#* bin - the amount of bins to split the histogram into. The arithmetic mean score for each bin will be plotted. [default 30 bins]
|
135
|
+
#* svg - a file to write the svg image to [default a String object containing the SVG]
|
136
|
+
# Draws a coverage histogram for chr:start-(start+length) with Bio::Graphics.
# * chr, start, length - region; a nil value falls back to opts[:chr],
#                        opts[:start] or opts[:length]
# * opts[:bin]         - number of bins (default 30); the mean coverage of
#                        each bin is plotted
# * opts[:svg]         - file to write the SVG to; without it the markup is returned
# Any other opts override the default track options.
def plot_coverage(chr, start, length, opts={})
  chr = opts[:chr] if chr.nil?
  start = opts[:start] if start.nil?
  length = opts[:length] if length.nil?

  # Integer division gives 0 when the region is shorter than the number of
  # bins, and each_slice(0) raises ArgumentError, so use at least one
  # position per bin.
  bin = [length / (opts[:bin] || 30), 1].max

  coverage = chromosome_coverage(chr, start, length)

  page = Bio::Graphics::Page.new(:width => 1000,
                                 :height => 200,
                                 :number_of_intervals => 10,
                                 :font_size => 14)
  default_options = {:glyph => :histogram,
                     :stroke => 'black',
                     :fill_color => 'gold',
                     :track_height => 150,
                     :name => 'read coverage',
                     :label => true,
                     :stroke_width => '1',
                     :x_round => 1,
                     :y_round => 1 }
  opts = default_options.merge(opts)

  data_track = page.add_track(opts)
  offset = 0
  coverage.each_slice(bin) do |slice|
    mean = slice.inject { |sum, x| sum + x }.to_f / slice.size
    data_track.add(Bio::Graphics::MiniFeature.new(:start => start + offset,
                                                  :end => (start + offset + bin),
                                                  :segment_height => mean))
    offset += bin
  end

  if opts[:svg]
    page.write(opts[:svg].to_s)
  else
    page.get_markup
  end
end
|
185
|
+
|
186
|
+
#returns the average coverage over the region queried
|
187
|
+
#* chr - the reference name
|
188
|
+
#* start - the start position
|
189
|
+
#* length - the length of the region queried
|
190
|
+
# Mean of the values returned by chromosome_coverage for the region.
# Returns 0.0 when no coverage values come back (previously this produced NaN
# from dividing by an empty array's size).
def average_coverage(chr, start, length)
  depths = chromosome_coverage(chr, start, length)
  return 0.0 if depths.empty?
  depths.inject { |sum, el| sum + el }.to_f / depths.size
end
|
194
|
+
|
195
|
+
#returns a Bio::DB::Pileup or Bio::DB::VCF object
|
196
|
+
#* region - Only generate pileup in region [chrom:start-stop]
|
197
|
+
#* illumina_quals - Assume the quality is in the Illumina 1.3+ encoding
|
198
|
+
#* count_anomalous - Do not skip anomalous read pairs in variant calling
|
199
|
+
#* no_baq - Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments.
|
200
|
+
#* adjust_mapq - [INT] Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0]
|
201
|
+
#* max_per_bam_depth - [INT] At a position, read maximally INT reads per input BAM. [250]
|
202
|
+
#* extended_baq - Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt specificity a little bit.
|
203
|
+
#* exclude_reads_file - [FILE] exclude read groups listed in FILE [null]
|
204
|
+
#* list_of_positions - [FILE] BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null]
|
205
|
+
#* mapping_quality_cap - [INT] cap mapping quality at INT [60]
|
206
|
+
#* ignore_rg - ignore read group tags
|
207
|
+
#* min_mapping_quality - [INT] skip alignments with mapQ smaller than INT [0]
|
208
|
+
#* min_base_quality - [INT] skip bases with baseQ/BAQ smaller than INT [13]
|
209
|
+
#* ##following options are for the -g -u option
|
210
|
+
#* genotype_calling - generate BCF output (genotype likelihoods)
|
211
|
+
#* uncompressed_bcf - generate uncompress BCF output
|
212
|
+
#* extension_sequencing_probability - [INT] Phred-scaled gap extension seq error probability [20]
|
213
|
+
#* homopolymer_error_coefficient - [INT] coefficient for homopolymer errors [100]
|
214
|
+
#* no_indels - do not perform indel calling
|
215
|
+
#* skip_indel_over_average_depth - [INT] max per-sample depth for INDEL calling [250]
|
216
|
+
#* gap_open_sequencing_error_probability - [INT] Phred-scaled gap open sequencing error probability [40]
|
217
|
+
#* platforms - [STRING] comma separated list of platforms for indels [all]
|
218
|
+
# Runs `samtools mpileup` (piped through `bcftools view -cg -` when -u/-g is
# requested) and yields Bio::DB::Pileup or Bio::DB::Vcf records.
# Long option names are translated to samtools' one-letter forms; anything
# that is not a known option is dropped.
def mpileup(opts={}, &block)
  # long option name => samtools short option
  long_opts = {
    :region => :r,
    :illumina_quals => :six,
    :count_anomalous => :A,
    :no_baq => :B,
    :adjust_mapq => :C,
    :max_per_bam_depth => :d,
    :extended_baq => :E,
    :exclude_reads_file => :G,
    :list_of_positions => :l,
    :mapping_quality_cap => :M,
    :ignore_rg => :R,
    :min_mapping_quality => :q,
    :min_base_quality => :Q,
    # the remaining options belong to the -g / -u modes
    :genotype_calling => :g,
    :uncompressed_bcf => :u,
    :extension_sequencing_probability => :e,
    :homopolymer_error_coefficient => :h,
    :no_indels => :I,
    :skip_indel_over_average_depth => :L,
    :gap_open_sequencing_error_probability => :o,
    :platforms => :P
  }

  # rewrite long names to their short form
  translated = opts.dup
  opts.each_pair do |name, value|
    short = long_opts[name]
    next unless short
    translated[short] = value
    translated.delete(name)
  end

  # keep only recognised short options, in the order of the table above
  opts = {}
  long_opts.each_value do |short|
    opts[short] = translated[short] if translated.has_key?(short)
  end

  opts[:u] = true if opts[:g] # so that we always get uncompressed output
  opts.delete(:g)

  opts[:f] = @fasta

  query = opts[:r].to_s
  query = opts[:r].to_region.to_s if opts[:r].respond_to?(:to_region)
  unless query.nil? or query.size == 0
    raise SamException.new(), "The sequence #{query} is not in the bam file" unless has_region? query
  end
  opts[:r] = query

  if opts[:six]
    opts["6"] = nil
    opts.delete(:six)
  end

  command = form_opt_string(@samtools, "mpileup", opts, [:R, :B, :E, "6", :A, :g, :u, :I])
  puts "Running: #{command}" if $DEBUG
  command = command + " | #{@bcftools} view -cg -" if opts[:u]

  klass = opts[:u] ? Bio::DB::Vcf : Bio::DB::Pileup
  @last_command = command
  yield_from_pipe(command, klass, :text, &block)
end
|
289
|
+
|
290
|
+
#fetches a subsequence from a reference genome and option returns it as a Bio::Sequence::NA object
|
291
|
+
#* chr - [STRING] the reference name for the subsequence
|
292
|
+
#* start - [INT] the start position for the subsequence
|
293
|
+
#* stop - [INT] the stop position for the subsequence
|
294
|
+
#* as_bio - boolean stating if the returned object should be a Bio::Sequence::NA object
|
295
|
+
# Returns the reference bases for chr:start-stop via `samtools faidx`.
# * chr, start, stop - region on the reference
# * opts[:as_bio]    - when true, the sequence is wrapped in
#                      Bio::Sequence::NA and returned via to_fasta
# Raises SamException when chr is not in the BAM index.
def fetch_reference(chr, start, stop, opts={:as_bio => false})
  raise SamException.new(), "The sequence #{chr} is not in the bam file" unless has_entry? chr
  seq = "".dup
  if @fasta
    command = "#{@samtools} faidx \"#{@fasta}\" '#{chr}:#{start}-#{stop}'"
    puts "Running: #{command}" if $DEBUG
    @last_command = command
    # skip the FASTA header line; append in place instead of reallocating per line
    yield_from_pipe(command, String, :text) { |line| seq << line unless line =~ /^>/ }
  else
    # Without a reference, return Ns. start-stop is queried as an inclusive
    # range, so it spans stop - start + 1 bases.
    seq = "n" * (stop - start + 1)
  end

  if opts[:as_bio]
    seq = Bio::Sequence::NA.new(seq).to_fasta("#{chr}:#{start}-#{stop}")
  end
  seq
end
|
313
|
+
|
314
|
+
#Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk. If regions are speficified, the subsequences will be retrieved and printed to stdout in the FASTA format.
|
315
|
+
#Options - if a subsequence is required
|
316
|
+
#* chr - [STRING] the reference name of the subsequence
|
317
|
+
#* start - [INT] the start position for the subsequence
|
318
|
+
#* stop - [INT] the stop position for the subsequence
|
319
|
+
# With :chr, :start and :stop in opts, returns that subsequence through
# fetch_reference; otherwise runs `samtools faidx` on the reference FASTA
# and returns the result of Kernel#system.
def faidx(opts={})
  if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
    # pass the requested coordinates, not the literal symbols :chr/:start/:stop
    fetch_reference(opts[:chr], opts[:start], opts[:stop], :as_bio => false)
  else
    command = "#{@samtools} faidx \"#{@fasta}\""
    @last_command = command
    system(command)
  end
end
|
329
|
+
|
330
|
+
#Index sorted alignment for fast random access. Index file <aln.bam>.bai will be created if no out_index is provided.
|
331
|
+
#* out_index - [STRING] name of index
|
332
|
+
# Runs `samtools index` on the BAM file; opts[:out_index] optionally names the index file.
def index(opts={})
  pieces = [@samtools, "index", "\"#{@bam}\"", opts[:out_index]]
  command = pieces.join(' ')
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
338
|
+
|
339
|
+
#Fill in mate coordinates, ISIZE and mate related flags from a name-sorted alignment
|
340
|
+
#* out_bam name of outfile
|
341
|
+
#* r - remove unmapped reads and secondary alignments
|
342
|
+
# Runs `samtools fixmate` on the BAM file, writing to opts[:out_bam];
# opts[:r] adds the -r switch.
def fix_mates(opts={})
  remove_reads = opts[:r] ? "-r" : ""
  command = "#{@samtools} fixmate #{remove_reads} \"#{@bam}\" #{opts[:out_bam]}"
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
353
|
+
|
354
|
+
alias_method :fixmate, :fix_mates
|
355
|
+
|
356
|
+
#generate simple stats with regard to the number and pairing of reads mapped to a reference
|
357
|
+
# Runs `samtools flagstat` and returns its output as an array of chomped lines.
def flag_stats(opts={})
  command = form_opt_string(@samtools, "flagstat", opts, [])
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  report = []
  yield_from_pipe(command, String) do |line|
    report.push(line.chomp)
  end
  report
end
|
365
|
+
|
366
|
+
alias_method :flagstat, :flag_stats
|
367
|
+
|
368
|
+
#Retrieve and print stats in the index file. The output is TAB delimited with each line consisting of reference sequence name, sequence length, number of mapped reads and number unmapped reads.
|
369
|
+
# Runs `samtools idxstats` once and memoises the result in @stats as
# { name => {:length, :mapped_reads, :unmapped_reads} }.
def index_stats
  return @stats if @stats
  command = form_opt_string(@samtools, "idxstats", {}, [])
  @last_command = command
  per_reference = {}
  yield_from_pipe(command, String, :text, true, "#") do |line|
    name, length, mapped, unmapped = line.chomp.split(/\t/)
    per_reference[name] = {
      :length => length.to_i,
      :mapped_reads => mapped.to_i,
      :unmapped_reads => unmapped.to_i
    }
  end
  @stats = per_reference
  @stats
end
|
382
|
+
|
383
|
+
alias_method :idxstats, :index_stats
|
384
|
+
|
385
|
+
#Retrieves a hash of all the regions, keyed by region id; when a block is given, it is also called with each region
|
386
|
+
# Builds (and caches in @regions) one full-length forward Region per entry
# of index_stats. Yields each region when a block is given; returns the hash.
def each_region
  index_stats
  return @regions if @regions and not block_given?
  @regions ||= Hash.new
  index_stats.each do |name, info|
    reg = Bio::DB::Fasta::Region.new
    reg.entry = name
    reg.start = 1
    reg.end = info[:length]
    reg.orientation = :forward
    @regions[name] ||= reg
    yield reg if block_given?
  end
  @regions
end
|
404
|
+
|
405
|
+
#Tells if the bam file contains the entry. It has to be indexed.
|
406
|
+
# True when the index statistics list the given reference name.
def has_entry?(entry)
  known = index_stats
  known.has_key?(entry)
end
|
411
|
+
|
412
|
+
# True when the region's entry exists in the index and its coordinates lie
# within 1..length of that entry; false otherwise.
# * region - region string understood by Bio::DB::Fasta::Region.parse_region
def has_region?(region)
  stats = index_stats
  reg = Bio::DB::Fasta::Region::parse_region(region)
  # 0 is truthy in Ruby, so an unknown entry must be reported with false
  return false unless has_entry?(reg.entry)
  len = stats[reg.entry][:length]
  reg.start > 0 && reg.end <= len
end
|
419
|
+
|
420
|
+
#Merge multiple sorted alignments
|
421
|
+
#* n - sort by read names
|
422
|
+
#* r - attach RG tag (inferred from file names)
|
423
|
+
#* u - uncompressed BAM output
|
424
|
+
#* f - overwrite the output BAM if exist
|
425
|
+
#* one - compress level 1
|
426
|
+
#* l - [INT] compression level, from 0 to 9 [-1]
|
427
|
+
#* at - [INT] number of BAM compression threads [0]
|
428
|
+
#* R - [STRING] merge file in the specified region STR [all]
|
429
|
+
#* h - [FILE] copy the header in FILE to <out.bam> [in1.bam]
|
430
|
+
#* out - [FILE] out file name
|
431
|
+
#* bams - [FILES] or Bio::DB::Sam list of input bams, or Bio::DB::Sam objects
|
432
|
+
# Merges several sorted BAM files with `samtools merge`.
# * opts[:bams] - list of BAM file names and/or objects responding to #bam
# * opts[:out]  - output file name
# * :one / :at  - mapped to samtools' -1 and -@ options
# Remaining opts are rendered by commandify. Works on a copy of opts so the
# caller's hash is not modified. Returns the result of Kernel#system.
# Raises ArgumentError when opts[:bams] is missing.
def merge(opts={})
  opts = opts.dup
  if opts[:one]
    opts['1'] = nil
    opts.delete(:one)
  end

  if opts[:at]
    opts['@'] = opts[:at]
    opts.delete(:at)
  end

  out = opts.delete(:out)
  inputs = opts.delete(:bams)
  raise ArgumentError, "merge needs a list of input BAMs in opts[:bams]" if inputs.nil?

  # accept plain paths as well as objects exposing #bam, without rescuing
  # unrelated errors raised inside #bam
  bam_list = inputs.collect { |b| b.respond_to?(:bam) ? b.bam : b }.join(' ')

  options = commandify(opts, [:n, :r, :u, :f, '1'])
  command = "#{@samtools} merge #{options} #{out} #{bam_list}"

  @last_command = command
  puts "Running: #{command}" if $DEBUG
  system(command)
end
|
459
|
+
|
460
|
+
#Concatenate BAMs. The sequence dictionary of each input BAM must be identical.
|
461
|
+
#* h - header.sam
|
462
|
+
#* out -[FILE] out file name
|
463
|
+
#* bams -[FILES] or Bio::DB::Sam list of input bams, or Bio::DB::Sam objects
|
464
|
+
# Concatenates BAM files with `samtools cat`.
# * opts[:bams] - list of BAM file names and/or objects responding to #bam
# * opts[:out]  - output file name (passed as -o)
# * opts[:h]    - rendered by commandify
# Works on a copy of opts so the caller's hash is not modified. Returns the
# result of Kernel#system. Raises ArgumentError when opts[:bams] is missing.
def cat(opts={})
  opts = opts.dup
  # :out was never read from opts before, leaving `out` undefined below
  out = opts.delete(:out)
  inputs = opts.delete(:bams)
  raise ArgumentError, "cat needs a list of input BAMs in opts[:bams]" if inputs.nil?

  bam_list = inputs.collect { |b| b.respond_to?(:bam) ? b.bam : b }.join(' ')
  options = commandify(opts, [:h])
  command = "#{@samtools} cat #{options} -o #{out} #{bam_list}"
  puts command if $DEBUG
  @last_command = command
  system(command)
end
|
476
|
+
|
477
|
+
#* program - one of 'samtools' 'bcftools'
|
478
|
+
#* command - one of the commands relevant to the program
|
479
|
+
# Returns the text printed by `<program> <command>`.
# * program - 'samtools' or 'bcftools'; anything else returns an
#             explanatory message instead of running a command
# * command - sub-command whose help text is wanted
# NOTE(review): stdout and stderr are captured together because these tools
# are expected to print usage text on stderr when called without arguments —
# confirm against the installed versions.
def self.docs(program, command)
  return "program must be 'samtools' or 'bcftools'" unless ['samtools', 'bcftools'].include?(program)
  require 'open3'
  output, _status = Open3.capture2e("#{program} #{command}")
  output
end
|
484
|
+
|
485
|
+
#Remove potential PCR duplicates: if multiple read pairs have identical external coordinates, only retain the pair with highest mapping quality.
|
486
|
+
#* s - rmdup for SE reads
|
487
|
+
#* S - treat PE reads as SE in rmdup (force -s)
|
488
|
+
#* out - [FILE] output bam
|
489
|
+
# Runs `samtools rmdup`; opts[:out] names the output BAM and is removed from
# opts before the remaining options are rendered.
def remove_duplicates(opts={})
  out = opts[:out]
  opts.delete(:out)
  base = form_opt_string(@samtools, "rmdup", opts, [:s, :S])
  command = "#{base} #{out} \"#{@bam}\""
  @last_command = command
  system(command)
end
|
496
|
+
|
497
|
+
alias_method :rmdup, :remove_duplicates
|
498
|
+
|
499
|
+
#Sort alignments by leftmost coordinates
|
500
|
+
#* n - sort by read name
|
501
|
+
#* f - use <out.prefix> as full file name instead of prefix
|
502
|
+
#* o - final output to stdout returns bio::db::alignment depreciated (samtools-1.x saves to a file)
|
503
|
+
#* l - [INT] compression level, from 0 to 9 [-1]
|
504
|
+
#* at - [INT] number of sorting and compression threads [1]
|
505
|
+
#* m - [INT] max memory per thread; suffix K/M/G recognized [768M]
|
506
|
+
#* prefix - [STRING] prefix for output bamfile (for legacy, becomes "o" to use in samtools-1.x)
|
507
|
+
# Runs `samtools sort`. The legacy :prefix option becomes -o "<prefix>.bam";
# without it the output name is "sorted".
def sort(opts={})
  if opts.has_key?(:prefix)
    bam_name = opts[:prefix] + ".bam"
    opts[:prefix] = bam_name
    opts[:o] = bam_name
  else
    opts[:o] = "sorted"
  end
  opts.delete(:prefix)

  command = form_opt_string(@samtools, "sort", opts, [:n, :f]) + " "
  @last_command = command
  puts "Running: #{command}" if $DEBUG
  system(command)
end
|
525
|
+
|
526
|
+
#used to generate a text alignment viewer
|
527
|
+
#* d - display, output as (H)tml or (C)urses or (T)ext
|
528
|
+
#* p - [chr:pos] go directly to this position
|
529
|
+
#* s - [STR] display only reads from this sample or group
|
530
|
+
# Runs `samtools tview`. The :d, :p and :s options are re-keyed as strings
# before the option string is built.
def tview(opts={})
  [:d, :p, :s].each do |key|
    next unless opts[key]
    opts[key.to_s] = opts[key]
    opts.delete(key)
  end
  command = "#{form_opt_string(@samtools, "tview", opts)}"
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
548
|
+
|
549
|
+
#Replace the header of the current bam file with the header in header_sam
|
550
|
+
#* header_sam - the sam file from which the new header will be taken
|
551
|
+
#* out - [FILE] output bam file
|
552
|
+
# Runs `samtools reheader` with header_sam against the BAM file; when
# opts has :out, the output is redirected into that file.
def reheader(header_sam, opts={})
  command = "#{@samtools} reheader #{header_sam} \"#{@bam}\""
  command = "#{command} > #{opts[:out]}" if opts.has_key?(:out)
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
563
|
+
|
564
|
+
#Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing tag. Output SAM by default.
|
565
|
+
#* A - When used jointly with -r this option overwrites the original base quality.
|
566
|
+
#* e - Convert a the read base to = if it is identical to the aligned reference base. Indel caller does not support the = bases at the moment.
|
567
|
+
#* u - Output uncompressed BAM
|
568
|
+
#* b - Output compressed BAM
|
569
|
+
#* S - The input is SAM with header lines
|
570
|
+
#* C - [INT] Coefficient to cap mapping quality of poorly mapped reads. See the pileup command for details. [0]
|
571
|
+
#* r - Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
|
572
|
+
#* E - Extended BAQ calculation. This option trades specificity for sensitivity, though the effect is minor.
|
573
|
+
# Runs `samtools calmd` against the reference FASTA and passes the output to
# yield_from_pipe as Bio::DB::Alignment records.
# * opts  - calmd options; :A, :E, :e, :u, :b, :S and :r are switches
#           without a value, :C takes an integer
# * block - receives each record
def calmd(opts={}, &block)
  # :A is documented as a plain switch, so it belongs in the no-argument list
  flags = [:A, :E, :e, :u, :b, :S, :r]
  command = form_opt_string(@samtools, "calmd", opts, flags) + " " + @fasta
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  type = :text
  klass = Bio::DB::Alignment
  yield_from_pipe(command, klass, type, true, "@", &block)
end
|
581
|
+
|
582
|
+
#Identifies target regions by examining the continuity of read depth, computes haploid consensus sequences of targets and outputs a SAM with each sequence corresponding to a target. When option -f is in use, BAQ will be applied.
|
583
|
+
#* Q - [INT] Minimum base quality for a base to be considered [13]
|
584
|
+
#* i - in penalty
|
585
|
+
#* 0 - em0
|
586
|
+
#* 1 - em1
|
587
|
+
#* 2 - em2
|
588
|
+
#* f - reference
|
589
|
+
# Runs `samtools targetcut`. A truthy opts[:f] is replaced by 'f' => @fasta
# so the reference file is passed to -f. Works on a copy of opts; returns
# the result of Kernel#system.
def targetcut(opts={})
  opts = opts.dup
  if opts[:f]
    opts['f'] = @fasta
    # drop the original :f (not :s) so the option is not rendered twice
    opts.delete(:f)
  end

  command = "#{form_opt_string(@samtools, "targetcut", opts, [])}"
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
600
|
+
|
601
|
+
#Call and phase heterozygous SNPs
|
602
|
+
#* A - Drop reads with ambiguous phase.
|
603
|
+
#* b - [STR] Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file STR.0.bam and phase-1 reads in STR.1.bam. Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads with switch errors will be saved in STR.chimeric.bam. [null]
|
604
|
+
#* F - Do not attempt to fix chimeric reads.
|
605
|
+
#* k - [INT] Maximum length for local phasing. [13]
|
606
|
+
#* q - [INT] Minimum Phred-scaled LOD to call a heterozygote. [40]
|
607
|
+
#* Q - [INT] Minimum base quality to be used in het calling. [13]
|
608
|
+
# Runs `samtools phase`; :A and :F are rendered as switches without a value.
def phase(opts={})
  switches = [:A, :F]
  command = "#{form_opt_string(@samtools, "phase", opts, switches)}"
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
614
|
+
|
615
|
+
|
616
|
+
#runs samtools depth via Kernel#system (the method returns system's result, not an array); each output line is expected to hold sequence_name, position, depth
|
617
|
+
#* b - list of positions or regions in BED format
|
618
|
+
#* l - [INT] minQLen
|
619
|
+
#* q - [INT] base quality threshold
|
620
|
+
#* Q - [INT] mapping quality threshold
|
621
|
+
#* r - [chr:from-to] region
|
622
|
+
def depth(opts={})
  # Runs `samtools depth` on the BAM file with the given options.
  # The command is executed with Kernel#system, so this method returns
  # system's result rather than parsed depth records.
  command = form_opt_string(@samtools, "depth", opts)
  # Same debug trace as the other command runners in this file.
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
627
|
+
|
628
|
+
#Returns the pileup of a region, encapsulated as a Bio::DB::Fasta::Region object.
|
629
|
+
#The opts are the same as for mpileup
|
630
|
+
def fetch_region(opts={})
  # Collects the pileup of a region into a Bio::DB::Fasta::Region.
  # * opts - mpileup options; the region is read from :r or, failing that, :region.
  #          Both keys are set to the same value before calling mpileup.
  # Every pile is also yielded to the block, when one is given.
  # Returns the Region with reference, pileup and stats filled in.
  region = opts[:r] || opts[:region]
  opts[:r] = region
  opts[:region] = region
  reg = Bio::DB::Fasta::Region.parse_region(region.to_s)
  # Take the coordinates from the parsed Region: the raw option may be a plain
  # String such as "chr1:1-100", which has no entry/start/end.
  # NOTE(review): assumes Region#to_s round-trips through parse_region - confirm.
  reg.reference = fetch_reference(reg.entry, reg.start, reg.end).downcase
  piles = []
  mpileup(opts) do |pile|
    piles << pile
    yield pile if block_given?
  end
  reg.pileup = piles
  reg.calculate_stats_from_pile(opts)
  reg
end
|
646
|
+
|
647
|
+
#Same as mpileup, but it caches the pileup, so if you want several operations on the same set of regions
|
648
|
+
#the pile for different operations, it won't execute the mpileup command several times
|
649
|
+
#Whenever you finish using a region, call mpileup_clear_cache to free the cache
|
650
|
+
#The argument Region is required, as it will be the key for the underlying hash.
|
651
|
+
#We assume that the options (other than the region) are constant. If they are not, the cache mechanism may not be consistent.
|
652
|
+
#
|
653
|
+
#TODO: It may be good to load partially the pileup
|
654
|
+
def mpileup_cached(opts={})
  # Cached variant of the pileup fetch, keyed by the region's string form.
  # * opts - mpileup options; :r or :region is mandatory.
  # Yields each cached pile when a block is given and returns the cached
  # pileup collection. Raises SamException when no region is supplied.
  raise SamException.new(), "A region must be provided" unless opts[:r] or opts[:region]
  @cached_regions ||= {}
  key = (opts[:r] || opts[:region]).to_s
  cached = (@cached_regions[key] ||= fetch_region(opts))
  cached.pileup.each { |pile| yield pile } if block_given?
  # Return the pileup held by the cache entry. The region option itself may be
  # a String, or an object whose pileup was never filled in.
  cached.pileup
end
|
666
|
+
|
667
|
+
|
668
|
+
#Clears the pileup cache. If a region is passed as argument, just the specified region is removed
|
669
|
+
#If no region is passed, the hash is emptied
|
670
|
+
def mpileup_clear_cache(region=nil)
  # Drops cached pileups.
  # * region - region (or its string form) to forget; when omitted or nil the
  #            whole cache is emptied.
  # Does nothing when the cache has never been created.
  return unless @cached_regions
  if region
    # Remove the entry instead of leaving a nil-valued key behind.
    @cached_regions.delete(region.to_s)
    nil
  else
    @cached_regions.clear
  end
end
|
678
|
+
|
679
|
+
def bedcov(opts={})
  # Runs `samtools bedcov` for opts[:bed] against the BAM file.
  # When the :out key is present the output is redirected to that file.
  # Records the command in @last_command and returns the Kernel#system result.
  command = "#{@samtools} bedcov \"#{opts[:bed]}\" \"#{@bam}\""
  command = "#{command} > \"#{opts[:out]}\"" if opts.key?(:out)
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
|
693
|
+
|
694
|
+
|
695
|
+
#Extract the reads that align to a region
|
696
|
+
#* region [String] - Region to extract (chromosome:start-end)
|
697
|
+
#* fastq - [STR] fastq file where to print. If empty, prints to stdout
|
698
|
+
#* q - [INT] base quality threshold
|
699
|
+
# Not tested yet
|
700
|
+
def extract_reads(opts={})
  # Writes the reads aligned to a region as FASTQ records.
  # * opts[:region] - Bio::DB::Fasta::Region, or anything whose to_s can be
  #                   parsed by Region.parse_region (stored back into opts).
  # * opts[:fastq]  - output file name; when absent the records go to $stdout.
  # Returns nil.
  unless opts[:region].is_a?(Bio::DB::Fasta::Region)
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]
  fastq_filename = opts[:fastq]

  out = fastq_filename ? File.open(fastq_filename, "w") : $stdout
  print_fastq = Proc.new do |alignment|
    out.puts "@#{alignment.qname}"
    out.puts "#{alignment.seq}"
    out.puts "+#{alignment.qname}"
    out.puts "#{alignment.qual}"
  end

  begin
    # The coordinates come from the parsed region; the previous code referred
    # to locals (chromosome, qstart, len) that were never defined.
    # NOTE(review): assumes fetch_with_function takes (name, start, end, callable) - confirm.
    fetch_with_function(region.entry, region.start, region.end, print_fastq)
  ensure
    # Close the file even when fetching fails; never close $stdout.
    out.close if fastq_filename
  end
  nil
end
|
718
|
+
|
719
|
+
# checks existence of files in instance
|
720
|
+
def files_ok?
  # True when every configured file (@fasta, @sam, @bam; nil entries are
  # ignored and nested lists are flattened) exists on disk.
  configured = [@fasta, @sam, @bam].flatten.compact
  configured.all? { |path| File.exist?(path) }
end
|
724
|
+
|
725
|
+
#Returns true if the .bai exists. It doesn't validate if it is valid.
|
726
|
+
def indexed?
  # Both the BAM file and its companion "<bam>.bai" must be present;
  # the index content itself is not inspected.
  return false unless File.exist?(@bam)
  File.exist?("#{@bam}.bai")
end
|
729
|
+
|
730
|
+
private
|
731
|
+
#Returns Process::Status with the execution status. If run in a $DEBUG environment, stderr of the process
|
732
|
+
#is forwarded to the default stdout
|
733
|
+
def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
  # Runs command and streams its standard output to the caller's block.
  # * command       - shell command line to execute.
  # * klass         - class instantiated with each chomped line (text mode only).
  # * type          - :text yields klass.new(line) per line; :binary yields the
  #                   raw output as read by gets(nil).
  # * skip_comments - in text mode, skip lines starting with comment_char.
  # Returns the Process::Status of the command. Under $DEBUG the command and
  # whatever it wrote to stderr are printed to stdout.
  puts "[yield_from_pipe] #{command}" if $DEBUG
  # The block form closes all three pipes even if the caller's block raises.
  Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
    # Nothing is ever written to the child, so signal EOF right away.
    stdin.close
    # Drain stderr concurrently: a child that fills the stderr pipe would
    # otherwise block forever while we are waiting on its stdout.
    err_reader = Thread.new do
      begin
        stderr.read
      rescue IOError
        "" # pipes were closed early because the caller's block raised
      end
    end

    if type == :text
      while (line = pipe.gets)
        next if skip_comments and line[0] == comment_char
        yield klass.new(line.chomp)
      end
    elsif type == :binary
      while (chunk = pipe.gets(nil))
        yield chunk
      end
    end

    exit_status = wait_thr.value # Process::Status object returned.
    puts "Running: #{command}" if $DEBUG
    err_text = err_reader.value
    puts err_text if $DEBUG and !err_text.empty?
    exit_status
  end
end
|
754
|
+
|
755
|
+
|
756
|
+
# returns a command string from a program
|
757
|
+
# @param program [Symbol] either `:samtools` or `:bcftools`
|
758
|
+
# @param opts [Hash] the options hash
|
759
|
+
# @param singles `flag` options [Array] the options in `opts` that are single options
|
760
|
+
def form_opt_string(prog, command, opts, singles=[])
  # Assembles "<prog> <command> <flags> "<bam>"": the flags are rendered by
  # commandify and the BAM path of this object is appended in double quotes.
  flags = commandify(opts, singles)
  format('%s %s %s "%s"', prog, command, flags, @bam)
end
|
764
|
+
|
765
|
+
# turns an opts hash into a string
|
766
|
+
def commandify(opts, singles)
  # Renders each option as -tag "value"; tags listed in singles are emitted as
  # bare flags (their value is dropped). Fragments are joined with one space.
  fragments = opts.each_pair.map do |tag, value|
    rendered = singles.include?(tag) ? "" : "\"#{value}\""
    "-#{tag} #{rendered}"
  end
  fragments.join(" ")
end
|
776
|
+
end
|
777
|
+
end
|
778
|
+
end
|