bio-samtools-wrapper 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (125) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.travis.yml +27 -0
  4. data/Gemfile +20 -0
  5. data/LICENSE.txt +702 -0
  6. data/README.md +501 -0
  7. data/Rakefile +73 -0
  8. data/VERSION +1 -0
  9. data/bin/bam_consensus.rb +85 -0
  10. data/bio-samtools-wrapper.gemspec +181 -0
  11. data/doc/Bio/DB/Alignment.html +552 -0
  12. data/doc/Bio/DB/Pileup.html +711 -0
  13. data/doc/Bio/DB/SAM/Library.html +167 -0
  14. data/doc/Bio/DB/SAM/Tools.html +109 -0
  15. data/doc/Bio/DB/SAM.html +1853 -0
  16. data/doc/Bio/DB/Tag.html +208 -0
  17. data/doc/Bio/DB/Vcf.html +431 -0
  18. data/doc/Bio/DB.html +105 -0
  19. data/doc/Bio.html +175 -0
  20. data/doc/LICENSE_txt.html +846 -0
  21. data/doc/created.rid +9 -0
  22. data/doc/fonts/Lato-Light.ttf +0 -0
  23. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  24. data/doc/fonts/Lato-Regular.ttf +0 -0
  25. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  26. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  27. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  28. data/doc/fonts.css +167 -0
  29. data/doc/images/add.png +0 -0
  30. data/doc/images/arrow_up.png +0 -0
  31. data/doc/images/brick.png +0 -0
  32. data/doc/images/brick_link.png +0 -0
  33. data/doc/images/bug.png +0 -0
  34. data/doc/images/bullet_black.png +0 -0
  35. data/doc/images/bullet_toggle_minus.png +0 -0
  36. data/doc/images/bullet_toggle_plus.png +0 -0
  37. data/doc/images/date.png +0 -0
  38. data/doc/images/delete.png +0 -0
  39. data/doc/images/find.png +0 -0
  40. data/doc/images/loadingAnimation.gif +0 -0
  41. data/doc/images/macFFBgHack.png +0 -0
  42. data/doc/images/package.png +0 -0
  43. data/doc/images/page_green.png +0 -0
  44. data/doc/images/page_white_text.png +0 -0
  45. data/doc/images/page_white_width.png +0 -0
  46. data/doc/images/plugin.png +0 -0
  47. data/doc/images/ruby.png +0 -0
  48. data/doc/images/tag_blue.png +0 -0
  49. data/doc/images/tag_green.png +0 -0
  50. data/doc/images/transparent.png +0 -0
  51. data/doc/images/wrench.png +0 -0
  52. data/doc/images/wrench_orange.png +0 -0
  53. data/doc/images/zoom.png +0 -0
  54. data/doc/index.html +106 -0
  55. data/doc/js/darkfish.js +140 -0
  56. data/doc/js/jquery.js +18 -0
  57. data/doc/js/navigation.js +142 -0
  58. data/doc/js/search.js +109 -0
  59. data/doc/js/search_index.js +1 -0
  60. data/doc/js/searcher.js +228 -0
  61. data/doc/rdoc.css +580 -0
  62. data/doc/table_of_contents.html +305 -0
  63. data/ext/Makefile-bioruby.patch +12 -0
  64. data/ext/Makefile-suse.patch +11 -0
  65. data/ext/mkrf_conf.rb +118 -0
  66. data/lib/bio/BIOExtensions.rb +89 -0
  67. data/lib/bio/db/alignment.rb +64 -0
  68. data/lib/bio/db/fastadb.rb +320 -0
  69. data/lib/bio/db/pileup.rb +273 -0
  70. data/lib/bio/db/sam/external/COPYING +21 -0
  71. data/lib/bio/db/sam/external/VERSION +1 -0
  72. data/lib/bio/db/sam/library.rb +32 -0
  73. data/lib/bio/db/sam.rb +778 -0
  74. data/lib/bio/db/vcf.rb +105 -0
  75. data/lib/bio-samtools-wrapper.rb +9 -0
  76. data/test/.gitignore +1 -0
  77. data/test/helper.rb +18 -0
  78. data/test/sample.vcf +24 -0
  79. data/test/samples/.gitignore +1 -0
  80. data/test/samples/LCI/NC_001988.ffn +2 -0
  81. data/test/samples/LCI/test.bam +0 -0
  82. data/test/samples/LCI/test.bam.bai +0 -0
  83. data/test/samples/small/dupes.bam +0 -0
  84. data/test/samples/small/dupes.sam +274 -0
  85. data/test/samples/small/ids2.txt +1 -0
  86. data/test/samples/small/map_for_reheader.sam +8 -0
  87. data/test/samples/small/map_to_merge1.bam +0 -0
  88. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  89. data/test/samples/small/map_to_merge1.sam +8 -0
  90. data/test/samples/small/map_to_merge2.bam +0 -0
  91. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  92. data/test/samples/small/map_to_merge2.sam +8 -0
  93. data/test/samples/small/no_md.sam +8 -0
  94. data/test/samples/small/sorted.bam +0 -0
  95. data/test/samples/small/sorted.bam.bai +0 -0
  96. data/test/samples/small/test.sai +0 -0
  97. data/test/samples/small/test.tam +10 -0
  98. data/test/samples/small/test_chr.fasta +1000 -0
  99. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  100. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  101. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  102. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  103. data/test/samples/small/test_chr.fasta.amb +2 -0
  104. data/test/samples/small/test_chr.fasta.ann +3 -0
  105. data/test/samples/small/test_chr.fasta.bwt +0 -0
  106. data/test/samples/small/test_chr.fasta.pac +0 -0
  107. data/test/samples/small/test_chr.fasta.rbwt +0 -0
  108. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  109. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  110. data/test/samples/small/test_chr.fasta.rpac +0 -0
  111. data/test/samples/small/test_chr.fasta.rsa +0 -0
  112. data/test/samples/small/test_chr.fasta.sa +0 -0
  113. data/test/samples/small/test_cov.svg +273 -0
  114. data/test/samples/small/test_fastadb.fasta +34 -0
  115. data/test/samples/small/testu.bam +0 -0
  116. data/test/samples/small/testu.bed +2 -0
  117. data/test/test_bio-samtools-wrapper.rb +1 -0
  118. data/test/test_fastadb.rb +89 -0
  119. data/test/test_pileup.rb +90 -0
  120. data/test/test_sam.rb +421 -0
  121. data/test/test_vcf.rb +79 -0
  122. data/tutorial/tutorial.html +474 -0
  123. data/tutorial/tutorial.md +424 -0
  124. data/tutorial/tutorial.pdf +0 -0
  125. metadata +254 -0
data/lib/bio/db/sam.rb ADDED
@@ -0,0 +1,778 @@
1
+ module Bio
2
+ class DB
3
+ class Sam
4
+
5
+ class SamException < StandardError; end
6
+
7
+ attr_accessor :bam, :fasta, :samtools, :bcftools, :last_command
8
+ attr_accessor :minumum_ratio_for_iup_consensus
9
+ attr_reader :cached_regions
10
+ #attr_accessor :pileup_cache
11
+ @minumum_ratio_for_iup_consensus = 0.20
12
+ BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
13
+
14
# Creates a new Bio::DB::Sam object.
#
# * args [Hash] - construction options
#   * :fasta [String] - path to the FASTA reference sequence (required)
#   * :bam [String] - path to the BAM file (required)
#   * :samtools [String] - path to an alternative installation of samtools
#   * :bcftools [String] - path to an alternative installation of bcftools
# * returns [Bio::DB::Sam] a new `Bio::DB::Sam` object
# * raises ArgumentError when :fasta or :bam is not given
# * raises IOError, naming the offending path(s), when the FASTA or BAM file does not exist
def initialize(args)
  @fasta = args[:fasta]
  @bam = args[:bam]
  @bams = nil
  @sam = nil
  @files = nil
  @cached_regions = nil
  @stats = nil
  @last_command = nil
  # The class body assigns this default to an instance variable of the class
  # object, which instances never see; set it here so the accessor returns it.
  @minumum_ratio_for_iup_consensus = 0.20

  # Default executables live in sam/external next to this file.
  external_dir = File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external')
  @samtools = args[:samtools] || File.join(external_dir, 'samtools')
  @bcftools = args[:bcftools] || File.join(external_dir, 'bcftools')

  raise ArgumentError, "Need Fasta and at least one BAM or SAM" if not @fasta or not @bam

  # Report exactly which paths are missing instead of an empty message.
  missing = [@fasta, @bam].flatten.compact.reject { |path| File.exist?(path) }
  raise IOError, "File not found #{missing.join(', ')}" unless missing.empty?
end
40
+
41
# Backward-compatibility method. Nothing is opened: it simply reports,
# via files_ok?, whether the files this object refers to exist.
# * returns the result of files_ok?
def open
  files_ok?
end
45
+
46
# Runs the samtools view command and yields each record to the block.
#
# Flags (given as `:flag => true`):
# * b - output BAM
# * h - print header for the SAM output
# * H - print header only (no alignments)
# * S - input is SAM
# * u - uncompressed BAM output (force -b)
# * one - fast compression (force -b)
# * x - output FLAG in HEX (samtools-C specific)
# * X - output FLAG in string (samtools-C specific)
# * c - print only the count of matching records
# * B - collapse the backward CIGAR operation
#
# Options with a value:
# * at - INT number of BAM compression threads [0]
# * L - FILE output alignments overlapping the input BED FILE [null]
# * t - FILE list of reference names and lengths (force -S) [null]
# * T - FILE reference sequence file (force -S) [null]
# * o - FILE output file name [stdout]
# * R - FILE list of read groups to be outputted [null]
# * f - INT required flag, 0 for unset [0]
# * F - INT filtering flag, 0 for unset [0]
# * q - INT minimum mapping quality [0]
# * l - STR only output reads in library STR [null]
# * r - STR only output reads in read group STR [null]
# * s - FLOAT fraction of templates to subsample; integer part as seed [-1]
#
# Region selection (all three must be given together):
# * chr - name of reference sequence to get alignments from
# * start - start position on reference sequence
# * stop - end position on reference sequence
#
# With b, u or one the raw output is yielded as String chunks; otherwise each
# line is wrapped in a Bio::DB::Alignment. The caller's hash is not modified.
# * raises SamException when chr is not a sequence of the BAM file
def view(opts={}, &block)
  opts = opts.dup # work on a copy so the caller's hash can be reused
  region = ""
  if opts[:chr] and opts[:start] and opts[:stop]
    unless has_entry?(opts[:chr])
      raise SamException, "[view] The sequence #{opts[:chr]} is not in the bam file"
    end
    region = "#{opts[:chr]}:#{opts[:start]}-#{opts[:stop]}"
    [:chr, :start, :stop].each { |key| opts.delete(key) }
  end

  # samtools spells these options -@ and -1, which are not usable as plain symbols.
  opts['@'] = opts.delete(:at) if opts[:at]
  opts['1'] = opts.delete(:one) if opts[:one]

  command = form_opt_string(@samtools, 'view', opts, [:b, :h, :H, :S, :u, '1', :x, :X, :c, :B])
  command = command + " \"#{region}\"" if region.size > 0
  @last_command = command

  # -1 forces BAM output just like -b and -u, so it must be read as binary too.
  type = (opts[:u] or opts[:b] or opts['1']) ? :binary : :text
  klass = (type == :binary) ? String : Bio::DB::Alignment
  yield_from_pipe(command, klass, type, &block)
end
97
+
98
# Yields every alignment overlapping a reference window to the block.
# Thin convenience wrapper around #view.
# * chr - the reference name for the window
# * start - the start position of the window
# * stop - the stop position of the window
# * &block - the block of code to execute for each alignment
def fetch(chr, start, stop, &block)
  window = {:chr => chr, :start => start, :stop => stop}
  view(window, &block)
end
112
+
113
+ alias_method :fetch_with_function, :fetch
114
+
115
# Collects the coverage of every pileup position reported by mpileup for a
# window (positions for which mpileup yields nothing are absent from the result).
# * chr - the reference name
# * start - the start position
# * length - the length of the region queried
# * returns [Array] one coverage value per yielded pileup
def chromosome_coverage(chr, start, length)
  stop = start + length
  depths = []
  mpileup(:r => "#{chr}:#{start}-#{stop}") { |pile| depths << pile.coverage }
  depths
end
127
+
128
+
129
# Plots read coverage over a window as a histogram using Bio::Graphics.
# * chr - the reference name (falls back to opts[:chr] when nil)
# * start - the start position (falls back to opts[:start] when nil)
# * length - the length of the region queried (falls back to opts[:length] when nil)
# OPTIONS
# * bin - the number of bins to split the histogram into. The arithmetic mean
#   coverage of each bin is plotted. [default 30 bins]
# * svg - a file to write the svg image to
# Remaining options are merged over the default track options and handed to
# Bio::Graphics::Page#add_track.
# * returns the markup from Page#get_markup, or the result of Page#write when :svg is given
def plot_coverage(chr, start, length, opts={})
  chr = opts[:chr] if chr.nil?
  start = opts[:start] if start.nil?
  length = opts[:length] if length.nil?

  # Integer division gives 0 when the region is shorter than the number of
  # bins; each_slice(0) would raise, so never go below one position per bin.
  bin = [length / (opts[:bin] || 30), 1].max

  coverage = []
  region = "#{chr}:#{start}-#{start + length}"
  mpileup(:r => region) { |pile| coverage << pile.coverage }

  page = Bio::Graphics::Page.new(:width => 1000,
                                 :height => 200,
                                 :number_of_intervals => 10,
                                 :font_size => 14)
  default_options = {:glyph => :histogram,
                     :stroke => 'black',
                     :fill_color => 'gold',
                     :track_height => 150,
                     :name => 'read coverage',
                     :label => true,
                     :stroke_width => '1',
                     :x_round => 1,
                     :y_round => 1}
  opts = default_options.merge(opts)

  data_track = page.add_track(opts)
  offset = 0
  coverage.each_slice(bin) do |slice|
    mean = slice.inject { |sum, x| sum + x }.to_f / slice.size
    data_track.add(Bio::Graphics::MiniFeature.new(:start => start + offset,
                                                  :end => (start + offset + bin),
                                                  :segment_height => mean))
    offset += bin
  end

  if opts[:svg]
    page.write(opts[:svg].to_s)
  else
    page.get_markup
  end
end
185
+
186
# Returns the mean coverage over the positions reported by chromosome_coverage.
# * chr - the reference name
# * start - the start position
# * length - the length of the region queried
# * returns [Float] the average; 0.0 when no position has coverage
#   (previously NaN, because an empty list was divided by its size of zero)
def average_coverage(chr, start, length)
  depths = chromosome_coverage(chr, start, length)
  return 0.0 if depths.empty?
  depths.inject(0) { |sum, depth| sum + depth }.to_f / depths.size
end
194
+
195
# Runs samtools mpileup and yields one Bio::DB::Pileup per output line, or one
# Bio::DB::Vcf per line when genotype calling / uncompressed BCF is requested
# (the output is then piped through `bcftools view -cg -`).
#
# Options may be given by long name or by the short samtools letter:
# * region (r) - Only generate pileup in region [chrom:start-stop]
# * illumina_quals (six) - Assume the quality is in the Illumina 1.3+ encoding
# * count_anomalous (A) - Do not skip anomalous read pairs in variant calling
# * no_baq (B) - Disable probabilistic realignment for the computation of base alignment quality (BAQ)
# * adjust_mapq (C) - [INT] Coefficient for downgrading mapping quality for reads containing excessive mismatches [0]
# * max_per_bam_depth (d) - [INT] At a position, read maximally INT reads per input BAM [250]
# * extended_baq (E) - Extended BAQ computation
# * exclude_reads_file (G) - [FILE] exclude read groups listed in FILE [null]
# * list_of_positions (l) - [FILE] BED or position list file of sites where pileup or BCF should be generated [null]
# * mapping_quality_cap (M) - [INT] cap mapping quality at INT [60]
# * ignore_rg (R) - ignore read group tags
# * min_mapping_quality (q) - [INT] skip alignments with mapQ smaller than INT [0]
# * min_base_quality (Q) - [INT] skip bases with baseQ/BAQ smaller than INT [13]
# The following options apply to the -g / -u output:
# * genotype_calling (g) - generate BCF output (genotype likelihoods)
# * uncompressed_bcf (u) - generate uncompressed BCF output
# * extension_sequencing_probability (e) - [INT] Phred-scaled gap extension seq error probability [20]
# * homopolymer_error_coefficient (h) - [INT] coefficient for homopolymer errors [100]
# * no_indels (I) - do not perform indel calling
# * skip_indel_over_average_depth (L) - [INT] max per-sample depth for INDEL calling [250]
# * gap_open_sequencing_error_probability (o) - [INT] Phred-scaled gap open sequencing error probability [40]
# * platforms (P) - [STRING] comma separated list of platforms for indels [all]
#
# Any option not in this list is dropped. The reference (-f) is always @fasta.
# * raises SamException when has_region? rejects a non-empty region
def mpileup(opts={}, &block)
  # long option name => short samtools flag
  short_for = {
    :region => :r,
    :illumina_quals => :six,
    :count_anomalous => :A,
    :no_baq => :B,
    :adjust_mapq => :C,
    :max_per_bam_depth => :d,
    :extended_baq => :E,
    :exclude_reads_file => :G,
    :list_of_positions => :l,
    :mapping_quality_cap => :M,
    :ignore_rg => :R,
    :min_mapping_quality => :q,
    :min_base_quality => :Q,
    # the following are for the -g -u output
    :genotype_calling => :g,
    :uncompressed_bcf => :u,
    :extension_sequencing_probability => :e,
    :homopolymer_error_coefficient => :h,
    :no_indels => :I,
    :skip_indel_over_average_depth => :L,
    :gap_open_sequencing_error_probability => :o,
    :platforms => :P
  }

  # Start with the keys that need no translation, then let every long name
  # overwrite its short counterpart.
  renamed = opts.reject { |name, _| short_for[name] }
  opts.each_pair do |name, value|
    renamed[short_for[name]] = value if short_for[name]
  end

  # Keep only the flags listed above, in the order of the table.
  flags = {}
  short_for.each_value do |flag|
    flags[flag] = renamed[flag] if renamed.has_key?(flag)
  end

  # -g is replaced by -u so that the output is always uncompressed.
  flags[:u] = true if flags[:g]
  flags.delete(:g)

  flags[:f] = @fasta

  target = flags[:r]
  query = target.respond_to?(:to_region) ? target.to_region.to_s : target.to_s
  unless query.empty?
    raise SamException, "The sequence #{query} is not in the bam file" unless has_region?(query)
  end
  flags[:r] = query

  if flags[:six]
    flags["6"] = nil
    flags.delete(:six)
  end

  command = form_opt_string(@samtools, "mpileup", flags, [:R, :B, :E, "6", :A, :g, :u, :I])
  puts "Running: #{command}" if $DEBUG
  command = command + " | #{@bcftools} view -cg -" if flags[:u]

  record_class = flags[:u] ? Bio::DB::Vcf : Bio::DB::Pileup
  @last_command = command
  yield_from_pipe(command, record_class, :text, &block)
end
289
+
290
# Fetches a subsequence of the reference with `samtools faidx`.
# * chr - [STRING] the reference name for the subsequence
# * start - [INT] the start position for the subsequence
# * stop - [INT] the stop position for the subsequence
# * as_bio - boolean; when true the sequence is passed through
#   Bio::Sequence::NA#to_fasta with "chr:start-stop" as the header
# * returns [String] the concatenated output lines of faidx (header line removed).
#   When no reference is set, a run of "n" as long as the inclusive region.
# * raises SamException when chr is not a sequence of the BAM file
def fetch_reference(chr, start, stop, opts={:as_bio => false})
  raise SamException, "The sequence #{chr} is not in the bam file" unless has_entry? chr

  seq = String.new
  if @fasta
    command = "#{@samtools} faidx \"#{@fasta}\" '#{chr}:#{start}-#{stop}'"
    puts "Running: #{command}" if $DEBUG
    @last_command = command
    # Append in place; building a new string per line is quadratic on long regions.
    yield_from_pipe(command, String, :text) { |line| seq << line unless line =~ /^>/ }
  else
    # start..stop is inclusive, so the placeholder needs stop - start + 1 bases.
    seq = "n" * (stop - start + 1)
  end

  seq = Bio::Sequence::NA.new(seq).to_fasta("#{chr}:#{start}-#{stop}") if opts[:as_bio]
  seq
end
313
+
314
# Indexes the reference FASTA, or extracts a subsequence from it.
# Without a region, `samtools faidx` is run on the reference (samtools then
# creates <ref.fasta>.fai on disk). When chr, start and stop are all given,
# the subsequence is returned through fetch_reference.
# Options - if a subsequence is required
# * chr - [STRING] the reference name of the subsequence
# * start - [INT] the start position for the subsequence
# * stop - [INT] the stop position for the subsequence
# * as_bio - passed on to fetch_reference [false]
# * returns the subsequence, or the result of Kernel#system when indexing
def faidx(opts={})
  if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
    # Pass the option values on, not the literal symbols :chr/:start/:stop.
    fetch_reference(opts[:chr], opts[:start], opts[:stop], :as_bio => opts.fetch(:as_bio, false))
  else
    command = "#{@samtools} faidx \"#{@fasta}\""
    @last_command = command
    system(command)
  end
end
329
+
330
# Indexes the BAM file with `samtools index` for fast random access.
# When no out_index is provided, samtools writes <aln.bam>.bai.
# * out_index - [STRING] name of index
# * returns the result of Kernel#system
def index(opts={})
  command = "%s index \"%s\" %s" % [@samtools, @bam, opts[:out_index]]
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
338
+
339
# Runs `samtools fixmate`: fills in mate coordinates, ISIZE and mate related
# flags from a name-sorted alignment.
# * out_bam - name of outfile
# * r - remove unmapped reads and secondary alignments
# * returns the result of Kernel#system
def fix_mates(opts={})
  flag = opts[:r] ? "-r" : ""
  command = "#{@samtools} fixmate #{flag} \"#{@bam}\" #{opts[:out_bam]}"
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
353
+
354
+ alias_method :fixmate, :fix_mates
355
+
356
# Runs `samtools flagstat`, which reports simple statistics on the number and
# pairing of reads mapped to a reference.
# * returns [Array<String>] the output lines, without trailing newlines
def flag_stats(opts={})
  command = form_opt_string(@samtools, "flagstat", opts, [])
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  report = []
  yield_from_pipe(command, String) { |line| report.push(line.chomp) }
  report
end
365
+
366
+ alias_method :flagstat, :flag_stats
367
+
368
# Retrieves the statistics stored in the index file through `samtools idxstats`.
# Each TAB-delimited output line holds reference sequence name, sequence
# length, number of mapped reads and number of unmapped reads.
# The result is memoized in @stats, so samtools runs only once.
# * returns [Hash] name => {:length, :mapped_reads, :unmapped_reads}
def index_stats
  return @stats if @stats

  @last_command = form_opt_string(@samtools, "idxstats", {}, [])
  table = {}
  yield_from_pipe(@last_command, String, :text, true, "#") do |row|
    name, length, mapped, unmapped = row.chomp.split(/\t/)
    table[name] = {:length => length.to_i, :mapped_reads => mapped.to_i, :unmapped_reads => unmapped.to_i }
  end
  @stats = table
end
382
+
383
+ alias_method :idxstats, :index_stats
384
+
385
# Builds one forward-oriented Bio::DB::Fasta::Region (1..length) per sequence
# listed by index_stats.
# With a block, every region is yielded; without one, the hash is returned,
# straight from the cache once it has been built.
# * returns [Hash] region id => Bio::DB::Fasta::Region
def each_region
  stats = index_stats
  return @regions if @regions and not block_given?

  @regions ||= {}
  stats.each do |name, info|
    region = Bio::DB::Fasta::Region.new
    region.entry = name
    region.start = 1
    region.end = info[:length]
    region.orientation = :forward
    @regions[name] ||= region
    yield region if block_given?
  end
  @regions
end
404
+
405
# Tells whether the BAM file contains the given sequence name, according to
# index_stats (so the BAM file has to be indexed).
# * entry - the reference sequence name
def has_entry?(entry)
  index_stats.key?(entry)
end
411
+
412
# Tells whether a region lies inside a sequence of the BAM file.
# * region - [String] region in the form accepted by Bio::DB::Fasta::Region.parse_region
# * returns true when the sequence is listed by index_stats, the start is
#   positive and the end does not exceed the sequence length; false otherwise
def has_region?(region)
  reg = Bio::DB::Fasta::Region.parse_region(region)
  # Must be a real false: 0 is truthy in Ruby, which made unknown sequences pass.
  return false unless has_entry?(reg.entry)
  len = index_stats[reg.entry][:length]
  reg.start > 0 and reg.end <= len
end
419
+
420
# Merges multiple sorted alignments with `samtools merge`.
# * n - sort by read names
# * r - attach RG tag (inferred from file names)
# * u - uncompressed BAM output
# * f - overwrite the output BAM if exist
# * one - compress level 1
# * l - [INT] compression level, from 0 to 9 [-1]
# * at - [INT] number of BAM compression threads [0]
# * R - [STRING] merge file in the specified region STR [all]
# * h - [FILE] copy the header in FILE to <out.bam> [in1.bam]
# * out - [FILE] out file name
# * bams - [Array] input BAM paths and/or objects responding to #bam (e.g. Bio::DB::Sam)
# The caller's hash is left untouched, so it can be reused for another call.
# * returns the result of Kernel#system
def merge(opts={})
  opts = opts.dup

  # samtools spells these options -1 and -@.
  if opts[:one]
    opts['1'] = nil
    opts.delete(:one)
  end
  opts['@'] = opts.delete(:at) if opts[:at]

  out = opts.delete(:out)
  # Accept plain paths as well as objects that know their BAM path.
  bam_list = opts.delete(:bams).collect { |b| b.respond_to?(:bam) ? b.bam : b }.join(' ')

  options = commandify(opts, [:n, :r, :u, :f, '1'])
  command = "#{@samtools} merge #{options} #{out} #{bam_list}"

  @last_command = command
  puts "Running: #{command}" if $DEBUG
  system(command)
end
459
+
460
# Concatenates BAMs with `samtools cat`. The sequence dictionary of each input
# BAM must be identical.
# * h - [FILE] header.sam
# * out - [FILE] out file name
# * bams - [Array] input BAM paths and/or objects responding to #bam (e.g. Bio::DB::Sam)
# The caller's hash is left untouched.
# * returns the result of Kernel#system
def cat(opts={})
  opts = opts.dup
  # :out was read from an undefined local before, which raised NameError.
  out = opts.delete(:out)
  bam_list = opts.delete(:bams).collect { |b| b.respond_to?(:bam) ? b.bam : b }.join(' ')

  # -h takes the header file as its value, so it is not a valueless flag.
  options = commandify(opts, [])
  command = "#{@samtools} cat #{options} -o #{out} #{bam_list}"
  puts command if $DEBUG
  @last_command = command
  system(command)
end
476
+
477
# Runs `<program> <command>` in a shell and returns what it printed to stdout.
# * program - one of 'samtools' 'bcftools'
# * command - one of the commands relevant to the program
# * returns [String] the captured output, or an explanatory message when
#   program is not one of the two accepted names (nothing is run in that case)
def self.docs(program, command)
  unless ['samtools', 'bcftools'].include?(program)
    return "program must be 'samtools' or 'bcftools'"
  end
  `#{program} #{command}`
end
484
+
485
# Runs `samtools rmdup` to remove potential PCR duplicates: if multiple read
# pairs have identical external coordinates, only the pair with highest
# mapping quality is retained.
# * s - rmdup for SE reads
# * S - treat PE reads as SE in rmdup (force -s)
# * out - [FILE] output bam (removed from the given hash)
# NOTE: form_opt_string already ends with the BAM path, so the path appears
# both before and after the output name in the command.
# * returns the result of Kernel#system
def remove_duplicates(opts={})
  out = opts.delete(:out)
  base = form_opt_string(@samtools, "rmdup", opts, [:s, :S])
  command = "#{base} #{out} \"#{@bam}\""
  @last_command = command
  system(command)
end
496
+
497
+ alias_method :rmdup, :remove_duplicates
498
+
499
# Sorts alignments by leftmost coordinates with `samtools sort`.
# * n - sort by read name
# * f - use <out.prefix> as full file name instead of prefix
# * o - output file; set by this method (deprecated as "output to stdout": samtools-1.x saves to a file)
# * l - [INT] compression level, from 0 to 9 [-1]
# * at - [INT] number of sorting and compression threads [1]
# * m - [INT] max memory per thread; suffix K/M/G recognized [768M]
# * prefix - [STRING] prefix for output bamfile (kept for legacy callers; ".bam" is
#   appended and the result becomes -o). Without a prefix, -o is "sorted".
# The given hash is modified: :o is set and :prefix removed.
# * returns the result of Kernel#system
def sort(opts={})
  if opts.has_key?(:prefix)
    opts[:o] = opts[:prefix] + ".bam"
  else
    opts[:o] = "sorted"
  end
  opts.delete(:prefix)

  base = form_opt_string(@samtools, "sort", opts, [:n, :f])
  command = "#{base} "
  @last_command = command
  puts "Running: #{command}" if $DEBUG
  system(command)
end
525
+
526
# Runs `samtools tview`, the text alignment viewer.
# * d - display, output as (H)tml or (C)urses or (T)ext
# * p - [chr:pos] go directly to this position
# * s - [STR] display only reads from this sample or group
# The :d, :p and :s keys of the given hash are replaced by their String forms.
# * returns the result of Kernel#system
def tview(opts={})
  [:d, :p, :s].each do |key|
    next unless opts[key]
    opts[key.to_s] = opts[key]
    opts.delete(key)
  end

  command = form_opt_string(@samtools, "tview", opts)
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
548
+
549
# Runs `samtools reheader` to replace the header of the current BAM file with
# the header found in header_sam.
# * header_sam - the sam file from which the new header will be taken
# * out - [FILE] output bam file; when given, the shell redirects stdout into it
# * returns the result of Kernel#system
def reheader(header_sam, opts={})
  command = "#{@samtools} reheader #{header_sam} \"#{@bam}\""
  command = "#{command} > #{opts[:out]}" if opts.has_key?(:out)
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
563
+
564
# Generates the MD tag with `samtools calmd`. If the MD tag is already present,
# samtools warns when the generated tag differs from the existing one.
# Output is SAM by default.
# * A - When used jointly with -r this option overwrites the original base quality.
# * e - Convert the read base to = if it is identical to the aligned reference base. Indel caller does not support the = bases at the moment.
# * u - Output uncompressed BAM
# * b - Output compressed BAM
# * S - The input is SAM with header lines
# * C - [INT] Coefficient to cap mapping quality of poorly mapped reads. See the pileup command for details. [0]
# * r - Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
# * E - Extended BAQ calculation. This option trades specificity for sensitivity, though the effect is minor.
#
# Text output is yielded line by line as Bio::DB::Alignment, skipping "@"
# header lines. With u or b the raw BAM output is yielded as String chunks,
# the same way #view handles binary output.
def calmd(opts={}, &block)
  # :A is a valueless flag like the others; left out it was rendered as -A "true".
  flags = [:A, :E, :e, :u, :b, :S, :r]
  command = form_opt_string(@samtools, "calmd", opts, flags) + " " + @fasta
  puts "Running: #{command}" if $DEBUG
  @last_command = command

  type = (opts[:u] or opts[:b]) ? :binary : :text
  klass = (type == :binary) ? String : Bio::DB::Alignment
  yield_from_pipe(command, klass, type, true, "@", &block)
end
581
+
582
# Runs `samtools targetcut`: identifies target regions by examining the
# continuity of read depth, computes haploid consensus sequences of targets
# and outputs a SAM with each sequence corresponding to a target.
# When option f is in use, BAQ will be applied.
# * Q - [INT] Minimum base quality for a base to be considered [13]
# * i - in penalty
# * 0 - em0
# * 1 - em1
# * 2 - em2
# * f - when set, the reference given at construction (@fasta) is passed as -f
# The caller's hash is left untouched.
# * returns the result of Kernel#system
def targetcut(opts={})
  opts = opts.dup
  # Replace the truthy marker by the actual reference path. The old code added
  # a second 'f' entry and deleted :s by mistake, so -f was emitted twice.
  opts[:f] = @fasta if opts[:f]

  command = form_opt_string(@samtools, "targetcut", opts, [])
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
600
+
601
# Runs `samtools phase` to call and phase heterozygous SNPs.
# * A - Drop reads with ambiguous phase.
# * b - [STR] Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file STR.0.bam and phase-1 reads in STR.1.bam. Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads with switch errors will be saved in STR.chimeric.bam. [null]
# * F - Do not attempt to fix chimeric reads.
# * k - [INT] Maximum length for local phasing. [13]
# * q - [INT] Minimum Phred-scaled LOD to call a heterozygote. [40]
# * Q - [INT] Minimum base quality to be used in het calling. [13]
# * returns the result of Kernel#system
def phase(opts={})
  command = form_opt_string(@samtools, "phase", opts, [:A, :F])
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
614
+
615
+
616
# Runs `samtools depth` through Kernel#system; the command's output is not
# captured by this method.
# * b - list of positions or regions in BED format
# * l - [INT] minQLen
# * q - [INT] base quality threshold
# * Q - [INT] mapping quality threshold
# * r - [chr:from-to] region
# * returns the result of Kernel#system
def depth(opts={})
  @last_command = form_opt_string(@samtools, "depth", opts)
  system(@last_command)
end
627
+
628
# Returns the pileup of a region, encapsulated as a Bio::DB::Fasta::Region
# object that also carries the lower-cased reference sequence.
# The opts are the same as for mpileup; the region is taken from :r or :region
# and may be a String or any object whose to_s gives "chr:start-stop".
# Each pileup is also yielded when a block is given.
# The caller's hash is left untouched.
def fetch_region(opts={})
  opts = opts.dup
  region = opts[:r] ? opts[:r] : opts[:region]
  opts[:r] = region
  opts[:region] = region

  reg = Bio::DB::Fasta::Region.parse_region(region.to_s)
  # Use the parsed region: the raw value is frequently a plain String, which
  # has no entry/start/end.
  reg.reference = fetch_reference(reg.entry, reg.start, reg.end).downcase

  piles = []
  mpileup(opts) do |pile|
    piles << pile
    yield pile if block_given?
  end
  reg.pileup = piles
  reg.calculate_stats_from_pile(opts)
  reg
end
646
+
647
# Same as mpileup, but the pileup of each region is cached, so several
# operations on the same region do not run the mpileup command again.
# Whenever you finish using a region, call mpileup_clear_cache to free the cache.
# A region (:r or :region) is required, as its to_s is the key of the
# underlying hash. The options other than the region are assumed to be
# constant; if they are not, the cached result may not match them.
# * returns the cached pileup of the region; each pile is also yielded when a block is given
# * raises SamException when no region is provided
#
# TODO: It may be good to load partially the pileup
def mpileup_cached(opts={})
  raise SamException, "A region must be provided" unless opts[:r] or opts[:region]

  @cached_regions ||= {}
  region = opts[:r] ? opts[:r] : opts[:region]
  key = region.to_s
  @cached_regions[key] ||= fetch_region(opts)

  cached = @cached_regions[key]
  cached.pileup.each { |pile| yield pile } if block_given?
  # Return the cached pileup; the lookup key itself may be a String.
  cached.pileup
end
666
+
667
+
668
# Clears the pileup cache. If a region is passed as argument, just the
# specified region is removed. If no region is passed (or nil), the whole
# hash is emptied. Does nothing when nothing has been cached yet.
# * region - optional region (or its String form) used as cache key
def mpileup_clear_cache(region=nil)
  return unless @cached_regions
  if region
    # Drop the entry entirely rather than leaving a nil placeholder behind.
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
678
+
679
# Runs `samtools bedcov` on the BAM file of this object.
# * bed - [FILE] BED file passed to bedcov
# * out - [FILE] when given, the shell redirects stdout into this file
# * returns the result of Kernel#system
def bedcov(opts={})
  command = "#{@samtools} bedcov \"#{opts[:bed]}\" \"#{@bam}\""
  command = "#{command} > \"#{opts[:out]}\"" if opts.has_key?(:out)
  puts "Running: #{command}" if $DEBUG
  @last_command = command
  system(command)
end
693
+
694
+
695
# Extracts the reads that align to a region and prints them in FASTQ form
# (four lines per alignment: @qname, seq, +qname, qual).
# * region - [String or Bio::DB::Fasta::Region] region to extract (chromosome:start-end)
# * fastq - [String] file where to print. If empty, prints to stdout
# * q - [INT] base quality threshold (NOTE: not applied by this method)
# The output file is closed even when fetching fails.
def extract_reads(opts={})
  region = opts[:region]
  region = Bio::DB::Fasta::Region.parse_region(region.to_s) unless region.is_a?(Bio::DB::Fasta::Region)
  fastq_filename = opts[:fastq]

  out = fastq_filename ? File.open(fastq_filename, "w") : $stdout
  begin
    # The coordinates come from the parsed region, and the printer is passed
    # as a block, which is what fetch expects.
    fetch(region.entry, region.start, region.end) do |alignment|
      out.puts "@#{alignment.qname}"
      out.puts "#{alignment.seq}"
      out.puts "+#{alignment.qname}"
      out.puts "#{alignment.qual}"
    end
  ensure
    out.close if fastq_filename
  end
end
718
+
719
# Checks that every file referenced by this instance (@fasta, @sam, @bam;
# nil entries are ignored, arrays are flattened) exists on disk.
# * returns true when all of them exist, false as soon as one is missing
def files_ok?
  [@fasta, @sam, @bam].flatten.compact.all? { |path| File.exist?(path) }
end
724
+
725
# Returns true if both the BAM file and "<bam>.bai" exist.
# The content of the index is not validated.
def indexed?
  File.exist?(@bam) && File.exist?("#{@bam}.bai")
end
729
+
730
+ private
731
# Runs a shell command and yields its standard output to the block.
# * command - the command line to run
# * klass - class instantiated with each chomped line in :text mode
# * type - :text yields klass.new(line) per line; :binary yields the raw output as String
# * skip_comments - in :text mode, drop lines starting with comment_char
# * comment_char - the character marking a comment line
# * returns Process::Status with the execution status
# stderr of the process is drained in the background, so a chatty command
# cannot block on a full pipe; if run in a $DEBUG environment it is forwarded
# to the default stdout. All streams are closed even when the block raises.
def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
  puts "[yield_from_pipe] #{command}" if $DEBUG
  Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
    stdin.close # nothing is ever written to the child
    err_reader = Thread.new do
      begin
        stderr.read
      rescue IOError
        "" # the stream was closed underneath us while unwinding
      end
    end

    if type == :text
      while (line = pipe.gets)
        next if skip_comments and line[0] == comment_char
        yield klass.new(line.chomp)
      end
    elsif type == :binary
      while (chunk = pipe.gets(nil))
        yield chunk
      end
    end

    err_text = err_reader.value.to_s
    $stdout.puts err_text if $DEBUG and not err_text.empty?
    wait_thr.value # Process::Status, returned by popen3's block form
  end
end
754
+
755
+
756
# Builds the command line for one tool invocation:
# `<prog> <command> <options> "<bam>"`, always ending with the BAM path of this object.
# @param prog [String] the executable to run (e.g. the value of @samtools)
# @param command [String] the sub-command, such as "view"
# @param opts [Hash] the options hash, rendered by commandify
# @param singles [Array] the keys of `opts` that are flags without a value
# @return [String] the assembled command line
def form_opt_string(prog, command, opts, singles=[])
  rendered = commandify(opts, singles)
  [prog, command, rendered, "\"#{@bam}\""].join(" ")
end
764
+
765
# Turns an opts hash into a command-line option string.
# @param opts [Hash] option name => value
# @param singles [Array] the option names that are flags and take no value
# @return [String] the options joined by single spaces
# Options with a value are rendered as `-name "value"`. Flags are rendered as
# `-name ` (the trailing space is kept so generated commands stay unchanged),
# whatever their value, except that a flag explicitly set to false is
# left out instead of being switched on.
def commandify(opts, singles)
  rendered = []
  opts.each_pair do |tag, value|
    if singles.include?(tag)
      next if value == false
      rendered << "-#{tag} "
    else
      rendered << "-#{tag} \"#{value}\""
    end
  end
  rendered.join(" ")
end
776
+ end
777
+ end
778
+ end