bio-samtools 0.6.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -2
  3. data/README.md +4 -7
  4. data/VERSION +1 -1
  5. data/bio-samtools.gemspec +47 -105
  6. data/doc/Bio.html +68 -131
  7. data/doc/Bio/DB.html +51 -111
  8. data/doc/Bio/DB/Alignment.html +135 -363
  9. data/doc/Bio/DB/Pileup.html +183 -170
  10. data/doc/Bio/DB/SAM.html +1396 -820
  11. data/doc/Bio/DB/SAM/Library.html +73 -123
  12. data/doc/Bio/DB/SAM/Tools.html +51 -273
  13. data/doc/Bio/DB/Tag.html +78 -124
  14. data/doc/Bio/DB/Vcf.html +111 -147
  15. data/doc/LICENSE_txt.html +113 -148
  16. data/doc/created.rid +9 -10
  17. data/doc/fonts.css +167 -0
  18. data/doc/fonts/Lato-Light.ttf +0 -0
  19. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  20. data/doc/fonts/Lato-Regular.ttf +0 -0
  21. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  22. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  23. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  24. data/doc/images/add.png +0 -0
  25. data/doc/images/arrow_up.png +0 -0
  26. data/doc/images/delete.png +0 -0
  27. data/doc/images/tag_blue.png +0 -0
  28. data/doc/index.html +48 -54
  29. data/doc/js/darkfish.js +9 -22
  30. data/doc/js/search.js +20 -5
  31. data/doc/js/search_index.js +1 -1
  32. data/doc/rdoc.css +255 -218
  33. data/doc/table_of_contents.html +256 -137
  34. data/ext/Rakefile +57 -0
  35. data/lib/bio-samtools.rb +7 -2
  36. data/lib/bio/BIOExtensions.rb +89 -0
  37. data/lib/bio/db/alignment.rb +59 -0
  38. data/lib/bio/db/fastadb.rb +255 -0
  39. data/lib/bio/db/pileup.rb +221 -172
  40. data/lib/bio/db/sam.rb +639 -589
  41. data/lib/bio/db/sam/{faidx.rb → faidx_old.rb} +0 -0
  42. data/lib/bio/db/vcf.rb +69 -68
  43. data/test/.gitignore +1 -0
  44. data/test/{test_basic.rb → old_test_basic.rb} +33 -1
  45. data/test/samples/small/dupes.bam +0 -0
  46. data/test/samples/small/dupes.sam +274 -0
  47. data/test/samples/small/map_for_reheader.sam +8 -0
  48. data/test/samples/small/map_to_merge1.bam +0 -0
  49. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  50. data/test/samples/small/map_to_merge1.sam +8 -0
  51. data/test/samples/small/map_to_merge2.bam +0 -0
  52. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  53. data/test/samples/small/map_to_merge2.sam +8 -0
  54. data/test/samples/small/no_md.sam +8 -0
  55. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  56. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  57. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  58. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  59. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  60. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  61. data/test/samples/small/test_cov.svg +273 -0
  62. data/test/samples/small/testu.bam.bai +0 -0
  63. data/test/svg +133 -0
  64. data/test/test_pileup.rb +84 -0
  65. data/test/test_sam.rb +331 -0
  66. data/test/test_vcf.rb +11 -0
  67. data/{doc → tutorial}/tutorial.html +0 -0
  68. data/{doc → tutorial}/tutorial.pdf +0 -0
  69. metadata +56 -114
  70. data/doc/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -159
  71. data/doc/Bio/DB/SAM/Tools/Bam1T.html +0 -220
  72. data/doc/Bio/DB/SAM/Tools/BamHeaderT.html +0 -249
  73. data/doc/Bio/DB/SAM/Tools/BamPileup1T.html +0 -159
  74. data/doc/Bio/DB/SAM/Tools/SamfileT.html +0 -171
  75. data/doc/Bio/DB/SAM/Tools/SamfileTX.html +0 -159
  76. data/doc/Bio/DB/SAMException.html +0 -205
  77. data/doc/LibC.html +0 -155
  78. data/doc/Pileup.html +0 -571
  79. data/doc/Vcf.html +0 -473
  80. data/doc/basic_styles.css +0 -31
  81. data/doc/classes/Bio.html +0 -139
  82. data/doc/classes/Bio/DB.html +0 -137
  83. data/doc/classes/Bio/DB/Alignment.html +0 -441
  84. data/doc/classes/Bio/DB/Alignment.src/M000012.html +0 -19
  85. data/doc/classes/Bio/DB/Alignment.src/M000013.html +0 -27
  86. data/doc/classes/Bio/DB/Alignment.src/M000014.html +0 -45
  87. data/doc/classes/Bio/DB/Alignment.src/M000015.html +0 -40
  88. data/doc/classes/Bio/DB/SAM.html +0 -510
  89. data/doc/classes/Bio/DB/SAM/Library.html +0 -135
  90. data/doc/classes/Bio/DB/SAM/Library.src/M000006.html +0 -28
  91. data/doc/classes/Bio/DB/SAM/Tools.html +0 -278
  92. data/doc/classes/Bio/DB/SAM/Tools.src/M000007.html +0 -20
  93. data/doc/classes/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -111
  94. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.html +0 -150
  95. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.src/M000010.html +0 -20
  96. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.html +0 -169
  97. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000008.html +0 -19
  98. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000009.html +0 -18
  99. data/doc/classes/Bio/DB/SAM/Tools/BamPileup1T.html +0 -111
  100. data/doc/classes/Bio/DB/SAM/Tools/SamfileT.html +0 -129
  101. data/doc/classes/Bio/DB/SAM/Tools/SamfileTX.html +0 -111
  102. data/doc/classes/Bio/DB/SAMException.html +0 -140
  103. data/doc/classes/Bio/DB/SAMException.src/M000016.html +0 -18
  104. data/doc/classes/Bio/DB/Sam.src/M000017.html +0 -43
  105. data/doc/classes/Bio/DB/Sam.src/M000018.html +0 -42
  106. data/doc/classes/Bio/DB/Sam.src/M000019.html +0 -18
  107. data/doc/classes/Bio/DB/Sam.src/M000020.html +0 -22
  108. data/doc/classes/Bio/DB/Sam.src/M000021.html +0 -19
  109. data/doc/classes/Bio/DB/Sam.src/M000022.html +0 -25
  110. data/doc/classes/Bio/DB/Sam.src/M000023.html +0 -28
  111. data/doc/classes/Bio/DB/Sam.src/M000024.html +0 -28
  112. data/doc/classes/Bio/DB/Sam.src/M000025.html +0 -46
  113. data/doc/classes/Bio/DB/Sam.src/M000026.html +0 -24
  114. data/doc/classes/Bio/DB/Sam.src/M000027.html +0 -19
  115. data/doc/classes/Bio/DB/Sam.src/M000028.html +0 -24
  116. data/doc/classes/Bio/DB/Sam.src/M000029.html +0 -41
  117. data/doc/classes/Bio/DB/Sam.src/M000030.html +0 -31
  118. data/doc/classes/Bio/DB/Sam.src/M000031.html +0 -86
  119. data/doc/classes/Bio/DB/Sam.src/M000032.html +0 -34
  120. data/doc/classes/Bio/DB/Tag.html +0 -160
  121. data/doc/classes/Bio/DB/Tag.src/M000011.html +0 -21
  122. data/doc/classes/LibC.html +0 -105
  123. data/doc/classes/Pileup.html +0 -374
  124. data/doc/classes/Pileup.src/M000001.html +0 -34
  125. data/doc/classes/Pileup.src/M000002.html +0 -21
  126. data/doc/classes/Pileup.src/M000003.html +0 -21
  127. data/doc/classes/Pileup.src/M000004.html +0 -21
  128. data/doc/classes/Pileup.src/M000005.html +0 -31
  129. data/doc/files/lib/bio-samtools_rb.html +0 -109
  130. data/doc/files/lib/bio/db/sam/bam_rb.html +0 -108
  131. data/doc/files/lib/bio/db/sam/faidx_rb.html +0 -108
  132. data/doc/files/lib/bio/db/sam/library_rb.html +0 -101
  133. data/doc/files/lib/bio/db/sam/pileup_rb.html +0 -178
  134. data/doc/files/lib/bio/db/sam/sam_rb.html +0 -113
  135. data/doc/files/lib/bio/db/sam_rb.html +0 -111
  136. data/doc/fr_class_index.html +0 -43
  137. data/doc/fr_file_index.html +0 -33
  138. data/doc/fr_method_index.html +0 -58
  139. data/doc/lib/bio-samtools_rb.html +0 -115
  140. data/doc/lib/bio/db/pileup_rb.html +0 -171
  141. data/doc/lib/bio/db/sam/bam_rb.html +0 -121
  142. data/doc/lib/bio/db/sam/faidx_rb.html +0 -117
  143. data/doc/lib/bio/db/sam/library_rb.html +0 -115
  144. data/doc/lib/bio/db/sam/pileup_rb.html +0 -171
  145. data/doc/lib/bio/db/sam/sam_rb.html +0 -121
  146. data/doc/lib/bio/db/sam/vcf_rb.html +0 -124
  147. data/doc/lib/bio/db/sam_rb.html +0 -115
  148. data/doc/lib/bio/db/vcf_rb.html +0 -124
  149. data/doc/rdoc-style.css +0 -208
  150. data/lib/bio/db/sam/bam.rb +0 -210
  151. data/lib/bio/db/sam/sam.rb +0 -86
  152. data/test/samples/pipe_char/test.bam +0 -0
  153. data/test/samples/pipe_char/test.bam.bai +0 -0
  154. data/test/samples/pipe_char/test.tam +0 -10
  155. data/test/samples/pipe_char/test_chr.fasta +0 -1000
  156. data/test/samples/pipe_char/test_chr.fasta.fai +0 -1
  157. data/test/samples/small/test +0 -0
  158. data/test/samples/small/test.bam +0 -0
  159. data/test/samples/small/test.fa +0 -20
  160. data/test/samples/small/test.fai +0 -0
@@ -1,655 +1,705 @@
1
- require 'bio/db/sam/library'
2
- require 'bio/db/sam/bam'
3
- require 'bio/db/sam/faidx'
4
- require 'bio/db/sam/sam'
5
- #require 'bio/db/pileup'
6
- #require 'bio/db/vcf'
7
- require 'systemu'
8
-
9
- module LibC
10
- extend FFI::Library
11
- ffi_lib FFI::Library::LIBC
12
- attach_function :free, [ :pointer ], :void
13
- # call #attach_function to attach to malloc, free, memcpy, bcopy, etc.
14
- end
15
-
16
1
  module Bio
17
2
  class DB
18
3
  class Sam
19
- attr_reader :sam_file
4
+ attr_accessor :bam, :fasta, :samtools, :bcftools, :last_command
20
5
  attr_accessor :minumum_ratio_for_iup_consensus
21
6
  attr_reader :cached_regions
7
+ #attr_accessor :pileup_cache
22
8
  @minumum_ratio_for_iup_consensus = 0.20
9
+ BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
10
+
11
+ #Creates a new Bio::DB::Sam object
12
+ #* fasta [String] - the path to the Fasta reference sequence
13
+ #* bam [String] - path to bam files
14
+ #* samtools [String] - path to alternative installation of samtools
15
+ #* bcftools [String] - path to alternative installation of bcftools
16
+ #* returns [Bio::DB::Sam] a new `Bio::DB::Sam` object
17
+ def initialize(args)
18
+ @fasta = args[:fasta]
19
+ @bam = args[:bam]
20
+ @samtools = args[:samtools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
21
+ @bcftools = args[:bcftools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','bcftools')
22
+
23
+ @files = [@files] if @files.instance_of?(String)
24
+
25
+ @last_command = nil
26
+ raise ArgumentError, "Need Fasta and at least one BAM or SAM" if not @fasta or not @bam
27
+ raise IOError, "File not found #{files}" if not files_ok?
28
+ @bams = [@bams] if @bams.instance_of? String
23
29
 
24
- # To make a new sam object. Initialize expects a hash optsa with the following elemets:
25
- # fasta:: The fasta file with the reference. (nil)
26
- # bam:: path to a binary SAM file (nil)
27
- # tam:: path to a text SAM file (nil)
28
- # compressed:: If the binary file is compressed (true)
29
- # write:: If the file is to be writen (false). Not supported yet.
30
- # *NOTE:* you can't use binary and text formats simultaneusly. To make queries, the file has to be a sorted binary.
31
- # This function doesn't actually open the file, it just prepares the object to be opened in a later stage.
32
- #
33
- def initialize(optsa={})
34
- opts = { :fasta => nil, :bam => nil,:tam => nil, :compressed => true, :write => false }.merge!(optsa)
30
+ end
31
+
32
+ #backward compatibility method; returns true if the file exists, otherwise complains and quits.
33
+ def open
34
+ files_ok?
35
+ end
35
36
 
37
+ #runs the samtools view command
38
+ #* b - output BAM
39
+ #* h - print header for the SAM output
40
+ #* H - print header only (no alignments)
41
+ #* S - input is SAM
42
+ #* u - uncompressed BAM output (force -b)
43
+ #* one - fast compression (force -b)
44
+ #* x - output FLAG in HEX (samtools-C specific)
45
+ #* X - output FLAG in string (samtools-C specific)
46
+ #* c - print only the count of matching records
47
+ #* B - collapse the backward CIGAR operation
48
+ #* at - INT number of BAM compression threads [0]
49
+ #* L - FILE output alignments overlapping the input BED FILE [null]
50
+ #* t - FILE list of reference names and lengths (force -S) [null]
51
+ #* T - FILE reference sequence file (force -S) [null]
52
+ #* o - FILE output file name [stdout]
53
+ #* R - FILE list of read groups to be outputted [null]
54
+ #* f - INT required flag 0 for unset [0]
55
+ #* F - INT filtering flag 0 for unset [0]
56
+ #* q - INT minimum mapping quality [0]
57
+ #* l - STR only output reads in library STR [null]
58
+ #* r - STR only output reads in read group STR [null]
59
+ #* s - FLOAT fraction of templates to subsample; integer part as seed [-1]
60
+ #* chr - name of reference sequence to get alignments from
61
+ #* start - start position on reference sequence
62
+ #* stop - end position on reference sequence
63
+ def view(opts={},&block)
64
+ region = String.new
65
+ if opts[:chr] and opts[:start] and opts[:stop]
66
+ region = "#{opts[:chr]}:#{opts[:start]}-#{opts[:stop]}"
67
+ [:chr, :start, :stop].each {|o| opts.delete(o)}
68
+ end
69
+ if opts[:at]
70
+ opts['@'] = opts[:at]
71
+ opts.delete(:at)
72
+ end
73
+
74
+ if opts[:one]
75
+ opts['1'] = opts[:one]
76
+ opts.delete(:one)
77
+ end
78
+
79
+ command = form_opt_string(@samtools, 'view', opts, [:b, :h, :H, :S, :u, '1', :x, :X, :c, :B]) + " " + region
80
+ @last_command = command
81
+ type = (opts[:u] or opts[:b]) ? :binary : :text
82
+ klass = (type == :binary) ? String : Bio::DB::Alignment
83
+ yield_from_pipe(command, klass, type, &block)
84
+ end
36
85
 
86
+ #fetches a subsequence and calls code block
87
+ #* chr - the reference name for the subsequence
88
+ #* start - the start position for the subsequence
89
+ #* stop - the stop position for the subsequence
90
+ #* &block - the block of code to execute
91
+ def fetch(chr, start,stop, &block)
92
+ view(
93
+ :chr => chr,
94
+ :start => start,
95
+ :stop => stop,
96
+ &block
97
+ )
98
+ end
37
99
 
38
- @fasta_path = opts[:fasta]
39
- @compressed = opts[:compressed]
40
- @write = opts[:write]
41
- bam = opts[:bam]
42
- tam = opts[:tam]
100
+ alias_method :fetch_with_function, :fetch
101
+
102
+ #returns an array of coverage for each location for which there are mapped reads
103
+ #* chr - the reference name
104
+ #* start - the start position
105
+ #* length - the length of the region queried
106
+ def chromosome_coverage(chr,start,length)
107
+ result = []
108
+ region = "#{chr}:#{start}-#{start + length}"
109
+ self.mpileup(:r => region) do |p|
110
+ result << p.coverage
111
+ end
112
+ result
113
+ end
43
114
 
44
- if bam == nil && tam == nil && @fasta_path == nil then
45
- raise SAMException.new(), "No alignment or reference file"
46
- elsif bam != nil && tam != nil then
47
- raise SAMException.new(), "Alignment has to be in either text or binary format, not both"
48
- elsif bam != nil then
49
- @binary = true
50
- @sam = bam
51
- elsif tam != nil then
52
- @sam = tam
53
- @binary = false
54
115
 
116
+ #returns an svg file or object, plotting coverage for each location for which there are mapped reads
117
+ #* chr - the reference name
118
+ #* start - the start position
119
+ #* length - the length of the region queried
120
+ #OPTIONS
121
+ #* bin - the amount of bins to split the histogram into. The arithmetic mean score for each bin will be plotted. [default 30 bins]
122
+ #* svg - a file to write the svg image to [default a String object containing the SVG]
123
+ def plot_coverage(chr,start,length, opts={})
124
+ if opts[:bin]
125
+ bin = length/opts[:bin]
126
+ else
127
+ bin = length/30
128
+ end
129
+ result = []
130
+ region = "#{chr}:#{start}-#{start + length}"
131
+ self.mpileup(:r => region) do |p|
132
+ result << p.coverage
133
+ end
134
+ p = Bio::Graphics::Page.new(:width => 1000,
135
+ :height => 200,
136
+ :number_of_intervals => 10,
137
+ :font_size => 14
138
+ )
139
+ data_track = p.add_track(:glyph => :histogram,
140
+ :stroke_color => 'black',
141
+ :fill_color => 'gold',
142
+ :track_height => 150,
143
+ :name => 'read coverage',
144
+ :label => true,
145
+ :stroke_width => '1',
146
+ :x_round => 1,
147
+ :y_round => 1 )
148
+ index = 0;
149
+ result.each_slice(bin) {|slice|
150
+ #result.each_with_index {|val, index|
151
+ data_feature = Bio::Graphics::MiniFeature.new(:start => start + index,
152
+ :end => (start + index + bin),
153
+ :segment_height => slice.inject{|sum,x| sum + x }.to_f / slice.size)
154
+ data_track.add(data_feature)
155
+ index+=bin
156
+ }
157
+ if opts[:svg]
158
+ svg = opts[:svg].to_s
159
+ p.write(svg)
160
+ else
161
+ return p.get_markup
55
162
  end
56
- @fasta_file = nil
57
- @sam_file = nil
58
163
 
59
- ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
164
+
165
+ end
166
+
167
+ #returns the average coverage over the region queried
168
+ #* chr - the reference name
169
+ #* start - the start position
170
+ #* length - the length of the region queried
171
+ def average_coverage(chr,start,length)
172
+ arr = self.chromosome_coverage(chr,start,length)
173
+ arr.inject{ |sum, el| sum + el }.to_f / arr.size
60
174
  end
61
-
62
- #Function that actually opens the sam file
63
- #Throws a SAMException if the file can't be open.
64
- def open()
65
175
 
66
- raise SAMException.new(), "Writing not supported yet" if @write
67
- raise SAMException.new(), "No SAM file specified" unless @sam
176
+ #returns a Bio::DB::Pileup or Bio::DB::VCF object
177
+ #* region - Only generate pileup in region [chrom:start-stop]
178
+ #* illumina_quals - Assume the quality is in the Illumina 1.3+ encoding
179
+ #* count_anomalous - Do not skip anomalous read pairs in variant calling
180
+ #* no_baq - Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments.
181
+ #* adjust_mapq - [INT] Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0]
182
+ #* max_per_bam_depth - [INT] At a position, read maximally INT reads per input BAM. [250]
183
+ #* extended_baq - Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt specificity a little bit.
184
+ #* exclude_reads_file - [FILE] exclude read groups listed in FILE [null]
185
+ #* list_of_positions - [FILE] BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null]
186
+ #* mapping_quality_cap - [INT] cap mapping quality at INT [60]
187
+ #* ignore_rg - ignore read group tags
188
+ #* min_mapping_quality - [INT] skip alignments with mapQ smaller than INT [0]
189
+ #* min_base_quality - [INT] skip bases with baseQ/BAQ smaller than INT [13]
190
+ #* ##following options are for the -g -u option
191
+ #* genotype_calling - generate BCF output (genotype likelihoods)
192
+ #* uncompressed_bcf - generate uncompressed BCF output
193
+ #* extension_sequencing_probability - [INT] Phred-scaled gap extension seq error probability [20]
194
+ #* homopolymer_error_coefficient - [INT] coefficient for homopolymer errors [100]
195
+ #* no_indels - do not perform indel calling
196
+ #* skip_indel_over_average_depth - [INT] max per-sample depth for INDEL calling [250]
197
+ #* gap_open_sequencing_error_probability - [INT] Phred-scaled gap open sequencing error probability [40]
198
+ #* platforms - [STRING] comma separated list of platforms for indels [all]
199
+ def mpileup(opts={}, &block)
200
+ #long option form to short samtools form..
201
+ long_opts = {
202
+ :region => :r,
203
+ :illumina_quals => :six,
204
+ :count_anomalous => :A,
205
+ :no_baq => :B,
206
+ :adjust_mapq => :C,
207
+ :max_per_bam_depth => :d,
208
+ :extended_baq => :E,
209
+ :exclude_reads_file => :G,
210
+ :list_of_positions => :l,
211
+ :mapping_quality_cap => :M,
212
+ :ignore_rg => :R,
213
+ :min_mapping_quality => :q,
214
+ :min_base_quality => :Q,
215
+ ###following options are for the -g -u option
216
+ :genotype_calling => :g,
217
+ :uncompressed_bcf => :u,
218
+ :extension_sequencing_probability => :e,
219
+ :homopolymer_error_coefficient => :h,
220
+ :no_indels => :I,
221
+ :skip_indel_over_average_depth => :L,
222
+ :gap_open_sequencing_error_probability => :o,
223
+ :platforms => :P
224
+ }
68
225
 
69
- opts = @write ? "w" : "r"
70
- if @binary then
71
- opts += "b"
72
- if @write then
73
- unless @compressed then
74
- opts += "u"
75
- end
226
+ ##convert any long_opts to short opts
227
+ temp_opts = opts.dup
228
+ opts.each_pair do |k,v|
229
+ if long_opts[k]
230
+ temp_opts[long_opts[k]] = v
231
+ temp_opts.delete(k)
76
232
  end
77
233
  end
78
- valid = ["r", "w", "wh", "rb", "wb" , "wbu"]
79
- unless valid.include?(opts) then
80
- raise SAMException.new(), "Invalid options for samopen: " + opts
234
+ opts = Hash.new
235
+ #To remove any unwanted options.
236
+ long_opts.each_pair do |k,v|
237
+ opts[v] = temp_opts[v] if temp_opts.has_key?(v)
238
+ end
239
+
240
+ # opts = temp_opts
241
+ opts[:u] = true if opts[:g] #so that we always get uncompressed output
242
+ opts.delete(:g)
243
+
244
+ opts[:f] = @fasta
245
+
246
+
247
+ query = opts[:r].to_s
248
+ query = opts[:r].to_region.to_s if opts[:r].respond_to?(:to_region)
249
+ opts[:r] = query
250
+
251
+ if opts[:six]
252
+ opts["6"] = nil
253
+ opts.delete(:six)
81
254
  end
82
255
 
83
- samFile = Bio::DB::SAM::Tools.samopen(@sam, opts, nil)
84
- if samFile.null? then
85
- @sam_file = nil
86
- raise SAMException.new(), "File not opened: " + @sam
256
+ command = form_opt_string(@samtools, "mpileup", opts, [:R, :B, :E, "6", :A, :g, :u, :I] )
257
+ puts command if $VERBOSE
258
+ if opts[:u]
259
+ command = command + " | #{@bcftools} view -cg -"
87
260
  end
88
- @sam_file = Bio::DB::SAM::Tools::SamfileT.new(samFile)
261
+
262
+ klass = opts[:u] ? Bio::DB::Vcf : Bio::DB::Pileup
263
+ @last_command = command
264
+ yield_from_pipe(command, klass, :text, &block)
89
265
 
90
266
  end
91
267
 
92
- #Prints a description of the sam file in a text format containg if it is binary or text, the path
93
- #and the fasta file of the reference
94
- def to_s()
95
- (@binary ? "Binary" : "Text") + " file: " + @sam + " with fasta: " + @fasta_path
268
+ #fetches a subsequence from a reference genome and optionally returns it as a Bio::Sequence::NA object
269
+ #* chr - [STRING] the reference name for the subsequence
270
+ #* start - [INT] the start position for the subsequence
271
+ #* stop - [INT] the stop position for the subsequence
272
+ #* as_bio - boolean stating if the returned object should be a Bio::Sequence::NA object
273
+ def fetch_reference(chr,start,stop, opts={:as_bio => false})
274
+ seq = ""
275
+ unless @fasta #We return a string of Ns if we don't know the reference.
276
+ seq = "n" * (stop-start)
277
+ else
278
+ command = "#{@samtools} faidx #{@fasta} '#{chr}:#{start}-#{stop}'"
279
+ puts command if $VERBOSE
280
+ @last_command = command
281
+ seq = ""
282
+ yield_from_pipe(command, String, :text ) {|line| seq = seq + line unless line =~ /^>/}
283
+ end
284
+
285
+ if opts[:as_bio]
286
+ seq = Bio::Sequence::NA.new(seq).to_fasta("#{chr}:#{start}-#{stop}")
287
+ end
288
+ seq
96
289
  end
97
290
 
98
- #Closes the sam file and destroys the C pointers using the functions provided by libbam
99
- def close()
100
- Bio::DB::SAM::Tools.fai_destroy(@fasta_index) unless @fasta_index.nil? || @fasta_index.null?
101
- Bio::DB::SAM::Tools.bam_index_destroy(@sam_index) unless @sam_index.nil? || @sam_index.null?
102
- Bio::DB::SAM::Tools.samclose(@sam_file) unless @sam_file.nil?
103
- @sam_file = nil
104
- @fasta_index = nil
291
+ #Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk. If regions are specified, the subsequences will be retrieved and printed to stdout in the FASTA format.
292
+ #Options - if a subsequence is required
293
+ #* chr - [STRING] the reference name of the subsequence
294
+ #* start - [INT] the start position for the subsequence
295
+ #* stop - [INT] the stop position for the subsequence
296
+ def faidx(opts={})
297
+ if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
298
+ opts={:as_bio => false}
299
+ self.fetch_reference(:chr,:start,:stop,opts)
300
+ else
301
+ command = "#{@samtools} faidx #{@fasta}"
302
+ @last_command = command
303
+ system(command)
304
+ end
105
305
  end
106
306
 
107
- # Destructor method that closes the file before letting the object be garbage collected.
108
- def Sam.finalize(id)
109
- id.close()
110
- puts "Finalizing #{id} at #{Time.new}"
307
+ #Index sorted alignment for fast random access. Index file <aln.bam>.bai will be created if no out_index is provided.
308
+ #* out_index - [STRING] name of index
309
+ def index(opts={})
310
+ command = "#{@samtools} index #{@bam} #{opts[:out_index]}"
311
+ puts command if $VERBOSE
312
+ @last_command = command
313
+ system(command)
111
314
  end
112
315
 
113
- #Loads the bam index to be used for fetching. If the index doesn't exists the index is built provided that
114
- #the user has writing access to the folder where the BAM file is located. If the creation of the file fails
115
- #a SAMException is thrown.
116
- #If the index doesn't exist, loading it will take more time. It is suggested to generate the index separatedly
117
- #if the bam file sits on a server where the executing user may not have writing permissions in the server.
118
- def load_index()
119
- raise SAMException.new(), "Indexes are only supported by BAM files, please use samtools to convert your SAM file" unless @binary
120
- @sam_index = Bio::DB::SAM::Tools.bam_index_load(@sam)
121
- if @sam_index.null? then
122
- p "Generating index for: " + @sam
123
- Bio::DB::SAM::Tools.bam_index_build(@sam)
124
- @sam_index = Bio::DB::SAM::Tools.bam_index_load(@sam)
125
- raise SAMException.new(), "Unable to generate bam index for: " + @sam if @sam_index.nil? || @sam_index.null?
316
+ #Fill in mate coordinates, ISIZE and mate related flags from a name-sorted alignment
317
+ #* out_bam name of outfile
318
+ #* r - remove unmapped reads and secondary alignments
319
+ def fix_mates(opts={})
320
+ #opts.merge!({:out_index=>nil})
321
+ remove_reads = ""
322
+ if opts[:r]
323
+ remove_reads = "-r"
126
324
  end
325
+ command = "#{@samtools} fixmate #{remove_reads} #{@bam} #{opts[:out_bam]}"
326
+ puts command if $VERBOSE
327
+ @last_command = command
328
+ system(command)
329
+ end
330
+
331
+ alias_method :fixmate, :fix_mates
332
+
333
+ #generate simple stats with regard to the number and pairing of reads mapped to a reference
334
+ def flag_stats(opts={})
335
+ command = form_opt_string(@samtools, "flagstat", opts, [])
336
+ puts command if $VERBOSE
337
+ @last_command = command
338
+ strings = []
339
+ yield_from_pipe(command,String) {|line| strings << line.chomp}
340
+ strings
127
341
  end
128
342
 
129
- #Loads the reference file to be able to query regions of it. This requires the fai index to exist in the same
130
- #folder than the reference. If it doesn't exisits, this functions attempts to generate it. If user doesn't
131
- #have writing permissions on the folder, or the creation of the fai fails for any reason, a SAMException is thrown.
132
- def load_reference()
133
- raise SAMException.new(), "No path for the refernce fasta file. " if @fasta_path.nil?
343
+ alias_method :flagstat, :flag_stats
344
+
345
+ #Retrieve and print stats in the index file. The output is TAB delimited with each line consisting of reference sequence name, sequence length, number of mapped reads and number of unmapped reads.
346
+ def index_stats
347
+ stats = {}
348
+ command = form_opt_string(@samtools, "idxstats #{@bam}", {}, [])
349
+ @last_command = command
350
+ puts command if $VERBOSE
351
+ yield_from_pipe(command, String, :text, true, "#") do |line|
352
+ info = line.chomp.split(/\t/)
353
+ stats[ info[0] ] = {:length => info[1].to_i, :mapped_reads => info[2].to_i, :unmapped_reads => info[3].to_i }
354
+ end
355
+ stats
356
+ end
134
357
 
135
- @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
358
+ alias_method :idxstats, :index_stats
359
+
360
+ #Merge multiple sorted alignments
361
+ #* n - sort by read names
362
+ #* r - attach RG tag (inferred from file names)
363
+ #* u - uncompressed BAM output
364
+ #* f - overwrite the output BAM if exist
365
+ #* one - compress level 1
366
+ #* l - [INT] compression level, from 0 to 9 [-1]
367
+ #* at - [INT] number of BAM compression threads [0]
368
+ #* R - [STRING] merge file in the specified region STR [all]
369
+ #* h - [FILE] copy the header in FILE to <out.bam> [in1.bam]
370
+ #* out - [FILE] out file name
371
+ #* bams - [FILES] or Bio::DB::Sam list of input bams, or Bio::DB::Sam objects
372
+ def merge(opts={})
373
+ if opts[:one]
374
+ opts['1'] = nil
375
+ opts.delete(:one)
376
+ end
136
377
 
137
- if @fasta_index.null? then
138
- p "Generating index for: " + @fasta_path
139
- Bio::DB::SAM::Tools.fai_build(@fasta_path)
140
- @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
141
- raise SAMException.new(), "Unable to generate fasta index for: " + @fasta_path if @fasta_index.nil? || @fasta_index.null?
378
+ if opts[:at]
379
+ opts['@'] = opts[:at]
380
+ opts.delete(:at)
142
381
  end
143
382
 
383
+ out = opts[:out]
384
+ opts.delete(:out)
385
+
386
+ bam_list = opts[:bams].collect do |b|
387
+ b.bam rescue b
388
+ end.join(' ')
389
+
390
+ opts.delete(:bams)
391
+ options = commandify(opts, [:n, :r, :u, :f, '1'] )
392
+ command = "#{@samtools} merge #{options} #{out} #{bam_list}"
393
+
394
+ @last_command = command
395
+ puts command puts command if $VERBOSE
396
+ system(command)
397
+
144
398
  end
145
399
 
146
- #Returns the average coverage of a region in a bam file.
147
- def average_coverage(chromosome, qstart, len)
400
+ #Concatenate BAMs. The sequence dictionary of each input BAM must be identical.
401
+ #* h - header.sam
402
+ #* out -[FILE] out file name
403
+ #* bams -[FILES] or Bio::DB::Sam list of input bams, or Bio::DB::Sam objects
404
+ def cat(opts={})
405
+ out = opts[:out]
406
+ opts.delete(:out)
407
+
408
+ bam_list = opts[:bams].collect do |b|
409
+ b.bam rescue b
410
+ end.join(' ')
411
+ opts.delete(:bams)
412
+ options = commandify(opts, [:h] )
413
+ command = "#{@samtools} cat #{options} -o #{out} #{bam_list}"
414
+ puts command
415
+ @last_command = command
416
+ system(command)
148
417
 
149
- #reference = fetch_reference(chromosome, qstart,len)
150
- # len = reference.length if len > reference.length
418
+ end
151
419
 
420
+ #* program - one of 'samtools' 'bcftools'
421
+ #* command - one of the commands relevant to the program
422
+ def self.docs(program, command)
423
+ return "program must be 'samtools' or 'bcftools'" if not ['samtools', 'bcftools'].include? program
424
+ command = "#{program} #{command}"
425
+ `#{command}`
426
+ end
152
427
 
153
- coverages = chromosome_coverage(chromosome, qstart, len)
154
- total = 0
155
- len.times{ |i| total= total + coverages[i] }
156
- avg_cov = total.to_f / len
157
- #LibC.free reference
158
- avg_cov
428
+ #Remove potential PCR duplicates: if multiple read pairs have identical external coordinates, only retain the pair with highest mapping quality.
429
+ #* s - rmdup for SE reads
430
+ #* S - treat PE reads as SE in rmdup (force -s)
431
+ #* out - [FILE] output bam
432
+ def remove_duplicates(opts={})
433
+ out = opts[:out]
434
+ opts.delete(:out)
435
+ command = "#{form_opt_string(@samtools, "rmdup", opts, [:s, :S])} #{out} #{@bam}"
436
+ @last_command = command
437
+ system(command)
159
438
  end
160
439
 
161
- #Returns an array with the coverage at each possition in the queried region
162
- #This is a simple average coverage just calculated with the first and last
163
- #possition of the alignment, ignoring the gaps.
164
- def chromosome_coverage(chromosome, qstart, len)
165
-
166
- coverages = Array.new(len, 0)
440
+ alias_method :rmdup, :remove_duplicates
441
+
442
+ #Sort alignments by leftmost coordinates
443
+ #* n - sort by read name
444
+ #* f - use <out.prefix> as full file name instead of prefix
445
+ #* o - final output to stdout returns bio::db::alignment
446
+ #* l - [INT] compression level, from 0 to 9 [-1]
447
+ #* at - [INT] number of sorting and compression threads [1]
448
+ #* m - [INT] max memory per thread; suffix K/M/G recognized [768M]
449
+ #* prefix - [STRING] prefix for output bamfile
450
+ def sort(opts={})
451
+ if !opts.has_key?(:prefix)
452
+ opts.merge!({:prefix => "sorted"})
453
+ end
454
+ prefix = opts[:prefix]
455
+ opts.delete(:prefix)
456
+ command = form_opt_string(@samtools, "sort", opts, [:n, :f, :o])
457
+ command = command + " " + prefix
458
+ @last_command = command
459
+ puts command if $VERBOSE
460
+ if opts[:o]
461
+ yield_from_pipe(command, Bio::DB::Alignment)
462
+ else
463
+ system(command)
464
+ end
465
+ end
167
466
 
168
- chr_cov_proc = Proc.new do |alignment|
169
- last = alignment.calend - qstart
170
- first = alignment.pos - qstart
171
- if last < first
172
- tmp = last
173
- last = first
174
- first = last
175
- end
467
+ #used to generate a text alignment viewer
468
+ #* d - display, output as (H)tml or (C)urses or (T)ext
469
+ #* p - [chr:pos] go directly to this position
470
+ #* s - [STR] display only reads from this sample or group
471
+ def tview(opts={})
472
+ if opts[:d]
473
+ opts['d'] = opts[:d]
474
+ opts.delete(:d)
475
+ end
476
+ if opts[:p]
477
+ opts['p'] = opts[:p]
478
+ opts.delete(:p)
479
+ end
480
+ if opts[:s]
481
+ opts['s'] = opts[:s]
482
+ opts.delete(:s)
483
+ end
484
+ command = "#{form_opt_string(@samtools, "tview", opts)}"
485
+ puts command if $VERBOSE
486
+ @last_command = command
487
+ system(command)
488
+ end
176
489
 
177
- # STDERR.puts "#{first} #{last}\n"
178
- first.upto(last-1) { |i|
179
-
180
- coverages[i-1] = 1 + coverages[i-1] if i-1 < len && i > 0
181
- }
182
- end
183
-
184
- fetch_with_function(chromosome, qstart, qstart+len, chr_cov_proc)
185
- #p coverages
186
- coverages
187
- end
188
-
189
- #Returns the sequence for a given region.
190
- def fetch_reference(chromosome, qstart,qend)
191
- load_reference if @fasta_index.nil? || @fasta_index.null?
192
- query = query_string(chromosome, qstart,qend)
193
- len = FFI::MemoryPointer.new :int
194
- reference = Bio::DB::SAM::Tools.fai_fetch(@fasta_index, query, len)
195
- raise SAMException.new(), "Unable to get sequence for reference: "+query if reference.nil?
196
-
197
- reference
198
- end
199
-
200
- #Generates a query sting to be used by the region parser in samtools.
201
- #In principle, you shouldn't need to use this function.
202
- def query_string(chromosome, qstart,qend)
203
- query = chromosome + ":" + qstart.to_s + "-" + qend.to_s
204
- query
205
- end
206
-
207
- #Returns an array of Alignments on a given region.
208
- def fetch(chromosome, qstart, qend)
209
- als = Array.new
210
- fetchAlignment = Proc.new do |alignment|
211
- als.push(alignment.clone)
212
- 0
213
- end
214
- fetch_with_function(chromosome, qstart, qend, fetchAlignment)
215
- als
216
- end
217
-
218
- #Executes a function on each Alignment inside the queried region of the chromosome. The chromosome
219
- #can be either the textual name or a FixNum with the internal index. However, you need to get the
220
- #internal index with the provided API, otherwise the pointer is outside the scope of the C library.
221
- #Returns the count of alignments in the region.
222
- #WARNING: Accepts an index already parsed by the library. It fails when you use your own FixNum (FFI-bug?)
223
- def fetch_with_function(chromosome, qstart, qend, function)
224
- load_index if @sam_index.nil? || @sam_index.null?
225
- chr = FFI::MemoryPointer.new :int
226
- beg = FFI::MemoryPointer.new :int
227
- last = FFI::MemoryPointer.new :int
228
- query = query_string(chromosome, qstart,qend)
229
- qpointer = FFI::MemoryPointer.from_string(query)
230
- header = @sam_file[:header]
231
- Bio::DB::SAM::Tools.bam_parse_region(header,qpointer, chr, beg, last)
232
- #raise SAMException.new(), "invalid query: " + query if(chr.read_int < 0)
233
- count = 0;
234
-
235
- fetchAlignment = Proc.new do |bam_alignment, data|
236
- alignment = Alignment.new
237
- alignment.set(bam_alignment, header)
238
- function.call(alignment)
239
- count = count + 1
240
- 0
241
- end
242
- Bio::DB::SAM::Tools.bam_fetch(@sam_file[:x][:bam], @sam_index,chr.read_int,beg.read_int, last.read_int, nil, fetchAlignment)
243
- #LibC.free chr
244
- #LibC.free beg
245
- #LibC.free last
246
- #LibC.free qpointer
247
- count
248
- end
249
-
250
- #Merges n BAM files. This doesn't require to create a SAM object
251
- #files:: An array with the paths to the files.
252
- #merged_file:: The path to the merged file
253
- #headers:: The BAM file containing the header
254
- #add_RG:: If true, the RG tag is added (infered from the filenames)
255
- #by_qname:: If true, the bamfiles should by ordered by query name, if false, by coordinates.
256
- def self.merge(files, merged_file, headers, add_RG, by_qname)
257
- strptrs = []
258
- strptrs << FFI::MemoryPointer.from_string("merge")
259
- files.each do |file|
260
- strptrs << FFI::MemoryPointer.from_string(file)
261
- end
262
- strptrs << nil
263
-
264
- # Now load all the pointers into a native memory block
265
- argv = FFI::MemoryPointer.new(:pointer, strptrs.length)
266
- strptrs.each_with_index do |p, i|
267
- argv[i].put_pointer(0, p)
268
- end
269
- #void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG)
270
- Bio::DB::SAM::Tools.bam_merge_core(by_qname, merged_file, headers, strptrs.length, argv, add_RG)
271
- end
272
-
273
- #calls the mpileup function, opts is a hash of options identical to the command line options for mpileup.
274
- #is an iterator that yields a Pileup object for each postion
275
- #the command line options that generate/affect BCF/VCF are ignored ie (g,u,e,h,I,L,o,p)
276
- #call the option as a symbol of the flag, eg -r for region is called :r => "some SAM compatible region"
277
- #eg bam.mpileup(:r => "chr1:1000-2000", :q => 50) gets the bases with quality > 50 on chr1 between 1000-5000
278
- def mpileup( opts={})
279
-
280
- raise SAMException.new(), "No BAMFile provided" unless @sam and @binary
281
- raise SAMException.new(), "No FastA provided" unless @fasta_path
282
- #long option form to short samtools form..
283
- long_opts = {
284
- :region => :r,
285
- :illumina_quals => :six,
286
- :count_anomalous => :A,
287
- :no_baq => :B,
288
- :adjust_mapq => :C,
289
- :max_per_bam_depth => :d,
290
- :extended_baq => :E,
291
- :exclude_reads_file => :G,
292
- :list_of_positions => :l,
293
- :mapping_quality_cap => :M,
294
- :ignore_rg => :R,
295
- :min_mapping_quality => :q,
296
- :min_base_quality => :Q
297
- }
298
- ##convert any long_opts to short opts
299
- temp_opts = opts.dup
300
- opts.each_pair do |k,v|
301
- if long_opts[k]
302
- temp_opts[long_opts[k]] = v
303
- temp_opts.delete(k)
304
- end
305
- end
306
- opts = temp_opts
307
- ##remove any calls to -g or -u for mpileup, bcf output is not yet supported
308
- ##and also associated output options
309
- [:g, :u, :e, :h, :I, :L, :o, :p].each {|x| opts.delete(x) }
310
-
311
- sam_opts = []
312
- #strptrs << FFI::MemoryPointer.from_string("mpileup")
313
- opts.each do |k,v|
314
- next unless opts[k] ##dont bother unless the values provided are true..
315
- k = '6' if k == :six
316
- k = '-' + k.to_s
317
- sam_opts << k #strptrs << FFI::MemoryPointer.from_string(k)
318
- sam_opts << v.to_s unless ["-R", "-B", "-E", "-6", "-A"].include?(k) #these are just flags so don't pass a value... strptrs << FFI::MemoryPointer.from_string(v.to_s)
319
- end
320
- sam_exe = File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
321
- sam_opts = sam_opts + ['-f', @fasta_path, @sam]
322
-
323
- sam_opts_string = SystemUniversal.quote(*sam_opts)
324
- cmdline = "#{sam_exe} mpileup #{sam_opts_string}"
325
- status, stdout, stderr = systemu cmdline
326
-
327
- if status.exitstatus == 0
328
- stdout.each_line do |line|
329
- yield Bio::DB::Pileup.new(line)
330
- end
331
- else
332
- raise SAMException.new(), "Error running mpileup. Command line was '#{cmdline}'\nsamtools STDERR was:\n#{stderr}"
333
- end
334
-
335
- #strptrs << FFI::MemoryPointer.from_string('-f')
336
- #strptrs << FFI::MemoryPointer.from_string(@fasta_path)
337
- #strptrs << FFI::MemoryPointer.from_string(@sam)
338
- #strptrs << nil
339
-
340
- # Now load all the pointers into a native memory block
341
- #argv = FFI::MemoryPointer.new(:pointer, strptrs.length)
342
- #strptrs.each_with_index do |p, i|
343
- # argv[i].put_pointer(0, p)
344
- #end
345
-
346
- #old_stdout = STDOUT.clone
347
- #read_pipe, write_pipe = IO.pipe()
348
- #STDOUT.reopen(write_pipe)
349
- #int bam_mpileup(int argc, char *argv[])
350
- # Bio::DB::SAM::Tools.bam_mpileup(strptrs.length - 1,argv)
351
- #if fork
352
- # write_pipe.close
353
- # STDOUT.reopen(old_stdout) #beware .. stdout from other processes eg tests calling this method can get mixed in...
354
- # begin
355
- # while line = read_pipe.readline
356
- # yield Pileup.new(line)
357
- # end
358
- # rescue EOFError
359
- # read_pipe.close
360
- # Process.wait
361
- # end
362
- #end
363
- end
364
-
365
- #experimental method that spawns a samtools mpileup | bcftools view process and supports returning of pileup vcf
366
- ##otherwise works like mpileup
367
- def mpileup_plus( opts )
368
-
369
- raise SAMException.new(), "No BAMFile provided" unless @sam and @binary
370
- raise SAMException.new(), "No FastA provided" unless @fasta_path
371
- #long option form to short samtools form..
372
- long_opts = {
373
- :region => :r,
374
- :illumina_quals => :six,
375
- :count_anomalous => :A,
376
- :no_baq => :B,
377
- :adjust_mapq => :C,
378
- :max_per_bam_depth => :d,
379
- :extended_baq => :E,
380
- :exclude_reads_file => :G,
381
- :list_of_positions => :l,
382
- :mapping_quality_cap => :M,
383
- :ignore_rg => :R,
384
- :min_mapping_quality => :q,
385
- :min_base_quality => :Q,
386
- ###following options are for the -g -u option
387
- :genotype_calling => :g,
388
- :uncompressed_bcf => :u,
389
- :extension_sequencing_probability => :e,
390
- :homopolymer_error_coefficient => :h,
391
- :no_indels => :I,
392
- :skip_indel_over_average_depth => :L,
393
- :gap_open_sequencing_error_probability => :o,
394
- :platforms => :P
395
- }
396
-
397
- ##convert any long_opts to short opts
398
- temp_opts = opts.dup
399
- opts.each_pair do |k,v|
400
- if long_opts[k]
401
- temp_opts[long_opts[k]] = v
402
- temp_opts.delete(k)
403
- end
404
- end
405
- opts = temp_opts
406
- ##remove any calls to -g or -u for mpileup, bcf output is not yet supported
407
- ##and also associated output options
408
- #[:g, :u, :e, :h, :I, :L, :o, :p].each {|x| opts.delete(x) }
409
- opts[:u] = true if opts[:g] #so that we always get uncompressed output
410
- opts.delete(:g)
411
-
412
- sam_opts = []
413
- #strptrs << FFI::MemoryPointer.from_string("mpileup")
414
- opts.each do |k,v|
415
- next unless opts[k] ##dont bother unless the values provided are true..
416
- k = '6' if k == :six
417
- k = '-' + k.to_s
418
- sam_opts << k #strptrs << FFI::MemoryPointer.from_string(k)
419
- sam_opts << v.to_s unless ["-R", "-B", "-E", "-6", "-A", "-g", "-u", "-I"].include?(k) #these are just flags so don't pass a value... strptrs << FFI::MemoryPointer.from_string(v.to_s)
420
- end
421
- sam_opts = sam_opts + ['-f', @fasta_path, @sam]
422
-
423
- command = "#{File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')} mpileup #{sam_opts.join(' ')} 2> /dev/null"
424
- if opts[:u]
425
- command = command + " | #{File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','bcftools')} view -cg -"
426
- end
427
- pipe = IO.popen(command)
428
- $stderr.puts command
429
- if opts[:u]
430
- while line = pipe.gets
431
- next if line[0,1] == '#' #skip any header or meta-lines, we dont do anything with those
432
- yield Bio::DB::Vcf.new(line)
433
- end
434
- else
435
- while line = pipe.gets
436
- yield Bio::DB::Pileup.new(line)
437
- end
438
- end
439
- pipe.close
440
- #strptrs << FFI::MemoryPointer.from_string('-f')
441
- #strptrs << FFI::MemoryPointer.from_string(@fasta_path)
442
- #strptrs << FFI::MemoryPointer.from_string(@sam)
443
- #strptrs << nil
444
-
445
- # Now load all the pointers into a native memory block
446
- #argv = FFI::MemoryPointer.new(:pointer, strptrs.length)
447
- #strptrs.each_with_index do |p, i|
448
- # argv[i].put_pointer(0, p)
449
- #end
450
-
451
- #old_stdout = STDOUT.clone
452
- #read_pipe, write_pipe = IO.pipe()
453
- #STDOUT.reopen(write_pipe)
454
- #int bam_mpileup(int argc, char *argv[])
455
- # Bio::DB::SAM::Tools.bam_mpileup(strptrs.length - 1,argv)
456
- #if fork
457
- # write_pipe.close
458
- # STDOUT.reopen(old_stdout) #beware .. stdout from other processes eg tests calling this method can get mixed in...
459
- # begin
460
- # while line = read_pipe.readline
461
- # yield Pileup.new(line)
462
- # end
463
- # rescue EOFError
464
- # read_pipe.close
465
- # Process.wait
466
- # end
467
- #end
468
- end
469
-
470
-
471
- # utility method that does not use the samtools API, it calls samtools directly as if on the command line and catches the output,
472
- # to use this method you must have a version of samtools that supports the pileup command (< 0.1.17)
473
- # otherwise the command will fail.
474
- # mpileup is the preferred method for getting pileups.
475
- # With this method the sam object should be created as usual, but you need to pass this method a string of options for samtools
476
- # you don't need to provide the call to samtools pileup itself or -f <fasta file> or the bam file itself, these are taken from the sam object
477
- def deprecated_pileup( cmd )
478
-
479
- system('samtools pileup > /dev/null 2>&1')
480
- ##assumes samtools is in the path...
481
- if $?.exitstatus > 1
482
- raise RuntimeError, "samtools is required on the path. A version of samtools with the pileup function is required"
483
- end
484
-
485
- raise SAMException.new(), "No BAMFile provided" unless @sam and @binary
486
- raise SAMException.new(), "No FastA provided" unless @fasta_path
487
-
488
- command = 'samtools pileup ' + cmd + " -f #{@fasta_path}" + " #{@sam}"
489
-
490
- pipe = IO.popen(command)
491
- while line = pipe.gets
492
- yield Pileup.new(line)
490
+ #Replace the header of the current bam file with the header in header_sam
491
+ #* header_sam - the sam file from which the new header will be taken
492
+ #* out - [FILE] output bam file
493
+ def reheader(header_sam, opts={})
494
+ if opts.has_key?(:out)
495
+ out=opts[:out]
496
+ command = "#{@samtools} reheader #{header_sam} #{@bam} > #{out}"
497
+ else
498
+ command = "#{@samtools} reheader #{header_sam} #{@bam}"
493
499
  end
494
- pipe.close
500
+ puts command if $VERBOSE
501
+ @last_command = command
502
+ system(command)
495
503
  end
496
-
497
-
498
- def index_stats
499
- raise SAMException.new(), "No BAMFile provided" unless @sam and @binary
500
- raise SAMException.new(), "No FastA provided" unless @fasta_path
501
- strptrs = []
502
- strptrs << FFI::MemoryPointer.from_string("idxstats")
503
- strptrs << FFI::MemoryPointer.from_string(@sam)
504
- strptrs << nil
505
-
506
- # Now load all the pointers into a native memory block
507
- argv = FFI::MemoryPointer.new(:pointer, strptrs.length)
508
- strptrs.each_with_index do |p, i|
509
- argv[i].put_pointer(0, p)
510
- end
511
-
512
- index_stats = {}
513
-
514
- old_stdout = STDOUT.clone
515
- read_pipe, write_pipe = IO.pipe()
516
- STDOUT.reopen(write_pipe)
517
-
518
- #int bam_idxstats(int argc, char *argv[])
519
- Bio::DB::SAM::Tools.bam_idxstats(strptrs.length - 1,argv)
520
- if fork
521
- write_pipe.close
522
- STDOUT.reopen(old_stdout) #beware .. stdout from other processes eg tests calling this method can get mixed in...
523
- begin
524
-
525
- while line = read_pipe.readline #TAB delimited with each line consisting of reference sequence name, sequence length, # mapped reads and # unmapped reads.
526
- info = line.split(/\t/)
527
- next unless info.length == 4
528
- index_stats[ info[0] ] = {:length => info[1].to_i, :mapped_reads => info[2].to_i, :unmapped_reads => info[3].to_i }
529
- end
530
- rescue EOFError
531
- read_pipe.close
532
- Process.wait
533
- end
534
- end #fork
535
- index_stats
536
- end
537
-
538
- ##yields each reference name and its length
539
- def each_reference
540
- refs = index_stats
541
- refs.each_pair do |k, v|
542
- yield k, v[:length]
504
+
505
+ #Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing tag. Output SAM by default.
506
+ #* A - When used jointly with -r this option overwrites the original base quality.
507
+ #* e - Convert the read base to = if it is identical to the aligned reference base. Indel caller does not support the = bases at the moment.
508
+ #* u - Output uncompressed BAM
509
+ #* b - Output compressed BAM
510
+ #* S - The input is SAM with header lines
511
+ #* C - [INT] Coefficient to cap mapping quality of poorly mapped reads. See the pileup command for details. [0]
512
+ #* r - Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
513
+ #* E - Extended BAQ calculation. This option trades specificity for sensitivity, though the effect is minor.
514
+ def calmd(opts={}, &block)
515
+ command = form_opt_string(@samtools, "calmd", opts, [:E, :e, :u, :b, :S, :r] )+ " " + @fasta
516
+ puts command if $VERBOSE
517
+ @last_command = command
518
+ type = :text
519
+ klass = Bio::DB::Alignment
520
+ yield_from_pipe(command, klass, type, true, "@",&block)
521
+ end
522
+
523
+ #Identifies target regions by examining the continuity of read depth, computes haploid consensus sequences of targets and outputs a SAM with each sequence corresponding to a target. When option -f is in use, BAQ will be applied.
524
+ #* Q - [INT] Minimum base quality for a base to be considered [13]
525
+ #* i - in penalty
526
+ #* 0 - em0
527
+ #* 1 - em1
528
+ #* 2 - em2
529
+ #* f - reference
530
+ def targetcut(opts={})
531
+ if opts[:f]
532
+ opts['f'] = @fasta
533
+ opts.delete(:s)
543
534
  end
535
+
536
+ command = "#{form_opt_string(@samtools, "targetcut", opts, [] )}"
537
+ puts command if $VERBOSE
538
+ @last_command = command
539
+ system(command)
544
540
  end
545
541
 
546
- end
542
+ #Call and phase heterozygous SNPs
543
+ #* A - Drop reads with ambiguous phase.
544
+ #* b - [STR] Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file STR.0.bam and phase-1 reads in STR.1.bam. Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads with switch errors will be saved in STR.chimeric.bam. [null]
545
+ #* F - Do not attempt to fix chimeric reads.
546
+ #* k - [INT] Maximum length for local phasing. [13]
547
+ #* q - [INT] Minimum Phred-scaled LOD to call a heterozygote. [40]
548
+ #* Q - [INT] Minimum base quality to be used in het calling. [13]
549
+ def phase(opts={})
550
+ command = "#{form_opt_string(@samtools, "phase", opts, [:A, :F] )}"
551
+ puts command if $VERBOSE
552
+ @last_command = command
553
+ system(command)
554
+ end
555
+
556
+
557
+ #returns an array for each position with [sequence_name, position, depth]
558
+ #* b - list of positions or regions in BED format
559
+ #* l - [INT] minQLen
560
+ #* q - [INT] base quality threshold
561
+ #* Q - [INT] mapping quality threshold
562
+ #* r - [chr:from-to] region
563
+ def depth(opts={})
564
+ command = form_opt_string(@samtools, "depth", opts)
565
+ @last_command = command
566
+ puts command if $VERBOSE
567
+ yield_from_pipe(command, String) do |line|
568
+ yield line.split(/\t/)
569
+ end
547
570
 
548
- class Tag
549
- attr_accessor :tag, :type, :value
550
- def set(str)
551
- @tag = str[0..1]
552
- @type = str[3]
553
- @value = str[5..-1]
554
571
  end
555
- end
556
572
 
557
- class Alignment
558
-
559
- def initialize
560
- ObjectSpace.define_finalizer(self,
561
- self.class.method(:finalize).to_proc)
562
- end
563
- def Alignment.finalize(object_id)
564
-
565
- # puts "Object #{object_id} dying at #{Time.new}"
566
- # p "?" . object_id.al
567
- # p object_id.al
568
- LibC.free object_id.al
569
- LibC.free object_id.sam
570
- LibC.free object_id.calend
571
- LibC.free object_id.qlen
572
-
573
- LibC.free object_id.samstr
574
- end
575
-
576
- #Attributes from the format
577
- attr_accessor :qname, :flag, :rname,:pos,:mapq,:cigar, :mrnm, :mpos, :isize, :seq, :qual, :tags, :al, :samstr
578
- #Attributes pulled with the C library
579
- attr_accessor :calend, :qlen
580
- #Attrobites frp, the flag field (see chapter 2.2.2 of the sam file documentation)
581
- #query_strand and mate_strand are true if they are forward. It is the opposite to the definition in the BAM format for clarity.
582
- #primary is the negation of is_negative from the BAM format
583
- attr_accessor :is_paired, :is_mapped, :query_unmapped, :mate_unmapped, :query_strand, :mate_strand, :first_in_pair,:second_in_pair, :primary, :failed_quality, :is_duplicate
584
-
585
- def set(bam_alignment, header)
586
- #Create the FFI object
587
- @al = Bio::DB::SAM::Tools::Bam1T.new(bam_alignment)
588
-
589
- #set the raw data
590
- tmp_str = Bio::DB::SAM::Tools.bam_format1(header,al)
591
- #self.sam = tmp_str
592
- #ObjectSpace.define_finalizer(self, proc {|id| puts "Finalizer one on #{id}" })
593
- self.sam = String.new(tmp_str)
594
- #LibC.free tmp_str
595
- #Set values calculated by libbam
596
- core = al[:core]
597
- cigar = al[:data][core[:l_qname]]#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
598
- @calend = Bio::DB::SAM::Tools.bam_calend(core,cigar)
599
- @qlen = Bio::DB::SAM::Tools.bam_cigar2qlen(core,cigar)
600
-
601
- #process the flags
602
- @is_paired = @flag & 0x0001 > 0
603
- @is_mapped = @flag & 0x0002 > 0
604
- @query_unmapped = @flag & 0x0004 > 0
605
- @mate_unmapped = @flag & 0x0008 > 0
606
- @query_strand = !(@flag & 0x0010 > 0)
607
- @mate_strand = !(@flag & 0x0020 > 0)
608
- @first_in_pair = @flag & 0x0040 > 0
609
- @second_in_pair = @flag & 0x0080 > 0
610
- @primary = !(@flag & 0x0100 > 0)
611
- @failed_quality = @flag & 0x0200 > 0
612
- @is_duplicate = @flag & 0x0400 > 0
613
-
614
- end
615
-
616
-
617
- def sam=(sam)
618
- #p sam
619
- s = sam.split("\t")
620
- self.qname = s[0]
621
- self.flag = s[1].to_i
622
- self.rname = s[2]
623
- self.pos = s[3].to_i
624
- self.mapq = s[4].to_i
625
- self.cigar = s[5]
626
- self.mrnm = s[6]
627
- self.mpos = s[7].to_i
628
- self.isize = s[8].to_i
629
- self.seq = s[9]
630
- self.qual = s[10]
631
- self.tags = {}
632
- 11.upto(s.size-1) {|n|
633
- t = Tag.new
634
- t.set(s[n])
635
- tags[t.tag] = t
636
- }
573
+ #Returns the pileup of a region, encapsulated as a Bio::DB::Fasta::Region object.
574
+ #The opts are the same as for mpileup
575
+ def fetch_region(opts={})
576
+ region = opts[:r] ? opts[:r] : opts[:region]
577
+ opts[:r] = region
578
+ opts[:region] = region
579
+ reg = Bio::DB::Fasta::Region.parse_region(region.to_s)
580
+ reg.reference = self.fetch_reference(region.entry, region.start, region.end).downcase
581
+ tmp = Array.new
582
+ mpileup(opts) do | pile |
583
+ # puts pile
584
+ tmp << pile
585
+ yield pile if block_given?
586
+ end
587
+ reg.pileup = tmp
588
+ reg.calculate_stats_from_pile(opts)
589
+ reg
590
+ end
637
591
 
592
+ #Same as mpileup, but it caches the pileup, so if you run several operations on the same set of regions
593
+ #the pileup is reused and the mpileup command is not executed several times
594
+ #Whenever you finish using a region, call mpileup_clear_cache to free the cache
595
+ #The argument Region is required, as it will be the key for the underlying hash.
596
+ #We assume that the options (other than the region) are constant. If they are not, the cache mechanism may not be consistent.
597
+ #
598
+ #TODO: It may be good to partially load the pileup
599
+ def mpileup_cached (opts={})
600
+ raise SAMException.new(), "A region must be provided" unless opts[:r] or opts[:region]
601
+ @cached_regions = Hash.new unless @cached_regions
602
+ region = opts[:r] ? opts[:r] : opts[:region]
603
+ @cached_regions[region.to_s] = fetch_region(opts) unless @cached_regions[region.to_s]
604
+ if block_given?
605
+ @cached_regions[region.to_s].pileup.each do | pile |
606
+ yield pile
607
+ end
608
+ end
609
+ region.pileup
610
+ end
638
611
 
639
- #<QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> \
640
- #[<TAG>:<VTYPE>:<VALUE> [...]]
641
612
 
613
+ #Clears the pileup cache. If a region is passed as argument, just the specified region is removed
614
+ #If no region is passed, the hash is emptied
615
+ def mpileup_clear_cache (region)
616
+ return unless @cached_regions
617
+ if region
618
+ @cached_regions[region.to_s] = nil
619
+ else
620
+ @cached_regions.clear
621
+ end
642
622
  end
643
623
 
644
- end
645
624
 
646
- class SAMException < RuntimeError
647
- #we can add further variables to give information of the excpetion
648
- def initialize()
649
625
 
626
+ #Extract the reads that align to a region
627
+ #* region [String] - Region to extract (chromosome:start-end)
628
+ #* fastq - [FILE] fastq file to write the reads to. If empty, prints to stdout
629
+ #* q - [INT] base quality threshold
630
+ # Not tested yet
631
+ def extract_reads(opts={})
632
+ opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
633
+ fastq_filename = opts[:fastq]
634
+
635
+ out = $stdout
636
+ print_fastq = Proc.new do |alignment|
637
+ out.puts "@#{alignment.qname}"
638
+ out.puts "#{alignment.seq}"
639
+ out.puts "+#{alignment.qname}"
640
+ out.puts "#{alignment.qual}"
641
+ end
642
+
643
+ if fastq_filename
644
+ out = File.open(fastq_filename, "w")
645
+ end
646
+ fetch_with_function(chromosome, qstart, qstart+len, print_fastq)
647
+ out.close if fastq_filename
648
+ end
649
+ private
650
+ #Returns Process::Status with the execution status. If run in a $VERBOSE environment, stderr of the process
651
+ #is forwarded to the default stdout
652
+ def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
653
+ stdin, pipe, stderr, wait_thr = Open3.popen3(command)
654
+ pid = wait_thr[:pid] # pid of the started process.
655
+ if type == :text
656
+ while (line = pipe.gets)
657
+ next if skip_comments and line[0] == comment_char
658
+ yield klass.new(line.chomp)
659
+ end
660
+ elsif type == :binary
661
+ while (c = pipe.gets(nil))
662
+ yield c
663
+ end
664
+ end
665
+ exit_status = wait_thr.value # Process::Status object returned.
666
+ puts stderr.read if $VERBOSE
667
+ stdin.close
668
+ pipe.close
669
+ stderr.close
670
+ return exit_status
671
+ end
672
+
673
+
674
+ # returns a command string from a program
675
+ # @param prog the program to run; it is placed first in the command string (callers pass @samtools)
676
+ # @param opts [Hash] the options hash
677
+ # @param singles [Array] the options in `opts` that are flags and are emitted without a value
678
+ def form_opt_string(prog, command, opts, singles=[])
679
+ opts_string = commandify(opts, singles)
680
+ "#{prog} #{command} #{opts_string} #{@bam}"
650
681
  end
682
+
683
+ # turns an opts hash into a string of command-line options
684
+ def commandify(opts, singles)
685
+ list = []
686
+ opts.each_pair do |tag,value|
687
+ value = "\"#{value}\""
688
+ value = "" if singles.include?(tag)
689
+
690
+ list << "-#{tag.to_s} #{value}"
691
+ end
692
+ list.join(" ")
693
+ end
694
+
695
+ # checks existence of files in instance
696
+ def files_ok?
697
+ [@fasta, @sam, @bam].flatten.compact.each {|f| return false unless File.exists? f }
698
+ true
699
+ end
700
+
701
+
702
+
651
703
  end
652
704
  end
653
705
  end
654
-
655
-