bio-samtools-wrapper 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (125) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.travis.yml +27 -0
  4. data/Gemfile +20 -0
  5. data/LICENSE.txt +702 -0
  6. data/README.md +501 -0
  7. data/Rakefile +73 -0
  8. data/VERSION +1 -0
  9. data/bin/bam_consensus.rb +85 -0
  10. data/bio-samtools-wrapper.gemspec +181 -0
  11. data/doc/Bio/DB/Alignment.html +552 -0
  12. data/doc/Bio/DB/Pileup.html +711 -0
  13. data/doc/Bio/DB/SAM/Library.html +167 -0
  14. data/doc/Bio/DB/SAM/Tools.html +109 -0
  15. data/doc/Bio/DB/SAM.html +1853 -0
  16. data/doc/Bio/DB/Tag.html +208 -0
  17. data/doc/Bio/DB/Vcf.html +431 -0
  18. data/doc/Bio/DB.html +105 -0
  19. data/doc/Bio.html +175 -0
  20. data/doc/LICENSE_txt.html +846 -0
  21. data/doc/created.rid +9 -0
  22. data/doc/fonts/Lato-Light.ttf +0 -0
  23. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  24. data/doc/fonts/Lato-Regular.ttf +0 -0
  25. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  26. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  27. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  28. data/doc/fonts.css +167 -0
  29. data/doc/images/add.png +0 -0
  30. data/doc/images/arrow_up.png +0 -0
  31. data/doc/images/brick.png +0 -0
  32. data/doc/images/brick_link.png +0 -0
  33. data/doc/images/bug.png +0 -0
  34. data/doc/images/bullet_black.png +0 -0
  35. data/doc/images/bullet_toggle_minus.png +0 -0
  36. data/doc/images/bullet_toggle_plus.png +0 -0
  37. data/doc/images/date.png +0 -0
  38. data/doc/images/delete.png +0 -0
  39. data/doc/images/find.png +0 -0
  40. data/doc/images/loadingAnimation.gif +0 -0
  41. data/doc/images/macFFBgHack.png +0 -0
  42. data/doc/images/package.png +0 -0
  43. data/doc/images/page_green.png +0 -0
  44. data/doc/images/page_white_text.png +0 -0
  45. data/doc/images/page_white_width.png +0 -0
  46. data/doc/images/plugin.png +0 -0
  47. data/doc/images/ruby.png +0 -0
  48. data/doc/images/tag_blue.png +0 -0
  49. data/doc/images/tag_green.png +0 -0
  50. data/doc/images/transparent.png +0 -0
  51. data/doc/images/wrench.png +0 -0
  52. data/doc/images/wrench_orange.png +0 -0
  53. data/doc/images/zoom.png +0 -0
  54. data/doc/index.html +106 -0
  55. data/doc/js/darkfish.js +140 -0
  56. data/doc/js/jquery.js +18 -0
  57. data/doc/js/navigation.js +142 -0
  58. data/doc/js/search.js +109 -0
  59. data/doc/js/search_index.js +1 -0
  60. data/doc/js/searcher.js +228 -0
  61. data/doc/rdoc.css +580 -0
  62. data/doc/table_of_contents.html +305 -0
  63. data/ext/Makefile-bioruby.patch +12 -0
  64. data/ext/Makefile-suse.patch +11 -0
  65. data/ext/mkrf_conf.rb +118 -0
  66. data/lib/bio/BIOExtensions.rb +89 -0
  67. data/lib/bio/db/alignment.rb +64 -0
  68. data/lib/bio/db/fastadb.rb +320 -0
  69. data/lib/bio/db/pileup.rb +273 -0
  70. data/lib/bio/db/sam/external/COPYING +21 -0
  71. data/lib/bio/db/sam/external/VERSION +1 -0
  72. data/lib/bio/db/sam/library.rb +32 -0
  73. data/lib/bio/db/sam.rb +778 -0
  74. data/lib/bio/db/vcf.rb +105 -0
  75. data/lib/bio-samtools-wrapper.rb +9 -0
  76. data/test/.gitignore +1 -0
  77. data/test/helper.rb +18 -0
  78. data/test/sample.vcf +24 -0
  79. data/test/samples/.gitignore +1 -0
  80. data/test/samples/LCI/NC_001988.ffn +2 -0
  81. data/test/samples/LCI/test.bam +0 -0
  82. data/test/samples/LCI/test.bam.bai +0 -0
  83. data/test/samples/small/dupes.bam +0 -0
  84. data/test/samples/small/dupes.sam +274 -0
  85. data/test/samples/small/ids2.txt +1 -0
  86. data/test/samples/small/map_for_reheader.sam +8 -0
  87. data/test/samples/small/map_to_merge1.bam +0 -0
  88. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  89. data/test/samples/small/map_to_merge1.sam +8 -0
  90. data/test/samples/small/map_to_merge2.bam +0 -0
  91. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  92. data/test/samples/small/map_to_merge2.sam +8 -0
  93. data/test/samples/small/no_md.sam +8 -0
  94. data/test/samples/small/sorted.bam +0 -0
  95. data/test/samples/small/sorted.bam.bai +0 -0
  96. data/test/samples/small/test.sai +0 -0
  97. data/test/samples/small/test.tam +10 -0
  98. data/test/samples/small/test_chr.fasta +1000 -0
  99. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  100. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  101. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  102. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  103. data/test/samples/small/test_chr.fasta.amb +2 -0
  104. data/test/samples/small/test_chr.fasta.ann +3 -0
  105. data/test/samples/small/test_chr.fasta.bwt +0 -0
  106. data/test/samples/small/test_chr.fasta.pac +0 -0
  107. data/test/samples/small/test_chr.fasta.rbwt +0 -0
  108. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  109. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  110. data/test/samples/small/test_chr.fasta.rpac +0 -0
  111. data/test/samples/small/test_chr.fasta.rsa +0 -0
  112. data/test/samples/small/test_chr.fasta.sa +0 -0
  113. data/test/samples/small/test_cov.svg +273 -0
  114. data/test/samples/small/test_fastadb.fasta +34 -0
  115. data/test/samples/small/testu.bam +0 -0
  116. data/test/samples/small/testu.bed +2 -0
  117. data/test/test_bio-samtools-wrapper.rb +1 -0
  118. data/test/test_fastadb.rb +89 -0
  119. data/test/test_pileup.rb +90 -0
  120. data/test/test_sam.rb +421 -0
  121. data/test/test_vcf.rb +79 -0
  122. data/tutorial/tutorial.html +474 -0
  123. data/tutorial/tutorial.md +424 -0
  124. data/tutorial/tutorial.pdf +0 -0
  125. metadata +254 -0
@@ -0,0 +1,320 @@
1
+ #Module to hold the information about the fasta file
2
+
3
+ module Bio::DB::Fasta
4
# Collection of the entries listed in a FASTA index (.fai), as generated by
# samtools faidx. Entries are kept in insertion order and are also
# reachable by their id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends an entry and registers it under entry.id.
  # Duplicates are not detected: adding a second entry with the same id keeps
  # both in #entries, but only the last one is reachable by id.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields every entry in insertion order. This is the method Enumerable
  # builds on, so it has to iterate (the previous Array#entries call silently
  # ignored the block).
  def each(&block)
    @entries.each(&block)
  end

  # Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  # Returns a new Index holding only the selected entries, as if the index
  # were an Array.
  # * args anything Array#[] accepts as a single argument (Range or Integer)
  # An Integer selects a single entry; an out-of-range selection yields an
  # empty Index. The returned object is always of type Index.
  def [](args)
    selected = @entries[args]
    selected = selected.nil? ? [] : [selected] unless selected.is_a?(Array)
    subset = Index.new
    selected.each { |entry| subset << entry }
    subset
  end

  # Returns the object stored under the given sequence name (whatever was
  # added with #<<), or nil when the name is unknown.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
46
+
47
# One record of a FASTA index: the sequence name, its length and the layout
# information needed to seek to a base inside the FASTA file.
class Entry
  attr_reader :id, :length, :line_bases, :line_length, :offset
  alias_method :size, :length

  # * id          name of the sequence
  # * length      number of bases in the sequence
  # * offset      byte offset of the first base inside the FASTA file
  # * line_bases  number of bases on each sequence line
  # * line_length number of bytes on each sequence line, terminator included
  # Numeric arguments may be Strings (as read from the .fai file); they are
  # converted with to_i.
  def initialize(id, length, offset = 0, line_bases = 0, line_length = 0)
    @id = id
    @length = length.to_i
    @offset = offset.to_i
    @line_bases = line_bases.to_i
    @line_length = line_length.to_i
  end

  # Translates a 1-based base coordinate into the byte position of that base
  # in the FASTA file.
  #
  # The arithmetic is done on the 0-based coordinate so that the last base of
  # a line (coordinate == k * line_bases) maps to that base and not to the
  # line terminator that follows it.
  # Raises ZeroDivisionError when line_bases is 0 (the constructor default).
  def get_base_coordinate(coordinate)
    full_lines, column = (coordinate - 1).divmod(line_bases)
    offset + (line_length * full_lines) + column
  end

  # Builds a forward Region spanning the whole sequence (1..length).
  def get_full_region
    reg = Region.new
    reg.entry = id
    reg.start = 1
    reg.end = @length
    reg.orientation = :forward
    reg
  end

  alias_method :to_region, :get_full_region
end
82
+
83
# Class to wrap a region of a chromosome, together with the statistics that
# can be derived for it from a pileup.
class Region
  # Template of per-base counts used for positions without a call.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}

  attr_accessor :entry, :start, :end, :orientation

  attr_accessor :pileup, :average_coverage, :snps, :reference, :allele_freq, :consensus, :coverages, :bases, :total_cov, :called

  # * args hash with the optional keys :entry, :start, :end and :orientation
  def initialize(args = {})
    @entry = args[:entry]
    @start = args[:start]
    @end = args[:end]
    @orientation = args[:orientation]
  end

  # Returns, for every position of the region, the value stored for +base+
  # in #allele_freq. The result is memoised per base, so it does not follow
  # later changes of #allele_freq.
  def allele_freq_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0...size).map { |i| @allele_freq[i][base] }
  end

  alias_method :base_ratios_for_base, :allele_freq_for_base
  alias_method :base_ratios, :allele_freq

  # Calculates the consensus, base ratios, coverages and total coverage in
  # the region from #pileup. #reference and #pileup must be set beforehand.
  # * min_cov minimum coverage to make a call (default 0)
  # * min_per minimum representation to make a call (default 0.20). If more
  #   than one base can be called, the IUPAC ambiguity code is returned
  # Returns self.
  def calculate_stats_from_pile(opts = {})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    # Uncalled positions stay lower case; called ones are overwritten in
    # upper case below.
    sequence = reference.downcase

    # Every slot gets its own hash so that changing one position can never
    # leak into the others or into the constant.
    self.allele_freq = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    pileup.each do |pile|
      if pile.coverage > min_cov
        index = pile.pos - start
        allele_freq[index] = pile.allele_freq
        sequence[index] = pile.consensus_iuap(min_per).upcase
        coverages[index] = pile.coverage.to_i
        bases[index] = pile.bases
        self.called += 1
      end
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(sequence)
    consensus.na
    consensus.reverse_complement! if orientation == :reverse
    self.average_coverage = total_cov.to_f / size.to_f
    self
  end

  # Region in the "name:start-end" form.
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Returns a region object from a string in form "name:start-end".
  # Single quotes are stripped first. The orientation is :reverse when the
  # end is smaller than the start and :forward otherwise.
  # Raises FastaDBException when the string does not have exactly one ":"
  # separated name and one "-" separated pair of coordinates.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    name_and_range = string.split(":")
    raise FastaDBException, "Invalid region. #{string}" if name_and_range.length != 2
    limits = name_and_range[1].split("-")
    raise FastaDBException, "Invalid region. #{string}" if limits.length != 2

    reg = Region.new(:entry => name_and_range[0], :start => limits[0].to_i, :end => limits[1].to_i)
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Length of the region, computed as end - start.
  # NOTE(review): for an inclusive "start-end" region this is one less than
  # the number of bases, and it is negative for reversed coordinates. Left
  # unchanged because callers may rely on the current value - confirm.
  def size
    @end - @start
  end
  alias_method :length, :size
end
180
+
181
# Error raised for invalid regions, unknown entries or a missing FASTA path.
class FastaDBException < StandardError
end
182
+
183
# Class that holds the fasta file. It is used as a database.
class FastaFile
  attr_reader :fasta_path

  # Initialize the fasta file. If the fai file doesn't exist, it is generated
  # at startup.
  # * fasta    path to the fasta file
  # * samtools path to the samtools executable. true selects the bundled
  #            copy; when left as false the sequences are read natively and
  #            the bundled copy is only used to build the index
  # Raises FastaDBException when no fasta path is given.
  def initialize(fasta: nil, samtools: false)
    @fasta_path = fasta
    @samtools = samtools
    @index = nil
    @fasta_file = nil
    @samtools = bundled_samtools if samtools == true
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    run_faidx unless File.file?(@fai_file)
  end

  # Loads the fai entries (once) and returns how many there are.
  def load_fai_entries
    return @index.length if @index
    loaded = Index.new
    # foreach closes the file when done; the index is only cached once the
    # whole file was read, so a failed read can be retried.
    File.foreach(@fai_file) do |line|
      fields = line.chomp.split("\t")
      next if fields.empty?
      loaded << Entry.new(fields[0], fields[1], fields[2], fields[3], fields[4])
    end
    @index = loaded
    @index.length
  end

  # Index reference sequence in the FASTA format or extract subsequence from
  # indexed reference sequence. If no region is specified, faidx will index
  # the file and create <ref.fasta>.fai on the disk. If a region is
  # specified, the subsequence is retrieved and returned as a plain String.
  # Options - if a subsequence is required
  # * chr - [STRING] the reference name of the subsequence
  # * start - [INT] the start position for the subsequence
  # * stop - [INT] the stop position for the subsequence
  def faidx(opts = {})
    if opts.key?(:chr) && opts.key?(:start) && opts.key?(:stop)
      region = Region.new(:entry => opts[:chr], :start => opts[:start].to_i, :end => opts[:stop].to_i)
      fetch_sequence(region).to_s
    else
      run_faidx
    end
  end

  # Returns the Index of the fasta file. On first use the .fai file is
  # (re)generated with samtools and then loaded.
  def index
    return @index if @index
    # TODO: make a ruby implementation of the indexing
    run_faidx
    load_fai_entries
    @index
  end

  # Fetches the sequence of the region by running samtools faidx.
  # * region object whose to_region.to_s or to_s gives "name:start-end"
  # Returns the concatenated sequence lines as a String.
  def fetch_sequence_samtools(region)
    query = region.respond_to?(:to_region) ? region.to_region.to_s : region.to_s
    argv = [samtools_binary, "faidx", @fasta_path, query]
    command = argv.join(" ")
    puts "Running: #{command}" if $DEBUG
    @last_command = command
    seq = ""
    yield_from_pipe(argv, String, :text) { |line| seq << line unless line =~ /^>/ }
    seq
  end

  # Fetches the sequence of the region by seeking inside the fasta file.
  # * region a Region, or a String in the "name:start-end" form
  # The file handle is kept open between calls, so a FastaFile object must
  # not be shared between threads without external locking.
  # Raises FastaDBException when the entry is not in the index.
  def fetch_sequence_native(region)
    query = region.is_a?(Region) ? region : Region.parse_region(region)
    @fasta_file ||= File.open(@fasta_path)
    entry = index.region_for_entry(query.entry)
    raise FastaDBException, "Entry (#{query.entry})not found in reference" unless entry

    # Always read in file order, whatever the orientation of the region.
    first, last = [query.start, query.end].minmax
    start_pointer = entry.get_base_coordinate(first)
    end_pointer = entry.get_base_coordinate(last)
    @fasta_file.seek(start_pointer, IO::SEEK_SET)
    raw = @fasta_file.read(end_pointer - start_pointer + 1)
    # Drop the line terminators that fall inside the range.
    raw.to_s.gsub(/\s+/, '')
  end

  # Returns the sequence of the region as a Bio::Sequence::NA, reverse
  # complemented when the region orientation is :reverse.
  # The region needs to be a Region, to have a method to_region or to have a
  # method to_s that has the format "chromosome:start-end" as in samtools.
  # Raises FastaDBException when the entry is unknown or the coordinates fall
  # outside the sequence.
  def fetch_sequence(region)
    load_fai_entries
    unless region.is_a?(Region)
      region = region.respond_to?(:to_region) ? region.to_region : Region.parse_region(region.to_s)
    end
    entry = index.region_for_entry(region.entry)
    raise FastaDBException, "Entry (#{region.entry})not found in reference" unless entry

    # A reversed region has start > end: validate and fetch on the ordered
    # coordinates, the orientation is applied afterwards.
    low, high = [region.start, region.end].minmax
    if high > entry.size || low < 1
      raise FastaDBException, "Region in invalid range (#{region}): Valid range: #{entry.to_region.to_s} has a size of #{entry.size}."
    end
    span = Region.new(:entry => region.entry, :start => low, :end => high)
    seq = @samtools ? fetch_sequence_samtools(span) : fetch_sequence_native(span)
    reference = Bio::Sequence::NA.new(seq)
    reference.reverse_complement! if region.orientation == :reverse
    reference
  end

  private

  # Path of the samtools executable shipped next to this file.
  def bundled_samtools
    File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external', 'samtools')
  end

  # Executable used to run samtools: the configured one, else the bundled one.
  def samtools_binary
    @samtools ? @samtools.to_s : bundled_samtools
  end

  # Runs "samtools faidx <fasta>" without going through a shell, so paths
  # with spaces or shell metacharacters are passed verbatim.
  # Returns what Kernel#system returns.
  def run_faidx
    argv = [samtools_binary, "faidx", @fasta_path]
    @last_command = argv.join(" ")
    system(*argv)
  end

  # Runs the command and yields its standard output to the block: one
  # klass.new(line) per line for :text, the whole output for :binary.
  # * command       String (run through the shell) or Array of arguments
  # * skip_comments in :text mode, skip the lines starting with comment_char
  # Returns Process::Status with the execution status. If run in a $DEBUG
  # environment, stderr of the process is forwarded to the default stdout.
  def yield_from_pipe(command, klass, type = :text, skip_comments = true, comment_char = "#", &block)
    Open3.popen3(*Array(command)) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      # Drain stderr concurrently so that a chatty process cannot block on a
      # full pipe while we are still reading its stdout.
      err_reader = Thread.new { stderr.read }
      err_reader.report_on_exception = false
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (chunk = pipe.gets(nil))
          yield chunk
        end
      end
      exit_status = wait_thr.value # Process::Status object returned.
      puts err_reader.value if $DEBUG
      exit_status
    end
  end
end
320
+ end
@@ -0,0 +1,273 @@
1
+ # :title:Pileup
2
+ # = Bio::DB::Pileup
3
+ # A class representing information in SAMTools pileup format
4
+ # Author:: Dan MacLean (dan.maclean@tsl.ac.uk)
5
+ # Pileup is described at http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ#I_do_not_understand_the_columns_in_the_pileup_output.
6
+ # Briefly (when you invoke pileup with the -c option):
7
+ # * 1 reference sequence name
8
+ # * 2 reference coordinate
9
+ # * (3) reference base, or `*' for an indel line
10
+ # * (4) genotype where heterozygotes are encoded in the IUB code: M=A/C, R=A/G, W=A/T, S=C/G, Y=C/T and K=G/T; indels are indicated by, for example, */+A, -A/* or +CC/-C. There is no difference between */+A or +A/*.
11
+ # * (5) Phred-scaled likelihood that the genotype is wrong, which is also called `consensus quality'.
12
+ # * (6) Phred-scaled likelihood that the genotype is identical to the reference, which is also called `SNP quality'. Suppose the reference base is A and in alignment we see 17 G and 3 A. We will get a low consensus quality because it is difficult to distinguish an A/G heterozygote from a G/G homozygote. We will get a high SNP quality, though, because the evidence of a SNP is very strong.
13
+ # * (7) root mean square (RMS) mapping quality
14
+ # * 8 # reads covering the position
15
+ # * 9 read bases at a SNP line (check the manual page for more information); the 1st indel allele otherwise
16
+ # * 10 base quality at a SNP line; the 2nd indel allele otherwise
17
+ # * (11) indel line only: # reads directly supporting the 1st indel allele
18
+ # * (12) indel line only: # reads directly supporting the 2nd indel allele
19
+ # * (13) indel line only: # reads supporting a third indel allele
20
+ # If pileup is invoked without `-c', indel lines and columns between 3 and 7 inclusive will not be outputted.
21
+ #
22
+ # NB mpileup uses the 6 column output format eg
23
+ # "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
24
+ # Pileup provides accessors for all columns (6 or 10 column format) and a few other useful methods
25
+ #
26
+ #
27
+ module Bio
28
+ class DB
29
# One line of SAMtools pileup output (6 column mpileup format, or the 10 to
# 13 column format of the deprecated "pileup -c").
class Pileup
  attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2

  # creates the Pileup object
  #  pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
  #  pile = Bio::DB::Pileup.new(pile_up_line)
  # Lines with an unexpected number of columns are accepted silently; their
  # fields stay unset (pos 0, coverage 0.0).
  def initialize(pile_up_line)
    cols = pile_up_line.split(/\t/)
    @consensus = nil
    @consensus_quality = nil
    @read_quals = nil
    @bases = nil
    @allele_frequency = nil
    @consensus_iuap = nil
    if cols.length == 6 # mpileup only ever produces 6 columns
      @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
    elsif (10..13).include?(cols.length) # deprecated pileup with -c: up to 13 columns
      if cols[2] == '*' # indel
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
      else # snp / identity
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
      end
      @consensus_quality = @consensus_quality.to_f
      @snp_quality = @snp_quality.to_f
      @rms_mapq = @rms_mapq.to_f
    end

    @pos = @pos.to_i
    @coverage = @coverage.to_f
    @ref_count = nil
    @non_ref_count_hash = nil
    @non_ref_count = nil
  end

  # Calculate the total count of each non-reference nucleotide and return a
  # hash of all 4 nt counts
  # returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
  def non_refs
    @non_ref_count_hash ||= {
      :A => read_bases.count("Aa"),
      :C => read_bases.count("Cc"),
      :G => read_bases.count("Gg"),
      :T => read_bases.count("Tt")
    }
  end

  # returns the total non-reference bases in the reads at this position
  def non_ref_count
    @non_ref_count ||= @read_bases.count("ATGCatgc").to_f
  end

  # returns the count of reference-bases in the reads at this position
  def ref_count
    @ref_count ||= read_bases.count(".,")
  end

  # returns the consensus base from the pileup: the reference base when it
  # is seen in more than half of the coverage, otherwise the most frequent
  # non-reference base. Equally represented bases are returned as one string
  # in alphabetical order. For the 10-13 column format the consensus column
  # of the line is returned as is.
  def consensus
    return @consensus unless @consensus.nil?
    @consensus =
      if (ref_count / coverage) > 0.5
        ref_base
      else
        top = non_refs.values.max
        non_refs.select { |_base, count| count == top }.keys.map(&:to_s).sort.join
      end
  end

  # returns basic VCF string as per samtools/misc sam2vcf.pl except that it
  # scrimps on the ref for indels, returning a '*' instead of the reference
  # allele
  def to_vcf
    alt, g = genotype_list
    alt = consensus.split(//).join(',') unless ref_base == '*'
    alt = '.' if alt == ref_base
    alt = alt.split(',')
    # if the reference base is in alt, remove it
    alt.delete(ref_base.to_s)
    alt = alt.join(',')
    [ref_name, pos, '.', ref_base, alt, snp_quality.to_i, "0", "DP=#{coverage.to_i}", "GT:GQ:DP", "#{g}:#{consensus_quality.to_i}:#{coverage.to_i}"].join("\t")
  end

  # Header lines matching the records produced by #to_vcf.
  # Public: a preceding "private" keyword does not apply to class methods.
  def self.vcf_header
    %{##fileformat=VCFv3.3\n##INFO=DP,1,Integer,"Total Depth"\n##FORMAT=GT,1,String,"Genotype"\n##FORMAT=GQ,1,Integer,"Genotype Quality"\n##FORMAT=DP,1,Integer,"Read Depth"\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA\n}
  end

  # identifies the reference base and returns the indel or snp genotype as
  # applicable, as an [alt, genotype] pair
  def genotype_list
    ref_base == '*' ? indel_gt : snp_gt
  end

  # returns the two bases for the corresponding iupac code; any other string
  # is split into its characters
  def self.iupac_to_base(alt_base)
    case alt_base
    when 'K' then ['G', 'T']
    when 'M' then ['A', 'C']
    when 'S' then ['C', 'G']
    when 'R' then ['A', 'G']
    when 'W' then ['A', 'T']
    when 'Y' then ['C', 'T']
    else alt_base.split(//)
    end
  end

  # identifies if the indel is an insertion or a deletion:
  # "-XYZ" gives "D3" (deleted length), "+XYZ" gives "IXYZ" (inserted bases),
  # anything else (including '*' and nil) gives nil
  def parse_indel(alt)
    case alt
    when /^-/ then "D#{$~.post_match.length}"
    when /^\+/ then "I#{$~.post_match}"
    end
  end

  # returns pileup format line
  def to_s
    if @read_quals and !@consensus_quality # 6 cols
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
    elsif @indel_1 # 13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
    else # 10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
    end
  end

  # Counts of every base at this position: the non-reference counts plus the
  # reference count stored under the (upper case) reference base.
  def bases
    return @bases if @bases
    # Work on a copy: writing the reference count into the hash cached by
    # #non_refs would make #non_refs and #consensus depend on call order.
    @bases = non_refs.dup
    @bases[ref_base.upcase.to_sym] = ref_count
    @bases
  end

  # Sum of the counts returned by #bases.
  def base_coverage
    bases.values.inject(0, :+)
  end

  # returns the frequency of all bases in pileup position
  def allele_freq
    return @allele_frequency if @allele_frequency
    total = base_coverage.to_f
    @allele_frequency = {}
    bases.each { |base, count| @allele_frequency[base] = count.to_f / total }
    @allele_frequency
  end

  # returns the IUPAC consensus of all the bases represented in more than
  # minumum_ratio_for_iup_consensus of the coverage; the lower case
  # reference base when no base reaches that ratio. Results are cached per
  # ratio.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    @consensus_iuap ||= {}
    @consensus_iuap[minumum_ratio_for_iup_consensus] ||= begin
      called = []
      bases.each do |base, count|
        called << base[0].to_s if count / coverage.to_f > minumum_ratio_for_iup_consensus
      end
      if called.empty?
        ref_base.downcase
      else
        # a one-column alignment of the called bases gives the IUPAC code
        Bio::Alignment.new(called.map { |b| Bio::Sequence::NA.new(b) }).consensus_iupac
      end
    end
  end

  private

  # returns the genotype of the indel as [alt, genotype], nil when neither
  # allele is an indel, "undef" when the consensus is an Array
  def indel_gt
    return "undef" if consensus.instance_of?(Array)
    al1, al2 = consensus.split(/\//)
    if al1 == al2 && al1 == '*'
      al1 = indel_1
      al2 = indel_2
    end
    alt1 = parse_indel(al1)
    alt2 = parse_indel(al2)

    return nil if !alt1 && !alt2
    if !alt1
      [alt2, '0/1']
    elsif !alt2
      [alt1, '0/1']
    elsif alt1 == alt2
      [alt1, '1/1']
    else
      ["#{alt1},#{alt2}", '1/2']
    end
  end

  # returns the genotype of the snp as [alt, genotype]
  def snp_gt
    return ['.', '0/0'] if ref_base == consensus
    alleles = Pileup.iupac_to_base(consensus)
    if alleles[0] == ref_base
      [alleles[1], '0/1']
    elsif alleles[1] == ref_base
      [alleles[0], '0/1']
    else
      ["#{alleles[0]},#{alleles[1]}", '1/1']
    end
  end
end
272
+ end
273
+ end
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2008-2009 Genome Research Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1 @@
1
+ 1.6
@@ -0,0 +1,32 @@
1
module Bio
  class DB
    module SAM
      # Locates the libbam shared library bundled with this distribution.
      module Library
        # IMPORTANT NOTE: Windows library is missing in this distribution

        # Return the path with the file name of the library for the specific
        # operating system.
        # * platform    platform string, defaults to RUBY_PLATFORM
        # * description interpreter description, defaults to RUBY_DESCRIPTION;
        #               only consulted when the platform is not recognised,
        #               to detect the operating system under JRuby
        # On an unrecognised system the returned name has an empty extension
        # ("libbam.").
        def filename(platform = RUBY_PLATFORM, description = RUBY_DESCRIPTION)
          # TODO refactor this piece of code in all the files
          lib_os =
            case platform
            when /linux/ then 'so.1'
            when /darwin/ then '1.dylib'
            # Windows builds of Ruby report mswin/mingw/cygwin, never "windows"
            when /mswin|mingw|cygwin|windows/ then 'dll'
            else
              case description
              when /jruby.*darwin/ then '1.dylib'
              when /jruby.*linux/ then 'so.1'
              end
            end

          File.join(File.expand_path(File.dirname(__FILE__)), 'external', "libbam.#{lib_os}")
        end # filename
        module_function :filename
      end # Library
    end # Sam
  end # DB
end # Bio