bio-samtools-wrapper 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.travis.yml +27 -0
  4. data/Gemfile +20 -0
  5. data/LICENSE.txt +702 -0
  6. data/README.md +501 -0
  7. data/Rakefile +73 -0
  8. data/VERSION +1 -0
  9. data/bin/bam_consensus.rb +85 -0
  10. data/bio-samtools-wrapper.gemspec +181 -0
  11. data/doc/Bio/DB/Alignment.html +552 -0
  12. data/doc/Bio/DB/Pileup.html +711 -0
  13. data/doc/Bio/DB/SAM/Library.html +167 -0
  14. data/doc/Bio/DB/SAM/Tools.html +109 -0
  15. data/doc/Bio/DB/SAM.html +1853 -0
  16. data/doc/Bio/DB/Tag.html +208 -0
  17. data/doc/Bio/DB/Vcf.html +431 -0
  18. data/doc/Bio/DB.html +105 -0
  19. data/doc/Bio.html +175 -0
  20. data/doc/LICENSE_txt.html +846 -0
  21. data/doc/created.rid +9 -0
  22. data/doc/fonts/Lato-Light.ttf +0 -0
  23. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  24. data/doc/fonts/Lato-Regular.ttf +0 -0
  25. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  26. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  27. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  28. data/doc/fonts.css +167 -0
  29. data/doc/images/add.png +0 -0
  30. data/doc/images/arrow_up.png +0 -0
  31. data/doc/images/brick.png +0 -0
  32. data/doc/images/brick_link.png +0 -0
  33. data/doc/images/bug.png +0 -0
  34. data/doc/images/bullet_black.png +0 -0
  35. data/doc/images/bullet_toggle_minus.png +0 -0
  36. data/doc/images/bullet_toggle_plus.png +0 -0
  37. data/doc/images/date.png +0 -0
  38. data/doc/images/delete.png +0 -0
  39. data/doc/images/find.png +0 -0
  40. data/doc/images/loadingAnimation.gif +0 -0
  41. data/doc/images/macFFBgHack.png +0 -0
  42. data/doc/images/package.png +0 -0
  43. data/doc/images/page_green.png +0 -0
  44. data/doc/images/page_white_text.png +0 -0
  45. data/doc/images/page_white_width.png +0 -0
  46. data/doc/images/plugin.png +0 -0
  47. data/doc/images/ruby.png +0 -0
  48. data/doc/images/tag_blue.png +0 -0
  49. data/doc/images/tag_green.png +0 -0
  50. data/doc/images/transparent.png +0 -0
  51. data/doc/images/wrench.png +0 -0
  52. data/doc/images/wrench_orange.png +0 -0
  53. data/doc/images/zoom.png +0 -0
  54. data/doc/index.html +106 -0
  55. data/doc/js/darkfish.js +140 -0
  56. data/doc/js/jquery.js +18 -0
  57. data/doc/js/navigation.js +142 -0
  58. data/doc/js/search.js +109 -0
  59. data/doc/js/search_index.js +1 -0
  60. data/doc/js/searcher.js +228 -0
  61. data/doc/rdoc.css +580 -0
  62. data/doc/table_of_contents.html +305 -0
  63. data/ext/Makefile-bioruby.patch +12 -0
  64. data/ext/Makefile-suse.patch +11 -0
  65. data/ext/mkrf_conf.rb +118 -0
  66. data/lib/bio/BIOExtensions.rb +89 -0
  67. data/lib/bio/db/alignment.rb +64 -0
  68. data/lib/bio/db/fastadb.rb +320 -0
  69. data/lib/bio/db/pileup.rb +273 -0
  70. data/lib/bio/db/sam/external/COPYING +21 -0
  71. data/lib/bio/db/sam/external/VERSION +1 -0
  72. data/lib/bio/db/sam/library.rb +32 -0
  73. data/lib/bio/db/sam.rb +778 -0
  74. data/lib/bio/db/vcf.rb +105 -0
  75. data/lib/bio-samtools-wrapper.rb +9 -0
  76. data/test/.gitignore +1 -0
  77. data/test/helper.rb +18 -0
  78. data/test/sample.vcf +24 -0
  79. data/test/samples/.gitignore +1 -0
  80. data/test/samples/LCI/NC_001988.ffn +2 -0
  81. data/test/samples/LCI/test.bam +0 -0
  82. data/test/samples/LCI/test.bam.bai +0 -0
  83. data/test/samples/small/dupes.bam +0 -0
  84. data/test/samples/small/dupes.sam +274 -0
  85. data/test/samples/small/ids2.txt +1 -0
  86. data/test/samples/small/map_for_reheader.sam +8 -0
  87. data/test/samples/small/map_to_merge1.bam +0 -0
  88. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  89. data/test/samples/small/map_to_merge1.sam +8 -0
  90. data/test/samples/small/map_to_merge2.bam +0 -0
  91. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  92. data/test/samples/small/map_to_merge2.sam +8 -0
  93. data/test/samples/small/no_md.sam +8 -0
  94. data/test/samples/small/sorted.bam +0 -0
  95. data/test/samples/small/sorted.bam.bai +0 -0
  96. data/test/samples/small/test.sai +0 -0
  97. data/test/samples/small/test.tam +10 -0
  98. data/test/samples/small/test_chr.fasta +1000 -0
  99. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  100. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  101. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  102. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  103. data/test/samples/small/test_chr.fasta.amb +2 -0
  104. data/test/samples/small/test_chr.fasta.ann +3 -0
  105. data/test/samples/small/test_chr.fasta.bwt +0 -0
  106. data/test/samples/small/test_chr.fasta.pac +0 -0
  107. data/test/samples/small/test_chr.fasta.rbwt +0 -0
  108. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  109. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  110. data/test/samples/small/test_chr.fasta.rpac +0 -0
  111. data/test/samples/small/test_chr.fasta.rsa +0 -0
  112. data/test/samples/small/test_chr.fasta.sa +0 -0
  113. data/test/samples/small/test_cov.svg +273 -0
  114. data/test/samples/small/test_fastadb.fasta +34 -0
  115. data/test/samples/small/testu.bam +0 -0
  116. data/test/samples/small/testu.bed +2 -0
  117. data/test/test_bio-samtools-wrapper.rb +1 -0
  118. data/test/test_fastadb.rb +89 -0
  119. data/test/test_pileup.rb +90 -0
  120. data/test/test_sam.rb +421 -0
  121. data/test/test_vcf.rb +79 -0
  122. data/tutorial/tutorial.html +474 -0
  123. data/tutorial/tutorial.md +424 -0
  124. data/tutorial/tutorial.pdf +0 -0
  125. metadata +254 -0
@@ -0,0 +1,320 @@
1
+ #Module to hold the information about the fasta file
2
+
3
+ module Bio::DB::Fasta
4
+ #This class contains the entries in a fasta, as generated by samtools faidx
5
# Ordered collection of the Entry records of a fasta index (the .fai file
# generated by `samtools faidx`). Entries keep their insertion order and can
# also be looked up by sequence id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends an entry and registers it under entry.id.
  # Duplicates are not detected: adding the same id twice keeps both entries
  # in the list, while the id lookup points at the most recently added one.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields every entry in insertion order. This is the method Enumerable
  # builds on, so it has to iterate (the previous implementation called
  # Array#entries, which ignores the block and never yielded anything).
  # Without a block an Enumerator is returned.
  def each(&block)
    @entries.each(&block)
  end

  # Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  # Returns a new Index holding only the selected entries. The arguments are
  # handed to Array#[] unchanged, so an integer, a range or a start/length
  # pair are all accepted. An out-of-range selection gives an empty Index.
  def [](*args)
    selection = @entries[*args]
    # Array#[] returns a single element (or nil) for an integer index.
    selection = [selection].compact unless selection.is_a?(Array)
    selection.each_with_object(Index.new) { |entry, subset| subset << entry }
  end

  # Looks up the entry registered under the given sequence id.
  # Returns the stored entry object, or nil when the id is unknown.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
46
+
47
# One record of a fasta index: sequence id, sequence length, byte offset of
# the first base in the fasta file, number of bases per line and number of
# bytes per line (bases plus line terminator).
class Entry
  attr_reader :id, :length, :line_bases, :line_length, :offset
  alias_method :size, :length

  # All numeric fields are coerced with to_i, so the raw string columns of a
  # .fai line can be passed in directly.
  def initialize(id, length, offset = 0, line_bases = 0, line_length = 0)
    @id = id
    @length = length.to_i
    @offset = offset.to_i
    @line_bases = line_bases.to_i
    @line_length = line_length.to_i
  end

  # Translates a 1-based base coordinate into the byte position of that base
  # inside the fasta file.
  #
  # The arithmetic is done on the 0-based coordinate. Dividing the 1-based
  # value directly placed every coordinate that is an exact multiple of
  # line_bases on the line terminator instead of on the last base of the line.
  # Raises ZeroDivisionError when line_bases is 0 (entry built without layout
  # information).
  def get_base_coordinate(coordinate)
    full_lines, column = (coordinate - 1).divmod(line_bases)
    offset + (line_length * full_lines) + column
  end

  # Builds a forward Region spanning the whole sequence (1..length).
  def get_full_region
    reg = Region.new
    reg.entry = id
    reg.start = 1
    reg.end = @length
    reg.orientation = :forward
    reg
  end

  alias_method :to_region, :get_full_region
end
82
+
83
+ #Class to wrap a region of a chromosome
84
# Class to wrap a region of a chromosome, plus the statistics that can be
# derived for it from a pileup (coverage, base counts, consensus...).
class Region
  # Template of per-position base counts. Frozen because it is a shared
  # constant; every position gets its own copy.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}.freeze
  attr_accessor :entry, :start, :end, :orientation

  attr_accessor :pileup, :average_coverage, :snps, :reference, :allele_freq, :consensus, :coverages, :bases, :total_cov, :called

  # Accepts the optional keys :entry, :start, :end and :orientation.
  def initialize(args = {})
    @entry = args[:entry]
    @start = args[:start]
    @end = args[:end]
    @orientation = args[:orientation]
  end

  # Returns, for the given base, the list of its frequency at every position
  # of the region (taken from allele_freq). The list is computed once per
  # base and then cached.
  def allele_freq_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0...size).map { |i| @allele_freq[i][base] }
  end

  alias_method :base_ratios_for_base, :allele_freq_for_base
  alias_method :base_ratios, :allele_freq

  # Calculates the consensus, base ratios, coverages and total coverage in the
  # region from the piles stored in pileup.
  # * min_cov minimum coverage to make a call (default 0); a position is
  #   called only when its coverage is strictly greater than this value
  # * min_per minimum representation to make a call (default 0.20). If more
  #   than one base can be called, the IUPAC ambiguity code is used
  # Returns self.
  def calculate_stats_from_pile(opts = {})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    # Uncalled positions stay lower case; called ones are upper-cased below.
    reference = self.reference.downcase

    # A separate hash per position, so changing one position's counts can
    # never leak into the others.
    self.allele_freq = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    pileup.each do |pile|
      if pile.coverage > min_cov
        position = pile.pos - start
        allele_freq[position] = pile.allele_freq
        reference[position] = pile.consensus_iuap(min_per).upcase
        coverages[position] = pile.coverage.to_i
        bases[position] = pile.bases
        self.called += 1
      end
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(reference)
    consensus.na
    consensus.reverse_complement! if orientation == :reverse
    self.average_coverage = total_cov.to_f / size.to_f
    self
  end

  # Samtools style representation: "name:start-end"
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Returns a region object from a string in form "name:start-end".
  # Single quotes around the string are ignored. The orientation is :reverse
  # when end is smaller than start and :forward otherwise.
  # Raises FastaDBException when the string does not have that form.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    name_and_range = string.split(":")
    raise FastaDBException, "Invalid region. #{string}" if name_and_range.length != 2
    limits = name_and_range[1].split("-")
    raise FastaDBException, "Invalid region. #{string}" if limits.length != 2

    reg = Region.new(:entry => name_and_range[0], :start => limits[0].to_i, :end => limits[1].to_i)
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Length of the region, computed as end - start.
  # NOTE(review): for inclusive coordinates this is one less than the number
  # of bases, and it is negative for reverse regions - confirm with callers
  # before changing it.
  def size
    @end - @start
  end
  alias_method :length, :size
end
180
+
181
# Error raised by the fasta database classes (invalid region strings,
# missing fasta path, unknown entries, out of range regions).
class FastaDBException < StandardError
end
182
+
183
+ #Class that holds the fasta file. It is used as a database.
184
# Class that holds the fasta file. It is used as a database: sequences are
# fetched by region, either through samtools or by seeking directly in the
# file with the help of the .fai index.
class FastaFile
  attr_reader :fasta_path

  # Initialize the fasta file. If the fai file doesn't exist, it is generated at startup
  # * fasta path to the fasta file (required)
  # * samtools path to samtools. Pass true to use the bundled version. When
  #   left as false, sequences are read natively and the bundled samtools is
  #   only used to build a missing index.
  # Raises FastaDBException when no fasta path is given.
  def initialize(fasta: nil, samtools: false)
    @fasta_path = fasta
    @samtools = samtools == true ? bundled_samtools : samtools
    @index = nil
    @fasta_file = nil
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    build_fai unless File.file?(@fai_file)
  end

  # Loads the fai entries (once) and returns how many there are.
  def load_fai_entries
    return @index.length if @index
    # Built in a local first, so a failure while reading does not leave an
    # empty index cached.
    loaded = Index.new
    File.foreach(@fai_file) do |line|
      fields = line.chomp.split("\t")
      next if fields.empty?
      loaded << Entry.new(fields[0], fields[1], fields[2], fields[3], fields[4])
    end
    @index = loaded
    @index.length
  end

  # Index reference sequence in the FASTA format or extract subsequence from
  # indexed reference sequence. If no region is specified, faidx will index
  # the file and create <ref.fasta>.fai on the disk. If a region is specified,
  # the subsequence is returned as a plain String.
  # Options - if a subsequence is required
  # * chr - [STRING] the reference name of the subsequence
  # * start - [INT] the start position for the subsequence
  # * stop - [INT] the stop position for the subsequence
  def faidx(opts = {})
    if opts.has_key?(:chr) && opts.has_key?(:start) && opts.has_key?(:stop)
      region = Region.new(:entry => opts[:chr], :start => opts[:start].to_i, :end => opts[:stop].to_i)
      @samtools ? fetch_sequence_samtools(region) : fetch_sequence_native(region)
    else
      build_fai
    end
  end

  # Returns the Index with the entries of the fai file, creating the fai file
  # first when it is missing.
  def index
    return @index if @index
    build_fai unless File.file?(@fai_file)
    load_fai_entries
    @index
  end

  # Fetches the region by running `samtools faidx` and concatenating every
  # output line that is not a fasta header. The region is rendered with
  # to_region.to_s when it responds to to_region and with to_s otherwise.
  def fetch_sequence_samtools(region)
    query = region.respond_to?(:to_region) ? region.to_region.to_s : region.to_s
    command = "#{@samtools} faidx '#{@fasta_path}' '#{query}'"
    puts "Running: #{command}" if $DEBUG
    @last_command = command
    seq = String.new
    yield_from_pipe(command, String, :text) { |line| seq << line unless line =~ /^>/ }
    seq
  end

  # Fetches the region by seeking in the fasta file, using the offsets of the
  # fai index. Accepts a Region or a "name:start-end" string. The bases are
  # returned in file order, whatever the orientation of the region.
  # Raises FastaDBException when the entry is not in the index.
  #
  # The file handle is opened once and kept, so only one thread can be
  # associated with each FastaFile object.
  def fetch_sequence_native(region)
    query = region.is_a?(Region) ? region : Region.parse_region(region)
    entry = index.region_for_entry(query.entry)
    raise FastaDBException, "Entry (#{query.entry})not found in reference" unless entry

    @fasta_file ||= File.open(@fasta_path)
    # Reverse regions have start > end; read from the lower coordinate.
    first, last = [query.start, query.end].minmax
    start_pointer = entry.get_base_coordinate(first)
    end_pointer = entry.get_base_coordinate(last)
    @fasta_file.seek(start_pointer, IO::SEEK_SET)
    # read returns nil at end of file; to_s turns that into an empty sequence
    raw = @fasta_file.read(end_pointer - start_pointer + 1).to_s
    raw.gsub(/\s+/, '')
  end

  # Fetches a region as a Bio::Sequence::NA. The region needs to be a Region,
  # or to have a to_s in the format "chromosome:start-end" as in samtools.
  # Regions with orientation :reverse are returned reverse complemented.
  # Raises FastaDBException for unknown entries and out of range regions.
  def fetch_sequence(region)
    load_fai_entries
    region = Region.parse_region(region.to_s) unless region.is_a?(Region)
    entry = index.region_for_entry(region.entry)
    raise FastaDBException.new "Entry (#{region.entry})not found in reference" unless entry
    first, last = [region.start, region.end].minmax
    raise FastaDBException.new "Region in invalid range (#{region}): Valid range: #{entry.to_region.to_s} has a size of #{entry.size}." if last > entry.size || first < 1
    # Always fetch low..high; the orientation is applied afterwards.
    forward = Region.new(:entry => region.entry, :start => first, :end => last, :orientation => :forward)
    seq = @samtools ? fetch_sequence_samtools(forward) : fetch_sequence_native(forward)
    reference = Bio::Sequence::NA.new(seq)
    reference.reverse_complement! if region.respond_to?(:orientation) && region.orientation == :reverse
    reference
  end

  private

  # Path of the samtools binary shipped next to this file.
  def bundled_samtools
    File.join(File.expand_path(File.dirname(__FILE__)), 'sam', 'external', 'samtools')
  end

  # Runs `samtools faidx` on the fasta file to (re)create the .fai index,
  # falling back to the bundled binary when no samtools path was configured.
  # Returns the result of Kernel#system.
  def build_fai
    command = "#{@samtools || bundled_samtools} faidx '#{@fasta_path}'"
    @last_command = command
    system(command)
  end

  # Runs the command and yields its standard output to the block: line by
  # line wrapped in klass (type :text, optionally skipping lines that start
  # with comment_char) or in a single chunk (type :binary).
  # Returns Process::Status with the execution status. If run in a $DEBUG
  # environment, stderr of the process is forwarded to the default stdout.
  def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
    # Block form: the three streams are closed even when the caller's block raises.
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      # Drain stderr concurrently so a chatty process cannot block on a full pipe.
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (chunk = pipe.gets(nil))
          yield chunk
        end
      end
      stderr_output = stderr_reader.value
      puts stderr_output if $DEBUG
      wait_thr.value
    end
  end
end
320
+ end
@@ -0,0 +1,273 @@
1
+ # :title:Pileup
2
+ # = Bio::DB::Pileup
3
+ # A class representing information in SAMTools pileup format
4
+ # Author:: Dan MacLean (dan.maclean@tsl.ac.uk)
5
+ # Pileup is described at http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ#I_do_not_understand_the_columns_in_the_pileup_output.
6
+ # Briefly (when you invoke pileup with the -c option):
7
+ # * 1 reference sequence name
8
+ # * 2 reference coordinate
9
+ # * (3) reference base, or `*' for an indel line
10
+ # * (4) genotype where heterozygotes are encoded in the IUB code: M=A/C, R=A/G, W=A/T, S=C/G, Y=C/T and K=G/T; indels are indicated by, for example, */+A, -A/* or +CC/-C. There is no difference between */+A or +A/*.
11
+ # * (5) Phred-scaled likelihood that the genotype is wrong, which is also called `consensus quality'.
12
+ # * (6) Phred-scaled likelihood that the genotype is identical to the reference, which is also called `SNP quality'. Suppose the reference base is A and in alignment we see 17 G and 3 A. We will get a low consensus quality because it is difficult to distinguish an A/G heterozygote from a G/G homozygote. We will get a high SNP quality, though, because the evidence of a SNP is very strong.
13
+ # * (7) root mean square (RMS) mapping quality
14
+ # * 8 # reads covering the position
15
+ # * 9 read bases at a SNP line (check the manual page for more information); the 1st indel allele otherwise
16
+ # * 10 base quality at a SNP line; the 2nd indel allele otherwise
17
+ # * (11) indel line only: # reads directly supporting the 1st indel allele
18
+ # * (12) indel line only: # reads directly supporting the 2nd indel allele
19
+ # * (13) indel line only: # reads supporting a third indel allele
20
+ # If pileup is invoked without `-c', indel lines and columns between 3 and 7 inclusive will not be outputted.
21
+ #
22
+ # NB mpileup uses the 6 column output format eg
23
+ # "seq2\t151\tG\t12\t...........A\t:9<;;7=<<<<<"
24
+ # Pileup provides accessors for all columns (6 or 10 column format) and a few other useful methods
25
+ #
26
+ #
27
+ module Bio
28
+ class DB
29
# One line of SAMtools pileup output (6 column mpileup format or the
# deprecated 10 to 13 column `pileup -c` format) with accessors for the
# columns and helpers to derive counts, consensus and VCF output.
class Pileup
  attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2

  # Header lines matching the records produced by to_vcf.
  def self.vcf_header
    %{##fileformat=VCFv3.3\n##INFO=DP,1,Integer,"Total Depth"\n##FORMAT=GT,1,String,"Genotype"\n##FORMAT=GQ,1,Integer,"Genotype Quality"\n##FORMAT=DP,1,Integer,"Read Depth"\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA\n}
  end

  # returns the two bases for the corresponding iupac code; any other string
  # is split into its characters
  def self.iupac_to_base(alt_base)
    case alt_base
    when 'K' then ['G','T']
    when 'M' then ['A','C']
    when 'S' then ['C','G']
    when 'R' then ['A','G']
    when 'W' then ['A','T']
    when 'Y' then ['C','T']
    else alt_base.split(//)
    end
  end

  # creates the Pileup object
  #    pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
  #    pile = Bio::DB::Pileup.new(pile_up_line)
  # Lines with any other number of columns are accepted silently and leave
  # the column accessors unset.
  def initialize(pile_up_line)
    cols = pile_up_line.split(/\t/)
    @consensus = nil
    @consensus_quality = nil
    @read_quals = nil
    @bases = nil
    @allele_frequency = nil
    @consensus_iuap = nil
    case cols.length
    when 6 # mpileup
      @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
    when 10..13 # deprecated pileup with the -c flag
      if cols[2] == '*' # indel
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
      else # snp / identity
        @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
      end
      @consensus_quality = @consensus_quality.to_f
      @snp_quality = @snp_quality.to_f
      @rms_mapq = @rms_mapq.to_f
    end

    @pos = @pos.to_i
    @coverage = @coverage.to_f
    @ref_count = nil
    @non_ref_count_hash = nil
    @non_ref_count = nil
  end

  # Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts
  # returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
  # NOTE(review): letters are counted anywhere in read_bases, so letters that
  # belong to indel or read-start annotations would be counted too - confirm.
  def non_refs
    @non_ref_count_hash ||= {
      :A => read_bases.count("Aa"),
      :C => read_bases.count("Cc"),
      :G => read_bases.count("Gg"),
      :T => read_bases.count("Tt")
    }
  end

  # returns the total non-reference bases in the reads at this position
  def non_ref_count
    @non_ref_count ||= @read_bases.count("ATGCatgc").to_f
  end

  # returns the count of reference-bases in the reads at this position
  def ref_count
    @ref_count ||= read_bases.count(".,")
  end

  # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
  # For the 10 to 13 column format the consensus column of the line is returned.
  def consensus
    return @consensus unless @consensus.nil?
    @consensus =
      if (ref_count / coverage.to_f) > 0.5
        # the ref base is in more than half the coverage
        ref_base
      else
        top = non_refs.values.max
        non_refs.select { |_, count| count == top }.keys.map(&:to_s).sort.join
      end
  end

  # returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
  def to_vcf
    alt, g = genotype_list
    # IUPAC codes are expanded so ALT holds real bases
    alt = Pileup.iupac_to_base(consensus).join(',') unless ref_base == '*'
    alt = '.' if alt == ref_base
    alleles = alt.to_s.split(',')
    # if the reference base is in alt, remove it
    alleles.delete(ref_base.to_s)
    alt = alleles.empty? ? '.' : alleles.join(',')
    [ref_name, pos, '.', ref_base, alt, snp_quality.to_i, "0", "DP=#{coverage.to_i}", "GT:GQ:DP", "#{g}:#{consensus_quality.to_i}:#{coverage.to_i}"].join("\t")
  end

  # identifies the reference base and returns the indel or snp genotype as applicable
  def genotype_list
    ref_base == '*' ? indel_gt : snp_gt
  end

  # identifies if the indel is an insertion or a deletion:
  # "-XYZ" gives "D3" (deletion and its length), "+XYZ" gives "IXYZ"
  # (insertion and its sequence), anything else gives nil
  def parse_indel(alt)
    case alt
    when /^-/  then "D#{Regexp.last_match.post_match.length}"
    when /^\+/ then "I#{Regexp.last_match.post_match}"
    end
  end

  # returns pileup format line
  def to_s
    if @read_quals && !@consensus_quality # 6 cols
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
    elsif @indel_1 # 13 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
    else # 10 cols
      [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
    end
  end

  # returns the count of every base at this position: the non reference
  # counts plus the reference count stored under the upper-cased ref base
  def bases
    return @bases if @bases
    # Work on a copy: the hash memoised by non_refs must stay untouched,
    # otherwise non_refs and consensus depend on the order of the calls.
    @bases = non_refs.dup
    @bases[ref_base.upcase.to_sym] = ref_count
    @bases
  end

  # returns the sum of all the base counts
  def base_coverage
    bases.values.inject(0) { |total, count| total + count }
  end

  # returns the frequency of all bases in pileup position
  def allele_freq
    return @allele_frequency if @allele_frequency
    total = base_coverage.to_f
    @allele_frequency = bases.each_with_object({}) do |(base, count), freq|
      freq[base] = count.to_f / total
    end
  end

  # returns the IUPAC consensus of all the bases whose share of the coverage
  # is above the given ratio; when no base qualifies, the lower-cased
  # reference base is returned. Results are cached per ratio.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    @consensus_iuap ||= {}
    @consensus_iuap[minumum_ratio_for_iup_consensus] ||= begin
      called = bases.select { |_, count| count / coverage.to_f > minumum_ratio_for_iup_consensus }
      called = called.keys.map { |base| base[0].to_s }
      if called.empty?
        ref_base.downcase
      else
        # the alignment object gives the IUPAC consensus
        Bio::Alignment.new(called.map { |base| Bio::Sequence::NA.new(base) }).consensus_iupac
      end
    end
  end

  private

  # returns the genotype of the indel as [alt, genotype], nil when neither
  # allele is an indel
  def indel_gt
    return "undef" if consensus.instance_of?(Array)
    al1, al2 = consensus.split(/\//)
    if al1 == al2 && al1 == '*'
      al1 = indel_1
      al2 = indel_2
    end
    alt1 = parse_indel(al1)
    alt2 = parse_indel(al2)

    return nil if !alt1 && !alt2
    if !alt1
      [alt2, '0/1']
    elsif !alt2
      [alt1, '0/1']
    elsif alt1 == alt2
      [alt1, '1/1']
    else
      ["#{alt1},#{alt2}", '1/2']
    end
  end

  # returns the genotype of the snp as [alt, genotype]
  def snp_gt
    return ['.', '0/0'] if ref_base == consensus
    first, second = Pileup.iupac_to_base(consensus)
    if first == ref_base
      [second, '0/1']
    elsif second == ref_base
      [first, '0/1']
    elsif second.nil? || first == second
      # a single alternative allele: homozygous alt
      [first, '1/1']
    else
      # two different alternative alleles, same convention as indel_gt
      ["#{first},#{second}", '1/2']
    end
  end
end
272
+ end
273
+ end
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2008-2009 Genome Research Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1 @@
1
+ 1.6
@@ -0,0 +1,32 @@
1
+ module Bio
2
+ class DB
3
+ module SAM
4
module Library
  #IMPORTANT NOTE: Windows library is missing in this distribution

  # Return the path with the file name of the library for the specific operating system
  # * platform - platform string to decide on (default RUBY_PLATFORM)
  # * description - interpreter description, used as a fallback when the
  #   platform string is not recognised, as with JRuby (default RUBY_DESCRIPTION)
  # When neither is recognised the extension is left empty ("libbam.").
  def filename(platform = RUBY_PLATFORM, description = RUBY_DESCRIPTION)
    #TODO refactor this piece of code in all the files
    lib_os = case platform
             when /linux/
               'so.1'
             when /darwin/
               '1.dylib'
             # Windows builds of Ruby report mswin/mingw/cygwin rather than "windows"
             when /windows|mswin|mingw|cygwin/
               'dll'
             else
               case description
               when /jruby.*darwin/
                 '1.dylib'
               when /jruby.*linux/
                 'so.1'
               end
             end

    File.join(File.expand_path(File.dirname(__FILE__)), 'external', "libbam.#{lib_os}")
  end #filename
  module_function :filename
end #Library
30
+ end #Sam
31
+ end #DB
32
+ end #Bio