bio-samtools 0.6.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -2
  3. data/README.md +4 -7
  4. data/VERSION +1 -1
  5. data/bio-samtools.gemspec +47 -105
  6. data/doc/Bio.html +68 -131
  7. data/doc/Bio/DB.html +51 -111
  8. data/doc/Bio/DB/Alignment.html +135 -363
  9. data/doc/Bio/DB/Pileup.html +183 -170
  10. data/doc/Bio/DB/SAM.html +1396 -820
  11. data/doc/Bio/DB/SAM/Library.html +73 -123
  12. data/doc/Bio/DB/SAM/Tools.html +51 -273
  13. data/doc/Bio/DB/Tag.html +78 -124
  14. data/doc/Bio/DB/Vcf.html +111 -147
  15. data/doc/LICENSE_txt.html +113 -148
  16. data/doc/created.rid +9 -10
  17. data/doc/fonts.css +167 -0
  18. data/doc/fonts/Lato-Light.ttf +0 -0
  19. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  20. data/doc/fonts/Lato-Regular.ttf +0 -0
  21. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  22. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  23. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  24. data/doc/images/add.png +0 -0
  25. data/doc/images/arrow_up.png +0 -0
  26. data/doc/images/delete.png +0 -0
  27. data/doc/images/tag_blue.png +0 -0
  28. data/doc/index.html +48 -54
  29. data/doc/js/darkfish.js +9 -22
  30. data/doc/js/search.js +20 -5
  31. data/doc/js/search_index.js +1 -1
  32. data/doc/rdoc.css +255 -218
  33. data/doc/table_of_contents.html +256 -137
  34. data/ext/Rakefile +57 -0
  35. data/lib/bio-samtools.rb +7 -2
  36. data/lib/bio/BIOExtensions.rb +89 -0
  37. data/lib/bio/db/alignment.rb +59 -0
  38. data/lib/bio/db/fastadb.rb +255 -0
  39. data/lib/bio/db/pileup.rb +221 -172
  40. data/lib/bio/db/sam.rb +639 -589
  41. data/lib/bio/db/sam/{faidx.rb → faidx_old.rb} +0 -0
  42. data/lib/bio/db/vcf.rb +69 -68
  43. data/test/.gitignore +1 -0
  44. data/test/{test_basic.rb → old_test_basic.rb} +33 -1
  45. data/test/samples/small/dupes.bam +0 -0
  46. data/test/samples/small/dupes.sam +274 -0
  47. data/test/samples/small/map_for_reheader.sam +8 -0
  48. data/test/samples/small/map_to_merge1.bam +0 -0
  49. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  50. data/test/samples/small/map_to_merge1.sam +8 -0
  51. data/test/samples/small/map_to_merge2.bam +0 -0
  52. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  53. data/test/samples/small/map_to_merge2.sam +8 -0
  54. data/test/samples/small/no_md.sam +8 -0
  55. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  56. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  57. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  58. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  59. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  60. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  61. data/test/samples/small/test_cov.svg +273 -0
  62. data/test/samples/small/testu.bam.bai +0 -0
  63. data/test/svg +133 -0
  64. data/test/test_pileup.rb +84 -0
  65. data/test/test_sam.rb +331 -0
  66. data/test/test_vcf.rb +11 -0
  67. data/{doc → tutorial}/tutorial.html +0 -0
  68. data/{doc → tutorial}/tutorial.pdf +0 -0
  69. metadata +56 -114
  70. data/doc/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -159
  71. data/doc/Bio/DB/SAM/Tools/Bam1T.html +0 -220
  72. data/doc/Bio/DB/SAM/Tools/BamHeaderT.html +0 -249
  73. data/doc/Bio/DB/SAM/Tools/BamPileup1T.html +0 -159
  74. data/doc/Bio/DB/SAM/Tools/SamfileT.html +0 -171
  75. data/doc/Bio/DB/SAM/Tools/SamfileTX.html +0 -159
  76. data/doc/Bio/DB/SAMException.html +0 -205
  77. data/doc/LibC.html +0 -155
  78. data/doc/Pileup.html +0 -571
  79. data/doc/Vcf.html +0 -473
  80. data/doc/basic_styles.css +0 -31
  81. data/doc/classes/Bio.html +0 -139
  82. data/doc/classes/Bio/DB.html +0 -137
  83. data/doc/classes/Bio/DB/Alignment.html +0 -441
  84. data/doc/classes/Bio/DB/Alignment.src/M000012.html +0 -19
  85. data/doc/classes/Bio/DB/Alignment.src/M000013.html +0 -27
  86. data/doc/classes/Bio/DB/Alignment.src/M000014.html +0 -45
  87. data/doc/classes/Bio/DB/Alignment.src/M000015.html +0 -40
  88. data/doc/classes/Bio/DB/SAM.html +0 -510
  89. data/doc/classes/Bio/DB/SAM/Library.html +0 -135
  90. data/doc/classes/Bio/DB/SAM/Library.src/M000006.html +0 -28
  91. data/doc/classes/Bio/DB/SAM/Tools.html +0 -278
  92. data/doc/classes/Bio/DB/SAM/Tools.src/M000007.html +0 -20
  93. data/doc/classes/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -111
  94. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.html +0 -150
  95. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.src/M000010.html +0 -20
  96. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.html +0 -169
  97. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000008.html +0 -19
  98. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000009.html +0 -18
  99. data/doc/classes/Bio/DB/SAM/Tools/BamPileup1T.html +0 -111
  100. data/doc/classes/Bio/DB/SAM/Tools/SamfileT.html +0 -129
  101. data/doc/classes/Bio/DB/SAM/Tools/SamfileTX.html +0 -111
  102. data/doc/classes/Bio/DB/SAMException.html +0 -140
  103. data/doc/classes/Bio/DB/SAMException.src/M000016.html +0 -18
  104. data/doc/classes/Bio/DB/Sam.src/M000017.html +0 -43
  105. data/doc/classes/Bio/DB/Sam.src/M000018.html +0 -42
  106. data/doc/classes/Bio/DB/Sam.src/M000019.html +0 -18
  107. data/doc/classes/Bio/DB/Sam.src/M000020.html +0 -22
  108. data/doc/classes/Bio/DB/Sam.src/M000021.html +0 -19
  109. data/doc/classes/Bio/DB/Sam.src/M000022.html +0 -25
  110. data/doc/classes/Bio/DB/Sam.src/M000023.html +0 -28
  111. data/doc/classes/Bio/DB/Sam.src/M000024.html +0 -28
  112. data/doc/classes/Bio/DB/Sam.src/M000025.html +0 -46
  113. data/doc/classes/Bio/DB/Sam.src/M000026.html +0 -24
  114. data/doc/classes/Bio/DB/Sam.src/M000027.html +0 -19
  115. data/doc/classes/Bio/DB/Sam.src/M000028.html +0 -24
  116. data/doc/classes/Bio/DB/Sam.src/M000029.html +0 -41
  117. data/doc/classes/Bio/DB/Sam.src/M000030.html +0 -31
  118. data/doc/classes/Bio/DB/Sam.src/M000031.html +0 -86
  119. data/doc/classes/Bio/DB/Sam.src/M000032.html +0 -34
  120. data/doc/classes/Bio/DB/Tag.html +0 -160
  121. data/doc/classes/Bio/DB/Tag.src/M000011.html +0 -21
  122. data/doc/classes/LibC.html +0 -105
  123. data/doc/classes/Pileup.html +0 -374
  124. data/doc/classes/Pileup.src/M000001.html +0 -34
  125. data/doc/classes/Pileup.src/M000002.html +0 -21
  126. data/doc/classes/Pileup.src/M000003.html +0 -21
  127. data/doc/classes/Pileup.src/M000004.html +0 -21
  128. data/doc/classes/Pileup.src/M000005.html +0 -31
  129. data/doc/files/lib/bio-samtools_rb.html +0 -109
  130. data/doc/files/lib/bio/db/sam/bam_rb.html +0 -108
  131. data/doc/files/lib/bio/db/sam/faidx_rb.html +0 -108
  132. data/doc/files/lib/bio/db/sam/library_rb.html +0 -101
  133. data/doc/files/lib/bio/db/sam/pileup_rb.html +0 -178
  134. data/doc/files/lib/bio/db/sam/sam_rb.html +0 -113
  135. data/doc/files/lib/bio/db/sam_rb.html +0 -111
  136. data/doc/fr_class_index.html +0 -43
  137. data/doc/fr_file_index.html +0 -33
  138. data/doc/fr_method_index.html +0 -58
  139. data/doc/lib/bio-samtools_rb.html +0 -115
  140. data/doc/lib/bio/db/pileup_rb.html +0 -171
  141. data/doc/lib/bio/db/sam/bam_rb.html +0 -121
  142. data/doc/lib/bio/db/sam/faidx_rb.html +0 -117
  143. data/doc/lib/bio/db/sam/library_rb.html +0 -115
  144. data/doc/lib/bio/db/sam/pileup_rb.html +0 -171
  145. data/doc/lib/bio/db/sam/sam_rb.html +0 -121
  146. data/doc/lib/bio/db/sam/vcf_rb.html +0 -124
  147. data/doc/lib/bio/db/sam_rb.html +0 -115
  148. data/doc/lib/bio/db/vcf_rb.html +0 -124
  149. data/doc/rdoc-style.css +0 -208
  150. data/lib/bio/db/sam/bam.rb +0 -210
  151. data/lib/bio/db/sam/sam.rb +0 -86
  152. data/test/samples/pipe_char/test.bam +0 -0
  153. data/test/samples/pipe_char/test.bam.bai +0 -0
  154. data/test/samples/pipe_char/test.tam +0 -10
  155. data/test/samples/pipe_char/test_chr.fasta +0 -1000
  156. data/test/samples/pipe_char/test_chr.fasta.fai +0 -1
  157. data/test/samples/small/test +0 -0
  158. data/test/samples/small/test.bam +0 -0
  159. data/test/samples/small/test.fa +0 -20
  160. data/test/samples/small/test.fai +0 -0
@@ -1,4 +1,9 @@
1
- require 'ffi'
1
+ require 'bio-svgenes'
2
+ require 'bio'
2
3
  require 'bio/db/sam'
3
4
  require 'bio/db/pileup'
4
- require 'bio/db/vcf'
5
+ require 'bio/db/vcf'
6
+ require 'bio/db/alignment'
7
+ require 'bio/db/fastadb'
8
+ require 'open3'
9
+ require 'bio/BIOExtensions'
@@ -0,0 +1,89 @@
1
#Extensions to Bio::NucleicAcid for translating between bases and IUPAC
#ambiguity codes.
class Bio::NucleicAcid

  #Two-way lookup table. Single-letter keys map an IUPAC code to the bases it
  #stands for; multi-letter keys (bases in alphabetical order) map a set of
  #bases back to its code.
  IUPAC_CODES = {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  #Tells whether +base+ occurs in "acgtACGT".
  #Returns a MatchData (truthy) when it does and nil otherwise.
  #The base is matched literally: characters such as "*" or "." that show up
  #in pileup strings are not interpreted as regular expression syntax, and an
  #empty value is never considered a base.
  def self.is_unambiguous(base)
    query = base.to_s
    return nil if query.empty?
    "acgtACGT".match(Regexp.escape(query))
  end

  #Returns the upper case IUPAC code for a set of bases.
  #* bases any object whose to_s lists the bases, in any order and case,
  #  duplicates allowed
  #When the set has no IUPAC code the offending input is printed and 'N' is
  #returned.
  def self.to_IUAPC(bases)
    base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if base.nil?
      #Report the input that failed; the lookup result is always nil here.
      p "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  #Tells whether +base+ is one of the characters that +code+ maps to in
  #IUPAC_CODES. A code that is not in the table is never valid.
  def self.is_valid(code, base)
    allowed = IUPAC_CODES[code.to_s.downcase]
    return false if allowed.nil?
    allowed.chars.include?(base.to_s.downcase)
  end

end
59
+
60
+ #Monkey patching to Bio::Sequence to find snps between sequences. It assumes the
61
+ #sequences are already aligned and doesn't check if a base on the first sequence is
62
+ #valid on the second.
63
class Bio::Sequence
  #Counts the positions, over the length of seq1, at which the two sequences
  #hold different values. Both sequences are compared index by index, so they
  #are expected to be aligned already.
  def self.snps_between(seq1, seq2)
    (0..seq1.size-1).count { |position| seq1[position] != seq2[position] }
  end
end
72
+
73
class String
  #Monkey patching to count how many ambiguity codes are present in the string, for Nucleic Acids.
  #A character counts as ambiguous when Bio::NucleicAcid.is_unambiguous rejects it.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  #Counts how many characters of the string are upper case (A-Z), wherever
  #they occur in the string.
  def upper_case_count
    count('A-Z')
  end
end
@@ -0,0 +1,59 @@
1
+ module Bio
2
+ class DB
3
+
4
+ #a class to represent the SAM OPT values, presented in SAM as TAG:VTYPE:VALUE
5
class Tag
  attr_accessor :tag, :type, :value

  #Fills the tag from a "TAG:VTYPE:VALUE" string using fixed offsets:
  #characters 0-1 are the tag, character 3 the type and everything from
  #character 5 onwards the value.
  def set(str)
    @tag, @type, @value = str[0..1], str[3], str[5..-1]
  end
end
13
+
14
+ #Attributes from the flag field (see chapter 2.2.2 of the sam file documentation)
15
+ #query_strand and mate_strand are true if they are forward. It is the opposite to
16
+ #the definition in the BAM format for clarity.
17
+ #primary is the negation of is_negative from the BAM format
18
class Alignment
  attr_accessor :qname, :flag, :rname,:pos,:mapq,:cigar, :mrnm, :mpos, :isize, :seq, :qual, :tags, :al, :samstr, :calend, :qlen

  attr_accessor :is_paired, :is_mapped, :query_unmapped, :mate_unmapped, :query_strand, :mate_strand, :first_in_pair,:second_in_pair, :primary, :failed_quality, :is_duplicate

  #Builds the alignment from one tab separated SAM line.
  #The first eleven columns fill the mandatory fields (numeric ones are
  #converted with to_i), every further column is stored in +tags+ as a
  #Bio::DB::Tag keyed by its tag name, and the boolean attributes are derived
  #from the individual bits of the flag.
  def initialize(sam_string)
    columns = sam_string.chomp.split("\t")

    @qname = columns[0]
    @flag  = columns[1].to_i
    @rname = columns[2]
    @pos   = columns[3].to_i
    @mapq  = columns[4].to_i
    @cigar = columns[5]
    @mrnm  = columns[6]
    @mpos  = columns[7].to_i
    @isize = columns[8].to_i
    @seq   = columns[9]
    @qual  = columns[10]

    @tags = {}
    (columns[11..-1] || []).each do |field|
      parsed = Bio::DB::Tag.new
      parsed.set(field)
      @tags[parsed.tag] = parsed
    end

    bit_set = lambda { |mask| (@flag & mask) > 0 }

    @is_paired      = bit_set.call(0x0001)
    @is_mapped      = bit_set.call(0x0002)
    @query_unmapped = bit_set.call(0x0004)
    @mate_unmapped  = bit_set.call(0x0008)
    #The strand attributes are true when the corresponding bit is NOT set.
    @query_strand   = !bit_set.call(0x0010)
    @mate_strand    = !bit_set.call(0x0020)
    @first_in_pair  = bit_set.call(0x0040)
    @second_in_pair = bit_set.call(0x0080)
    #primary is true when bit 0x0100 is NOT set.
    @primary        = !bit_set.call(0x0100)
    @failed_quality = bit_set.call(0x0200)
    @is_duplicate   = bit_set.call(0x0400)
  end
end
58
+ end
59
+ end
@@ -0,0 +1,255 @@
1
+ #Module to hold the information about the fasta file
2
+
3
+ module Bio::DB::Fasta
4
+ #This class contains the entries in a fasta, as generated by samtools faidx
5
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries=[]
    @entries_map = Hash.new
  end

  #Appends an entry and registers it under its id.
  #This doesnt validate if you are adding the same entry twice: the entry is
  #appended again and the id lookup points to the latest one.
  def << (entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  #Yields every entry in insertion order; this is what makes the Enumerable
  #methods work on the index.
  def each(&block)
    @entries.each(&block)
  end

  #Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  #Returns a new Index just with the specified position or range, as if it
  #was an Array. The return object is always of type Index; a position
  #outside the index gives an empty Index.
  def [](args)
    selected = @entries[args]
    #Array#[] gives a single entry (or nil) for an integer and an Array for a range.
    selected = [selected].compact unless selected.is_a?(Array)
    new_index = Index.new
    selected.each do | entry |
      new_index << entry
    end
    new_index
  end

  #Returns the entry that was registered under the given id, or nil when the
  #id is unknown.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
46
+
47
class Entry
  attr_reader :id, :length

  #* id name of the sequence
  #* length length of the sequence, converted with to_i
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  #Builds a forward Region that goes from position 1 to the length of the entry.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 1
      whole.end = @length
      whole.orientation = :forward
    end
  end

  alias_method :to_region, :get_full_region

end
67
+
68
+ #Class to wrap a region of a chromosome
69
class Region
  #Template with a zero count for every base; it is copied, never shared.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
  attr_accessor :entry, :start, :end, :orientation
  attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases, :total_cov, :called

  #Returns an array with the ratio of +base+ at each of the first +size+
  #positions of base_ratios. The array is cached per base.
  #TODO: Debug, as it hasnt been tested in the actual code.
  def base_ratios_for_base(base)
    @all_ratios ||= Hash.new
    @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
  end

  #Calculates the concensus, base ratios, coverages and total coverages in the region
  #from the pileup and the reference that were set on the region.
  #* min_cov minimum coverage to make a call (default 0); a position is
  #  called only when its coverage is strictly greater than this value
  #* min_per minimum representation to make make a call (default 0.20). If more
  #  than one base can be called, the IUAPC ambiguity code is returned
  #Returns the region itself.
  def calculate_stats_from_pile(opts={})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    #Lower case copy of the reference; called positions are overwritten in upper case.
    called_sequence = self.reference.downcase

    #Every position gets its own copy of the zero counts, so changing one
    #position can never leak into the others or into the constant.
    self.base_ratios = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    pileup.each do | pile |
      if pile.coverage > min_cov
        offset = pile.pos - start
        base_ratios[offset] = pile.base_ratios
        called_sequence[offset] = pile.consensus_iuap(min_per).upcase
        coverages[offset] = pile.coverage.to_i
        bases[offset] = pile.bases
        self.called += 1
      end
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(called_sequence)
    consensus.na
    consensus.reverse_complement!() if orientation == :reverse
    self.average_coverage = total_cov.to_f/size.to_f
    self
  end

  #Returns the region in the form "name:start-end"
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  #Returns a region object from a string in form "name:start-end". Single
  #quotes in the string are ignored. The orientation is :reverse when the end
  #is smaller than the start and :forward otherwise.
  #Raises FastaDBException when the string doesn't have exactly one ":" separated
  #name and one "-" separated pair of coordinates.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    #Only look at the coordinates once we know there is something after the name.
    fields_2 = fields_1.length == 2 ? fields_1[1].split("-") : []
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2 || fields_2.length != 2

    reg = Region.new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  #Length of the region, calculated as end - start.
  #NOTE(review): for an inclusive start..end range this is one less than the
  #number of positions, and it is negative for reverse regions - confirm with
  #the callers before changing it.
  def size
    @end - @start
  end
  alias_method :length, :size

end
156
+
157
+ class FastaDBException < StandardError; end
158
+
159
+ #Class that holds the fasta file. It is used as a database.
160
class FastaFile
  attr_reader :index, :fasta_path

  #Initialize the fasta file. If the fai file doesn't exists, it is generated at startup
  #* fasta path to the fasta file
  #* samtools path to samtools, if it is not provided, use the bundled version
  #Raises FastaDBException when no fasta path is given.
  def initialize(args)
    @fasta_path = args[:fasta]
    @samtools = args[:samtools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    build_index_file unless File.file?(@fai_file)
  end

  #Loads the fai entries into the index (only the first time it is called)
  #and returns the number of entries. The first two tab separated columns of
  #every line are used as id and length.
  def load_fai_entries()
    return @index.length if @index
    loaded = Index.new
    #File.foreach closes the file when it is done, even on errors.
    File.foreach(@fai_file) do | line |
      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1])
    end
    #The index is only kept once the whole file was read.
    @index = loaded
    @index.length
  end

  #Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk and return the result of the system call. If a region is specified, the subsequence is retrieved with fetch_sequence and returned as a String.
  #Options - if a subsequence is required
  #* chr - [STRING] the reference name of the subsequence
  #* start - [INT] the start position for the subsequence
  #* stop - [INT] the stop position for the subsequence
  def faidx(opts={})
    if opts.has_key?(:chr) && opts.has_key?(:start) && opts.has_key?(:stop)
      region = Region.new
      region.entry = opts[:chr]
      region.start = opts[:start]
      region.end = opts[:stop]
      region.orientation = :forward
      fetch_sequence(region).to_s
    else
      build_index_file
    end
  end

  #Fetches the sequence of a region as a Bio::Sequence::NA.
  #The region needs to have a method to_region or a method to_s that has the format "chromosome:start-end" as in samtools.
  #When the region has an orientation and it is :reverse, the reverse
  #complement is returned.
  def fetch_sequence(region)
    region = region.to_region if region.respond_to?(:to_region)
    query = region.to_s
    command = "#{@samtools} faidx '#{@fasta_path}' '#{query}'"
    puts command if $VERBOSE
    @last_command = command

    #Keep everything but the fasta header lines.
    sequence_lines = []
    yield_from_pipe(command, String, :text ) {|line| sequence_lines << line unless line =~ /^>/}

    reference = Bio::Sequence::NA.new(sequence_lines.join)
    if region.respond_to?(:orientation) && region.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private

  #Runs "samtools faidx" on the fasta file to (re)create the .fai index.
  #Returns the result of Kernel#system.
  def build_index_file
    command = "#{@samtools} faidx '#{@fasta_path}'"
    @last_command = command
    system(command)
  end

  #Runs the command and yields its standard output to the block.
  #* type :text yields klass.new(line) for every chomped line, skipping the
  #  lines that start with comment_char when skip_comments is set;
  #  :binary yields the whole output in one chunk
  #Returns Process::Status with the execution status. If run in a $VERBOSE environment, stderr of the process
  #is forwarded to the default stdout
  def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
    #The block form closes the three pipes even when the caller's block raises.
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      #Nothing is written to the process.
      stdin.close
      #Drain stderr in the background so a chatty process can't block on a full pipe.
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = pipe.gets)
          next if skip_comments && line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (c = pipe.gets(nil))
          yield c
        end
      end
      exit_status = wait_thr.value # Process::Status object returned.
      stderr_output = stderr_reader.value
      puts stderr_output if $VERBOSE
      exit_status
    end
  end
end
255
+ end
@@ -26,185 +26,234 @@
26
26
  #
27
27
  module Bio
28
28
  class DB
29
- class Pileup
30
- attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
31
-
32
- #creates the Pileup object
33
- # pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
34
- # pile = Bio::DB::Pileup.new(pile_up_line)
35
- def initialize(pile_up_line)
36
- cols = pile_up_line.split(/\t/)
37
- if cols.length == 6 ##should only be able to get 6 lines from mpileup
38
- @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
39
- elsif (10..13).include?(cols.length) ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
40
- if cols[2] == '*' #indel
41
- @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
42
- else #snp / identity
43
- @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
29
+ class Pileup
30
+ attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
31
+
32
+ #creates the Pileup object
33
+ # pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
34
+ # pile = Bio::DB::Pileup.new(pile_up_line)
35
+ def initialize(pile_up_line)
36
+ cols = pile_up_line.split(/\t/)
37
+ if cols.length == 6 ##should only be able to get 6 lines from mpileup
38
+ @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
39
+ elsif (10..13).include?(cols.length) ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
40
+ if cols[2] == '*' #indel
41
+ @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
42
+ else #snp / identity
43
+ @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
44
+ end
45
+ @consensus_quality = @consensus_quality.to_f
46
+ @snp_quality = @snp_quality.to_f
47
+ @rms_mapq = @rms_mapq.to_f
48
+ else
49
+ #raise RuntimeError, "parsing line '#{pile_up_line.chomp}' failed"
50
+ end
51
+
52
+ @pos = @pos.to_i
53
+ @coverage = @coverage.to_f
54
+ @ref_count = nil
55
+ @non_ref_count_hash = nil
56
+ @non_ref_count = nil
44
57
  end
45
- @consensus_quality = @consensus_quality.to_f
46
- @snp_quality = @snp_quality.to_f
47
- @rms_mapq = @rms_mapq.to_f
48
- else
49
- #raise RuntimeError, "parsing line '#{pile_up_line.chomp}' failed"
50
- end
51
58
 
52
- @pos = @pos.to_i
53
- @coverage = @coverage.to_f
54
- @ref_count = nil
55
- @non_ref_count_hash = nil
56
- @non_ref_count = nil
57
- end
58
-
59
- # Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts, returns a hash
60
- # pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
61
- def non_refs
62
- if @non_ref_count_hash.nil?
63
- @non_ref_count_hash = {:A => self.read_bases.count("Aa"), :C => self.read_bases.count("Cc"), :G => self.read_bases.count("Gg"), :T => self.read_bases.count("Tt")}
64
- end
65
- @non_ref_count_hash
66
- end
67
-
68
- # returns the total non-reference bases in the reads at this position
69
- def non_ref_count
70
- if @non_ref_count.nil?
71
- @non_ref_count = @read_bases.count("ATGCatgc").to_f
72
- end
73
- @non_ref_count
74
- end
75
-
76
- # returns the count of reference-bases in the reads at this position
77
- def ref_count
78
- if @ref_count.nil?
79
- @ref_count = self.read_bases.count(".,")
80
- end
81
- @ref_count
82
- end
83
-
84
- # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
85
- def consensus
86
- if @consensus.nil?
87
- max = self.non_refs.values.max
88
- if (self.ref_count / self.coverage) > 0.5
89
- @consensus = self.ref_base
90
- elsif self.ref_count > max
91
- @consensus = self.ref_base
59
+ #Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts
60
+ #returns a hash pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
61
+ def non_refs
62
+ if @non_ref_count_hash.nil?
63
+ @non_ref_count_hash = {:A => self.read_bases.count("Aa"), :C => self.read_bases.count("Cc"), :G => self.read_bases.count("Gg"), :T => self.read_bases.count("Tt")}
64
+ end
65
+ @non_ref_count_hash
66
+ end
67
+
68
+ # returns the total non-reference bases in the reads at this position
69
+ def non_ref_count
70
+ if @non_ref_count.nil?
71
+ @non_ref_count = @read_bases.count("ATGCatgc").to_f
72
+ end
73
+ @non_ref_count
74
+ end
75
+
76
+ # returns the count of reference-bases in the reads at this position
77
+ def ref_count
78
+ if @ref_count.nil?
79
+ @ref_count = self.read_bases.count(".,")
80
+ end
81
+ @ref_count
82
+ end
83
+
84
+ # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
85
+ def consensus
86
+ if @consensus.nil?
87
+ max = self.non_refs.values.max
88
+ if (self.ref_count / self.coverage) > 0.5
89
+ @consensus = self.ref_base
90
+ elsif self.ref_count > max
91
+ @consensus = self.ref_base
92
+ else
93
+ arr = self.non_refs.select {|k,v| v == max }
94
+ bases = arr.collect {|b| b[0].to_s }
95
+ bases << self.ref_base if self.ref_count == max
96
+ @consensus = bases.sort.join
97
+ end
98
+ end
99
+ @consensus
100
+ end
101
+
102
+ #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
103
+ def to_vcf
104
+ alt,g = self.genotype_list
105
+ alt = self.consensus.split(//).join(',') unless self.ref_base == '*'
106
+ alt = '.' if alt == self.ref_base
107
+ [self.ref_name, self.pos, '.', self.ref_base, alt, self.snp_quality.to_i, "0", "DP=#{self.coverage.to_i}", "GT:GQ:DP", "#{g}:#{self.consensus_quality.to_i}:#{self.coverage.to_i}" ].join("\t")
108
+ end
109
+
110
+ private
111
#Returns the VCF (v3.3) header: the meta-information lines followed by the
#tab separated column names, joined with newlines.
def Pileup.vcf_header
  [
    '##fileformat=VCFv3.3',
    '##INFO=DP,1,Integer,"Total Depth"',
    '##FORMAT=GT,1,String,"Genotype"',
    '##FORMAT=GQ,1,Integer,"Genotype Quality"',
    '##FORMAT=DP,1,Integer,"Read Depth"',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA"
  ].join("\n")
end
120
+
121
+
122
+
123
+ #returns the genotype of the indel
124
+ def indel_gt
125
+ return "undef" if self.consensus.instance_of?(Array)
126
+ al1, al2 = self.consensus.split(/\//)
127
+ if al1 == al2 && al1 == '*'
128
+ al1=self.indel_1
129
+ al2=self.indel_2
130
+ end
131
+ alt1 = parse_indel(al1)
132
+ alt2 = parse_indel(al2)
133
+ alt,gt = nil,nil
134
+
135
+ return nil if !alt1 and !alt2
136
+ if !alt1
137
+ alt = alt2
138
+ gt = '0/1'
139
+ elsif !alt2
140
+ alt = alt1
141
+ gt - '0/1'
142
+ elsif alt1 == alt2
143
+ alt = alt1
144
+ gt = '1/1'
92
145
  else
93
- arr = self.non_refs.select {|k,v| v == max }
94
- bases = arr.collect {|b| b[0].to_s }
95
- bases << self.ref_base if self.ref_count == max
96
- @consensus = bases.sort.join
146
+ alt="#{alt1},#{alt2}"
147
+ gt= '1/2'
97
148
  end
149
+ return [alt, gt]
150
+
98
151
  end
99
- @consensus
100
- end
101
-
102
- #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
103
- def to_vcf
152
+ #returns the genotype of the snp
153
+ def snp_gt
154
+ return ['.','0/0'] if self.ref_base == self.consensus
155
+ bases = Pileup.iupac_to_base(self.consensus)
156
+ if bases[0] == self.ref_base
157
+ return [bases[1],'0/1']
158
+ elsif bases[1] == self.ref_base
159
+ return [bases[0],'0/1']
160
+ else
161
+ return ["#{bases[0]},#{bases[1]}",'1/2']
162
+ end
163
+ end
164
+
165
+ #identifies the reference base and returns the indel or snp genotype as applicable
166
+ public
167
+ def genotype_list
168
+ if self.ref_base == '*'
169
+ return indel_gt
170
+ else
171
+ return snp_gt
172
+ end
173
+ end
174
+
175
+ #returns the two bases for the corresponding iupac code
176
+ public
177
+ def Pileup.iupac_to_base(alt_base)
178
+ case alt_base
179
+ when 'K' then ['G','T']
180
+ when 'M' then ['A','C']
181
+ when 'S' then ['C','G']
182
+ when 'R' then ['A','G']
183
+ when 'W' then ['A','T']
184
+ when 'Y' then ['C','T']
185
+ else alt_base.split(//)
186
+ end
187
+ end
188
+
189
+ #identifies if the indel is an insertion or a deletion
190
+ def parse_indel(alt)
191
+ return "D#{$'.length}" if alt =~/^-/
192
+ if alt=~/^\+/
193
+ return "I#{$'}"
194
+ elsif alt == '*'
195
+ return nil
196
+ end
197
+ end
198
+
199
+
200
+ #returns pileup format line
201
+ def to_s
202
+ if @read_quals and !@consensus_quality #6col
203
+ [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
204
+ elsif @indel_1 #13 cols
205
+ [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
206
+ else #10 cols
207
+ [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
208
+ end
209
+
210
+ end
211
+
212
+
213
#Returns a hash with the count of every base at this position: the
#non-reference counts plus the number of reads that agree with the
#reference, stored under the upper case reference base. The hash is cached.
def bases
  return @bases if @bases
  #Work on a copy, otherwise the reference count ends up in the hash that
  #non_refs keeps returning.
  @bases = non_refs.dup
  @bases[ref_base.upcase.to_sym] = ref_count
  @bases
end
104
220
 
105
- alt,g = self.genotype_list
106
- alt = self.consensus.split(//).join(',') unless self.ref_base == '*'
107
- alt = '.' if alt == self.ref_base
108
- [self.ref_name, self.pos, '.', self.ref_base, alt, self.snp_quality.to_i, "0", "DP=#{self.coverage.to_i}", "GT:GQ:DP", "#{g}:#{self.consensus_quality.to_i}:#{self.coverage.to_i}" ].join("\t")
109
- end
110
-
111
- private
112
- def Pileup.vcf_header
113
- %{##fileformat=VCFv3.3
114
- ##INFO=DP,1,Integer,"Total Depth"
115
- ##FORMAT=GT,1,String,"Genotype"
116
- ##FORMAT=GQ,1,Integer,"Genotype Quality"
117
- ##FORMAT=DP,1,Integer,"Read Depth"
118
- #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA
119
- }.join("\n")
120
- end
121
-
122
- def parse_indel(alt)
221
#Returns the sum of the counts of all the bases at this position. The counts
#come from the bases method, so this works even when it is the first method
#that is called on the object.
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
123
228
 
124
- return "D#{$'.length}" if alt =~/^-/
125
- if alt=~/^\+/
126
- return "I#{$'}"
127
- elsif alt == '*'
128
- return nil
129
- end
130
- end
131
-
132
- def indel_gt
133
- return "undef" if self.consensus.instance_of?(Array)
134
- al1, al2 = self.consensus.split(/\//)
135
- if al1 == al2 && al1 == '*'
136
- al1=self.indel_1
137
- al2=self.indel_2
138
- end
139
- alt1 = parse_indel(al1)
140
- alt2 = parse_indel(al2)
141
- alt,gt = nil,nil
142
-
143
- return nil if !alt1 and !alt2
144
- if !alt1
145
- alt = alt2
146
- gt = '0/1'
147
- elsif !alt2
148
- alt = alt1
149
- gt - '0/1'
150
- elsif alt1 == alt2
151
- alt = alt1
152
- gt = '1/1'
153
- else
154
- alt="#{alt1},#{alt2}"
155
- gt= '1/2'
156
- end
157
- return [alt, gt]
158
-
159
- end
160
-
161
- def snp_gt
162
- return ['.','0/0'] if self.ref_base == self.consensus
163
- bases = Pileup.iupac_to_base(self.consensus)
164
- if bases[0] == self.ref_base
165
- return [bases[1],'0/1']
166
- elsif bases[1] == self.ref_base
167
- return [bases[0],'0/1']
168
- else
169
- return ["#{bases[0]},#{bases[1]}",'1/2']
170
- end
171
- end
172
-
173
- public
174
- def genotype_list
175
- if self.ref_base == '*'
176
- return indel_gt
177
- else
178
- return snp_gt
179
- end
180
- end
181
-
182
- public
183
- #returns
184
- def Pileup.iupac_to_base(alt_base)
185
- case alt_base
186
- when 'K' then ['G','T']
187
- when 'M' then ['A','C']
188
- when 'S' then ['C','G']
189
- when 'R' then ['A','G']
190
- when 'W' then ['A','T']
191
- when 'Y' then ['C','T']
192
- else alt_base.split(//)
193
- end
194
- end
195
-
196
- #returns pileup format line
197
- def to_s
198
- if @read_quals and !@consensus_quality #6col
199
- [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
200
- elsif @indel_1 #13 cols
201
- [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
202
- else #10 cols
203
- [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
229
#Returns a hash with, for every base, its count divided by the base
#coverage of the position. The hash is calculated once and cached.
def base_ratios
  @base_ratios ||= bases.each_with_object(Hash.new) do |(base, count), ratios|
    ratios[base] = count.to_f/base_coverage.to_f
  end
end
238
+
239
# returns the consensus at this position as an IUPAC code: every base whose count
# divided by the coverage is greater than minumum_ratio_for_iup_consensus takes part
# in the code, which is built with Bio::NucleicAcid.to_IUAPC. If no base passes the
# threshold, the reference base is returned in lower case.
# The result is cached for each threshold.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= Hash.new
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] ||= begin
    passing = String.new
    bases.each do |base, count|
      passing << base.to_s[0, 1] if count/coverage.to_f > minumum_ratio_for_iup_consensus
    end
    passing.length > 0 ? Bio::NucleicAcid.to_IUAPC(passing) : ref_base.downcase
  end
end
255
+
256
+
204
257
  end
205
-
206
258
  end
207
-
208
- end
209
- end
210
259
  end