bio-samtools 0.6.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -2
  3. data/README.md +4 -7
  4. data/VERSION +1 -1
  5. data/bio-samtools.gemspec +47 -105
  6. data/doc/Bio.html +68 -131
  7. data/doc/Bio/DB.html +51 -111
  8. data/doc/Bio/DB/Alignment.html +135 -363
  9. data/doc/Bio/DB/Pileup.html +183 -170
  10. data/doc/Bio/DB/SAM.html +1396 -820
  11. data/doc/Bio/DB/SAM/Library.html +73 -123
  12. data/doc/Bio/DB/SAM/Tools.html +51 -273
  13. data/doc/Bio/DB/Tag.html +78 -124
  14. data/doc/Bio/DB/Vcf.html +111 -147
  15. data/doc/LICENSE_txt.html +113 -148
  16. data/doc/created.rid +9 -10
  17. data/doc/fonts.css +167 -0
  18. data/doc/fonts/Lato-Light.ttf +0 -0
  19. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  20. data/doc/fonts/Lato-Regular.ttf +0 -0
  21. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  22. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  23. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  24. data/doc/images/add.png +0 -0
  25. data/doc/images/arrow_up.png +0 -0
  26. data/doc/images/delete.png +0 -0
  27. data/doc/images/tag_blue.png +0 -0
  28. data/doc/index.html +48 -54
  29. data/doc/js/darkfish.js +9 -22
  30. data/doc/js/search.js +20 -5
  31. data/doc/js/search_index.js +1 -1
  32. data/doc/rdoc.css +255 -218
  33. data/doc/table_of_contents.html +256 -137
  34. data/ext/Rakefile +57 -0
  35. data/lib/bio-samtools.rb +7 -2
  36. data/lib/bio/BIOExtensions.rb +89 -0
  37. data/lib/bio/db/alignment.rb +59 -0
  38. data/lib/bio/db/fastadb.rb +255 -0
  39. data/lib/bio/db/pileup.rb +221 -172
  40. data/lib/bio/db/sam.rb +639 -589
  41. data/lib/bio/db/sam/{faidx.rb → faidx_old.rb} +0 -0
  42. data/lib/bio/db/vcf.rb +69 -68
  43. data/test/.gitignore +1 -0
  44. data/test/{test_basic.rb → old_test_basic.rb} +33 -1
  45. data/test/samples/small/dupes.bam +0 -0
  46. data/test/samples/small/dupes.sam +274 -0
  47. data/test/samples/small/map_for_reheader.sam +8 -0
  48. data/test/samples/small/map_to_merge1.bam +0 -0
  49. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  50. data/test/samples/small/map_to_merge1.sam +8 -0
  51. data/test/samples/small/map_to_merge2.bam +0 -0
  52. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  53. data/test/samples/small/map_to_merge2.sam +8 -0
  54. data/test/samples/small/no_md.sam +8 -0
  55. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  56. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  57. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  58. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  59. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  60. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  61. data/test/samples/small/test_cov.svg +273 -0
  62. data/test/samples/small/testu.bam.bai +0 -0
  63. data/test/svg +133 -0
  64. data/test/test_pileup.rb +84 -0
  65. data/test/test_sam.rb +331 -0
  66. data/test/test_vcf.rb +11 -0
  67. data/{doc → tutorial}/tutorial.html +0 -0
  68. data/{doc → tutorial}/tutorial.pdf +0 -0
  69. metadata +56 -114
  70. data/doc/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -159
  71. data/doc/Bio/DB/SAM/Tools/Bam1T.html +0 -220
  72. data/doc/Bio/DB/SAM/Tools/BamHeaderT.html +0 -249
  73. data/doc/Bio/DB/SAM/Tools/BamPileup1T.html +0 -159
  74. data/doc/Bio/DB/SAM/Tools/SamfileT.html +0 -171
  75. data/doc/Bio/DB/SAM/Tools/SamfileTX.html +0 -159
  76. data/doc/Bio/DB/SAMException.html +0 -205
  77. data/doc/LibC.html +0 -155
  78. data/doc/Pileup.html +0 -571
  79. data/doc/Vcf.html +0 -473
  80. data/doc/basic_styles.css +0 -31
  81. data/doc/classes/Bio.html +0 -139
  82. data/doc/classes/Bio/DB.html +0 -137
  83. data/doc/classes/Bio/DB/Alignment.html +0 -441
  84. data/doc/classes/Bio/DB/Alignment.src/M000012.html +0 -19
  85. data/doc/classes/Bio/DB/Alignment.src/M000013.html +0 -27
  86. data/doc/classes/Bio/DB/Alignment.src/M000014.html +0 -45
  87. data/doc/classes/Bio/DB/Alignment.src/M000015.html +0 -40
  88. data/doc/classes/Bio/DB/SAM.html +0 -510
  89. data/doc/classes/Bio/DB/SAM/Library.html +0 -135
  90. data/doc/classes/Bio/DB/SAM/Library.src/M000006.html +0 -28
  91. data/doc/classes/Bio/DB/SAM/Tools.html +0 -278
  92. data/doc/classes/Bio/DB/SAM/Tools.src/M000007.html +0 -20
  93. data/doc/classes/Bio/DB/SAM/Tools/Bam1CoreT.html +0 -111
  94. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.html +0 -150
  95. data/doc/classes/Bio/DB/SAM/Tools/Bam1T.src/M000010.html +0 -20
  96. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.html +0 -169
  97. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000008.html +0 -19
  98. data/doc/classes/Bio/DB/SAM/Tools/BamHeaderT.src/M000009.html +0 -18
  99. data/doc/classes/Bio/DB/SAM/Tools/BamPileup1T.html +0 -111
  100. data/doc/classes/Bio/DB/SAM/Tools/SamfileT.html +0 -129
  101. data/doc/classes/Bio/DB/SAM/Tools/SamfileTX.html +0 -111
  102. data/doc/classes/Bio/DB/SAMException.html +0 -140
  103. data/doc/classes/Bio/DB/SAMException.src/M000016.html +0 -18
  104. data/doc/classes/Bio/DB/Sam.src/M000017.html +0 -43
  105. data/doc/classes/Bio/DB/Sam.src/M000018.html +0 -42
  106. data/doc/classes/Bio/DB/Sam.src/M000019.html +0 -18
  107. data/doc/classes/Bio/DB/Sam.src/M000020.html +0 -22
  108. data/doc/classes/Bio/DB/Sam.src/M000021.html +0 -19
  109. data/doc/classes/Bio/DB/Sam.src/M000022.html +0 -25
  110. data/doc/classes/Bio/DB/Sam.src/M000023.html +0 -28
  111. data/doc/classes/Bio/DB/Sam.src/M000024.html +0 -28
  112. data/doc/classes/Bio/DB/Sam.src/M000025.html +0 -46
  113. data/doc/classes/Bio/DB/Sam.src/M000026.html +0 -24
  114. data/doc/classes/Bio/DB/Sam.src/M000027.html +0 -19
  115. data/doc/classes/Bio/DB/Sam.src/M000028.html +0 -24
  116. data/doc/classes/Bio/DB/Sam.src/M000029.html +0 -41
  117. data/doc/classes/Bio/DB/Sam.src/M000030.html +0 -31
  118. data/doc/classes/Bio/DB/Sam.src/M000031.html +0 -86
  119. data/doc/classes/Bio/DB/Sam.src/M000032.html +0 -34
  120. data/doc/classes/Bio/DB/Tag.html +0 -160
  121. data/doc/classes/Bio/DB/Tag.src/M000011.html +0 -21
  122. data/doc/classes/LibC.html +0 -105
  123. data/doc/classes/Pileup.html +0 -374
  124. data/doc/classes/Pileup.src/M000001.html +0 -34
  125. data/doc/classes/Pileup.src/M000002.html +0 -21
  126. data/doc/classes/Pileup.src/M000003.html +0 -21
  127. data/doc/classes/Pileup.src/M000004.html +0 -21
  128. data/doc/classes/Pileup.src/M000005.html +0 -31
  129. data/doc/files/lib/bio-samtools_rb.html +0 -109
  130. data/doc/files/lib/bio/db/sam/bam_rb.html +0 -108
  131. data/doc/files/lib/bio/db/sam/faidx_rb.html +0 -108
  132. data/doc/files/lib/bio/db/sam/library_rb.html +0 -101
  133. data/doc/files/lib/bio/db/sam/pileup_rb.html +0 -178
  134. data/doc/files/lib/bio/db/sam/sam_rb.html +0 -113
  135. data/doc/files/lib/bio/db/sam_rb.html +0 -111
  136. data/doc/fr_class_index.html +0 -43
  137. data/doc/fr_file_index.html +0 -33
  138. data/doc/fr_method_index.html +0 -58
  139. data/doc/lib/bio-samtools_rb.html +0 -115
  140. data/doc/lib/bio/db/pileup_rb.html +0 -171
  141. data/doc/lib/bio/db/sam/bam_rb.html +0 -121
  142. data/doc/lib/bio/db/sam/faidx_rb.html +0 -117
  143. data/doc/lib/bio/db/sam/library_rb.html +0 -115
  144. data/doc/lib/bio/db/sam/pileup_rb.html +0 -171
  145. data/doc/lib/bio/db/sam/sam_rb.html +0 -121
  146. data/doc/lib/bio/db/sam/vcf_rb.html +0 -124
  147. data/doc/lib/bio/db/sam_rb.html +0 -115
  148. data/doc/lib/bio/db/vcf_rb.html +0 -124
  149. data/doc/rdoc-style.css +0 -208
  150. data/lib/bio/db/sam/bam.rb +0 -210
  151. data/lib/bio/db/sam/sam.rb +0 -86
  152. data/test/samples/pipe_char/test.bam +0 -0
  153. data/test/samples/pipe_char/test.bam.bai +0 -0
  154. data/test/samples/pipe_char/test.tam +0 -10
  155. data/test/samples/pipe_char/test_chr.fasta +0 -1000
  156. data/test/samples/pipe_char/test_chr.fasta.fai +0 -1
  157. data/test/samples/small/test +0 -0
  158. data/test/samples/small/test.bam +0 -0
  159. data/test/samples/small/test.fa +0 -20
  160. data/test/samples/small/test.fai +0 -0
@@ -1,4 +1,9 @@
1
- require 'ffi'
1
+ require 'bio-svgenes'
2
+ require 'bio'
2
3
  require 'bio/db/sam'
3
4
  require 'bio/db/pileup'
4
- require 'bio/db/vcf'
5
+ require 'bio/db/vcf'
6
+ require 'bio/db/alignment'
7
+ require 'bio/db/fastadb'
8
+ require 'open3'
9
+ require 'bio/BIOExtensions'
@@ -0,0 +1,89 @@
1
module Bio
  # Adds IUPAC ambiguity-code helpers to Bio::NucleicAcid.
  class NucleicAcid

    # Two-way lookup table, all in lower case:
    # * a single code or base maps to the bases it stands for ('y' => 'ct')
    # * an alphabetically sorted set of bases maps to its code ('ct' => 'y')
    IUPAC_CODES = {

      'y' => 'ct',
      'r' => 'ag',
      'w' => 'at',
      's' => 'cg',
      'k' => 'gt',
      'm' => 'ac',

      'b' => 'cgt',
      'd' => 'agt',
      'h' => 'act',
      'v' => 'acg',

      'n' => 'acgt',

      'a' => 'a',
      't' => 't',
      'g' => 'g',
      'c' => 'c',
      'u' => 'u',

      'ct' => 'y',
      'ag' => 'r',
      'at' => 'w',
      'cg' => 's',
      'gt' => 'k',
      'ac' => 'm',

      'cgt' => 'b',
      'agt' => 'd',
      'act' => 'h',
      'acg' => 'v',

      'acgt' => 'n'
    }

    # Tells whether +base+ occurs literally in "acgtACGT".
    # The argument is compared as plain text, not as a regular expression,
    # so characters such as "." are reported as ambiguous. nil and the empty
    # string return false.
    def self.is_unambiguous(base)
      text = base.to_s
      !text.empty? && "acgtACGT".include?(text)
    end

    # Looks up the upper-case table entry for a set of bases.
    # +bases+ is converted with to_s, lower-cased, sorted and de-duplicated
    # before the lookup, so "TC", "ct" and "cCt" all return "Y".
    # When the set is not in the table the offending input is printed and
    # "N" is returned instead of raising.
    def self.to_IUAPC(bases)
      key = bases.to_s.downcase.chars.sort.uniq.join
      code = IUPAC_CODES[key]
      if code.nil?
        # Report the input that failed (the lookup result is always nil here).
        p "Invalid base! #{bases}"
        code = 'n' #This is a patch... as one of the scripts failed here.
      end
      code.upcase
    end

    # Tells whether +base+ is one of the characters listed for +code+ in
    # IUPAC_CODES (case-insensitive). An unknown +code+ returns false.
    def self.is_valid(code, base)
      expansion = IUPAC_CODES[code.downcase]
      return false if expansion.nil?
      expansion.chars.include?(base.downcase)
    end

  end
end
59
+
60
#Extends Bio::Sequence with a SNP counter. The two sequences are assumed to be
#aligned already; positions are compared one to one and no check is made that a
#base of the first sequence is valid in the second.
class Bio::Sequence
  #Returns how many positions of seq1 hold a different value in seq2.
  #Only the indexes of seq1 are visited.
  def self.snps_between(seq1, seq2)
    (0..seq1.size-1).count { |position| seq1[position] != seq2[position] }
  end
end
72
+
73
class String
  #Counts the characters for which Bio::NucleicAcid.is_unambiguous is falsy,
  #i.e. how many ambiguity codes the string holds when read as nucleic acids.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  #Counts how many characters are uppercase (A-Z), wherever they occur.
  def upper_case_count
    count("A-Z")
  end
end
@@ -0,0 +1,59 @@
1
+ module Bio
2
+ class DB
3
+
4
#Holds one optional SAM field, which SAM writes as TAG:VTYPE:VALUE
class Tag
  attr_accessor :tag, :type, :value

  #Fills the three attributes from str by fixed character position:
  #characters 0-1 are the tag, character 3 the type and everything from
  #character 5 onwards the value (so the value may itself contain ':').
  def set(str)
    @tag = str.slice(0..1)
    @type = str.slice(3)
    @value = str.slice(5..-1)
  end
end
13
+
14
#One SAM alignment line. Besides the eleven mandatory columns and the optional
#tags, the bits of the flag field are exposed as boolean attributes (see chapter
#2.2.2 of the sam file documentation).
#query_strand and mate_strand are true when the read is forward, i.e. when bits
#0x0010 / 0x0020 are NOT set. primary is true when bit 0x0100 is NOT set.
#NOTE(review): is_mapped reflects bit 0x0002, which the SAM documentation appears
#to describe as "properly aligned" - confirm the intended meaning.
class Alignment
  attr_accessor :qname, :flag, :rname,:pos,:mapq,:cigar, :mrnm, :mpos, :isize, :seq, :qual, :tags, :al, :samstr, :calend, :qlen

  attr_accessor :is_paired, :is_mapped, :query_unmapped, :mate_unmapped, :query_strand, :mate_strand, :first_in_pair,:second_in_pair, :primary, :failed_quality, :is_duplicate

  #Splits the tab separated SAM string into its columns and sets the attributes.
  #Numeric columns are converted with to_i; every column after the eleventh is
  #parsed into a Bio::DB::Tag and stored in the tags hash under its tag name.
  #The attributes al, samstr, calend and qlen are not assigned here.
  def initialize(sam_string)
    columns = sam_string.chomp.split("\t")
    @qname, raw_flag, @rname, raw_pos, raw_mapq, @cigar, @mrnm, raw_mpos, raw_isize, @seq, @qual = columns
    @flag = raw_flag.to_i
    @pos = raw_pos.to_i
    @mapq = raw_mapq.to_i
    @mpos = raw_mpos.to_i
    @isize = raw_isize.to_i

    @tags = {}
    (columns[11..-1] || []).each do |field|
      parsed = Bio::DB::Tag.new
      parsed.set(field)
      tags[parsed.tag] = parsed
    end

    #true when the given bit of the flag is set
    bit_set = lambda { |mask| (@flag & mask) > 0 }

    @is_paired = bit_set.call(0x0001)
    @is_mapped = bit_set.call(0x0002)
    @query_unmapped = bit_set.call(0x0004)
    @mate_unmapped = bit_set.call(0x0008)
    @query_strand = !bit_set.call(0x0010)
    @mate_strand = !bit_set.call(0x0020)
    @first_in_pair = bit_set.call(0x0040)
    @second_in_pair = bit_set.call(0x0080)
    @primary = !bit_set.call(0x0100)
    @failed_quality = bit_set.call(0x0200)
    @is_duplicate = bit_set.call(0x0400)
  end
end
58
+ end
59
+ end
@@ -0,0 +1,255 @@
1
+ #Module to hold the information about the fasta file
2
+
3
+ module Bio::DB::Fasta
4
#This class contains the entries in a fasta, as generated by samtools faidx
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  #Appends an entry and registers it under entry.id.
  #This doesnt validate if you are adding the same entry twice: the array keeps
  #both, while the id lookup keeps the last one added.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  #Yields every entry in insertion order (this is what Enumerable builds on).
  #Without a block an Enumerator is returned.
  def each(&block)
    @entries.each(&block)
  end

  #Total number of entries
  def length
    @entries.length
  end
  alias_method :size, :length

  #Returns a new Index just with the specified range, as if it was an Array.
  #The return object is of type Index. A single integer position gives an
  #Index with that one entry; a position outside the index gives an empty Index.
  def [](args)
    selected = @entries[args]
    unless selected.is_a?(Array)
      selected = selected.nil? ? [] : [selected]
    end
    subset = Index.new
    selected.each { |entry| subset << entry }
    subset
  end

  #Returns the object that was added under the id +entry+, or nil when the id
  #is unknown.
  #NOTE(review): an earlier comment promised a Region here, but the stored
  #entry itself is returned - confirm what callers expect.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
46
+
47
#One sequence of the fasta index: its id and its length (stored as an integer).
class Entry
  attr_reader :id, :length

  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  #Builds a forward Region that starts at 1 and ends at the entry length.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 1
      whole.end = @length
      whole.orientation = :forward
    end
  end

  alias_method :to_region, :get_full_region

end
67
+
68
#Class to wrap a region of a chromosome: the entry name, start, end and
#orientation, plus the statistics filled in by calculate_stats_from_pile.
class Region
  #Template used for positions that received no call; every slot gets its own copy.
  BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
  attr_accessor :entry, :start, :end, :orientation
  attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases, :total_cov, :called

  #Returns, for every position of the region, the ratio stored for +base+ in
  #base_ratios. The resulting array is cached per base until
  #calculate_stats_from_pile is run again.
  #TODO: Debug, as it hasnt been tested in the actual code.
  def base_ratios_for_base(base)
    @all_ratios ||= {}
    @all_ratios[base] ||= (0..size - 1).map { |position| @base_ratios[position][base] }
  end

  #Calculates the consensus, base ratios, coverages and total coverage in the region
  #* min_cov minimum coverage to make a call (default 0); a position is called
  #  only when its coverage is greater than this value
  #* min_per minimum representation to make a call (default 0.20). If more than one base
  #  can be called, the IUAPC ambiguity code is returned
  #Each pile is expected to respond to pos, coverage, base_ratios, bases and
  #consensus_iuap. Returns self.
  def calculate_stats_from_pile(opts={})
    min_cov = opts[:min_cov] || 0
    min_per = opts[:min_per] || 0.20
    self.called = 0
    #work on a lower-case copy; called positions are overwritten in upper case
    reference = self.reference.downcase

    @all_ratios = nil #cached per-base ratios belong to the previous statistics
    self.base_ratios = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.bases = Array.new(size) { BASE_COUNT_ZERO.dup }
    self.coverages = Array.new(size, 0)
    self.total_cov = 0

    pileup.each do |pile|
      if pile.coverage > min_cov
        offset = pile.pos - start
        base_ratios[offset] = pile.base_ratios
        reference[offset] = pile.consensus_iuap(min_per).upcase
        coverages[offset] = pile.coverage.to_i
        bases[offset] = pile.bases
        self.called += 1
      end
      self.total_cov += pile.coverage
    end

    self.consensus = Bio::Sequence.new(reference)
    consensus.na
    consensus.reverse_complement!() if orientation == :reverse
    self.average_coverage = total_cov.to_f / size.to_f
    self
  end

  #Formats the region as "name:start-end"
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  #Returns a region object from a string in form "name:start-end".
  #Single quotes are removed first. The orientation is :reverse when end is
  #smaller than start and :forward otherwise.
  #Raises FastaDBException when the string does not have exactly one ':' part
  #followed by exactly two '-' separated fields.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    name, range, *extra = string.split(":")
    limits = range.to_s.split("-")
    #validate before using the fields, so malformed input gives a clear error
    if range.nil? || !extra.empty? || limits.length != 2
      raise FastaDBException.new(), "Invalid region. #{string}"
    end

    reg = Region.new
    reg.entry = name
    reg.start = limits[0].to_i
    reg.end = limits[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  #Length of the region, computed as end - start.
  #NOTE(review): with inclusive coordinates this is one less than the number of
  #bases, and it is negative for reverse regions - confirm the intended meaning.
  def size
    @end - @start
  end
  alias_method :length, :size

end
156
+
157
#Error raised by the fasta classes in this file, e.g. for an invalid region
#string or a missing fasta path.
class FastaDBException < StandardError
end
158
+
159
#Class that holds the fasta file. It is used as a database: the index comes
#from the .fai file and sequences are extracted by running "samtools faidx".
class FastaFile
  attr_reader :index, :fasta_path

  #Initialize the fasta file. If the fai file doesn't exists, it is generated at startup
  #* fasta path to the fasta file
  #* samtools path to samtools, if it is not provided, use the bundled version
  #Raises FastaDBException when no fasta path is given.
  def initialize(args)
    @fasta_path = args[:fasta]
    @samtools = args[:samtools] || File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
    raise FastaDBException.new(), "No path for the refernce fasta file. " if @fasta_path.nil?
    @fai_file = @fasta_path + ".fai"
    run_faidx unless File.file?(@fai_file)
  end

  #Loads the fai entries (first column is the id, second the length) into
  #index and returns how many there are. The file is read only once; later
  #calls return the size of the index that is already loaded.
  def load_fai_entries()
    return @index.length if @index
    loaded = Index.new
    #File.foreach closes the file for us, also when a line fails to parse
    File.foreach(@fai_file) do |line|
      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1])
    end
    #only publish the index once it has been read completely
    @index = loaded
    @index.length
  end

  #Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence.
  #If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk and
  #the result of the system call is returned. If a region is specified, the subsequence is
  #returned as by fetch_sequence.
  #Options - if a subsequence is required
  #* chr - [STRING] the reference name of the subsequence
  #* start - [INT] the start position for the subsequence
  #* stop - [INT] the stop position for the subsequence
  def faidx(opts={})
    if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
      region = Region.new
      region.entry = opts[:chr]
      region.start = opts[:start].to_i
      region.end = opts[:stop].to_i
      #same rule as Region.parse_region
      region.orientation = region.end < region.start ? :reverse : :forward
      fetch_sequence(region)
    else
      run_faidx
    end
  end

  #Extracts the sequence of a region as a Bio::Sequence::NA.
  #The region needs to have a method to_region or a method to_s that has the
  #format "chromosome:start-end" as in samtools. When the region reports a
  #:reverse orientation the sequence is reverse complemented.
  def fetch_sequence(region)
    target = region.respond_to?(:to_region) ? region.to_region : region
    query = target.to_s
    command = "#{@samtools} faidx '#{@fasta_path}' '#{query}'"
    puts command if $VERBOSE
    @last_command = command
    seq = ""
    #skip the fasta header line and glue the sequence lines together
    yield_from_pipe(command, String, :text ) {|line| seq << line unless line =~ /^>/}

    reference = Bio::Sequence::NA.new(seq)
    #plain strings carry no orientation and are treated as forward
    if target.respond_to?(:orientation) && target.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private

  #Runs "samtools faidx" on the fasta file to (re)create the .fai index and
  #returns the result of Kernel#system.
  def run_faidx
    command = "#{@samtools} faidx '#{@fasta_path}'"
    @last_command = command
    system(command)
  end

  #Runs command and yields its standard output to the block.
  #* type :text yields klass.new(line) for every chomped line, skipping lines
  #  that start with comment_char when skip_comments is set
  #* type :binary yields the whole output at once
  #Returns Process::Status with the execution status. If run in a $VERBOSE environment, stderr of the process
  #is forwarded to the default stdout
  def yield_from_pipe(command, klass, type=:text, skip_comments=true, comment_char="#", &block)
    #the block form closes all three pipes, also when the caller's block raises
    Open3.popen3(command) do |stdin, pipe, stderr, wait_thr|
      stdin.close
      #drain stderr while stdout is being read so a chatty process cannot block
      stderr_reader = Thread.new { stderr.read }
      if type == :text
        while (line = pipe.gets)
          next if skip_comments and line[0] == comment_char
          yield klass.new(line.chomp)
        end
      elsif type == :binary
        while (c = pipe.gets(nil))
          yield c
        end
      end
      exit_status = wait_thr.value # Process::Status object returned.
      messages = stderr_reader.value
      puts messages if $VERBOSE
      exit_status
    end
  end
end
255
+ end
@@ -26,185 +26,234 @@
26
26
  #
27
27
  module Bio
28
28
  class DB
29
- class Pileup
30
- attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
31
-
32
- #creates the Pileup object
33
- # pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
34
- # pile = Bio::DB::Pileup.new(pile_up_line)
35
- def initialize(pile_up_line)
36
- cols = pile_up_line.split(/\t/)
37
- if cols.length == 6 ##should only be able to get 6 lines from mpileup
38
- @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
39
- elsif (10..13).include?(cols.length) ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
40
- if cols[2] == '*' #indel
41
- @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
42
- else #snp / identity
43
- @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
29
+ class Pileup
30
+ attr_accessor :ref_name, :pos, :ref_base, :coverage, :read_bases, :read_quals, :consensus, :consensus_quality, :snp_quality, :rms_mapq, :ar1, :ar2, :ar3, :indel_1, :indel_2
31
+
32
#creates the Pileup object from one tab separated pileup line
# pile_up_line = "seq2\t151\tG\tG\t36\t0\t99\t12\t...........A\t:9<;;7=<<<<<"
# pile = Bio::DB::Pileup.new(pile_up_line)
#A line of any other column count is accepted silently and leaves the
#column attributes unset (pos becomes 0 and coverage 0.0).
def initialize(pile_up_line)
  cols = pile_up_line.split(/\t/)
  case cols.length
  when 6 ##should only be able to get 6 lines from mpileup
    @ref_name, @pos, @ref_base, @coverage, @read_bases, @read_quals = cols
  when 10..13 ##incase anyone tries to use deprecated pileup with -c flag we get upto 13 cols...
    if cols[2] == '*' #indel
      @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @indel_1, @indel_2, @ar1, @ar2, @ar3 = cols
    else #snp / identity
      @ref_name, @pos, @ref_base, @consensus, @consensus_quality, @snp_quality, @rms_mapq, @coverage, @read_bases, @read_quals = cols
    end
    @consensus_quality = @consensus_quality.to_f
    @snp_quality = @snp_quality.to_f
    @rms_mapq = @rms_mapq.to_f
  end

  @pos = @pos.to_i
  @coverage = @coverage.to_f
  #caches filled on demand by ref_count, non_refs and non_ref_count
  @ref_count = nil
  @non_ref_count_hash = nil
  @non_ref_count = nil
end
45
- @consensus_quality = @consensus_quality.to_f
46
- @snp_quality = @snp_quality.to_f
47
- @rms_mapq = @rms_mapq.to_f
48
- else
49
- #raise RuntimeError, "parsing line '#{pile_up_line.chomp}' failed"
50
- end
51
58
 
52
- @pos = @pos.to_i
53
- @coverage = @coverage.to_f
54
- @ref_count = nil
55
- @non_ref_count_hash = nil
56
- @non_ref_count = nil
57
- end
58
-
59
- # Calculate the total count of each non-reference nucleotide and return a hash of all 4 nt counts, returns a hash
60
- # pile.non_refs #{:A => 1, :C => 0, :T => 0, :G => 0}
61
- def non_refs
62
- if @non_ref_count_hash.nil?
63
- @non_ref_count_hash = {:A => self.read_bases.count("Aa"), :C => self.read_bases.count("Cc"), :G => self.read_bases.count("Gg"), :T => self.read_bases.count("Tt")}
64
- end
65
- @non_ref_count_hash
66
- end
67
-
68
- # returns the total non-reference bases in the reads at this position
69
- def non_ref_count
70
- if @non_ref_count.nil?
71
- @non_ref_count = @read_bases.count("ATGCatgc").to_f
72
- end
73
- @non_ref_count
74
- end
75
-
76
- # returns the count of reference-bases in the reads at this position
77
- def ref_count
78
- if @ref_count.nil?
79
- @ref_count = self.read_bases.count(".,")
80
- end
81
- @ref_count
82
- end
83
-
84
- # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
85
- def consensus
86
- if @consensus.nil?
87
- max = self.non_refs.values.max
88
- if (self.ref_count / self.coverage) > 0.5
89
- @consensus = self.ref_base
90
- elsif self.ref_count > max
91
- @consensus = self.ref_base
59
#Counts, case-insensitively, how often each of A, C, G and T occurs in
#read_bases and returns the four counts as a hash. The hash is computed once
#and the same object is returned on later calls.
# pile.non_refs #{:A => 1, :C => 0, :G => 0, :T => 0}
def non_refs
  @non_ref_count_hash ||= [[:A, "Aa"], [:C, "Cc"], [:G, "Gg"], [:T, "Tt"]].each_with_object({}) do |(base, letters), counts|
    counts[base] = self.read_bases.count(letters)
  end
end
67
+
68
# returns, as a float, how many A, C, G or T characters (either case) the
# read bases hold at this position; the value is computed once
def non_ref_count
  @non_ref_count ||= @read_bases.count("ATGCatgc").to_f
end
75
+
76
# returns how many '.' and ',' characters (reads matching the reference)
# read_bases holds at this position; the value is computed once
def ref_count
  @ref_count ||= self.read_bases.count(".,")
end
83
+
84
# returns the consensus (most frequent) base from the pileup. A consensus
# read from the pileup line is returned as is. Otherwise the reference base
# wins when more than half of the coverage matches the reference, or when
# the reference count beats every other base; if bases are equally
# represented, all of them are returned as one alphabetically sorted string
def consensus
  return @consensus unless @consensus.nil?

  top = self.non_refs.values.max
  @consensus =
    if (self.ref_count / self.coverage) > 0.5 || self.ref_count > top
      self.ref_base
    else
      tied = self.non_refs.select { |_base, count| count == top }.map { |base, _count| base.to_s }
      tied << self.ref_base if self.ref_count == top
      tied.sort.join
    end
end
101
+
102
#returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele.
#For non-indel positions ALT is the consensus split into comma separated
#characters, or '.' when it equals the reference base.
def to_vcf
  alt, genotype = self.genotype_list
  alt = self.consensus.split(//).join(',') unless self.ref_base == '*'
  alt = '.' if alt == self.ref_base

  depth = self.coverage.to_i
  sample = "#{genotype}:#{self.consensus_quality.to_i}:#{depth}"
  columns = [self.ref_name, self.pos, '.', self.ref_base, alt, self.snp_quality.to_i, "0", "DP=#{depth}", "GT:GQ:DP", sample]
  columns.join("\t")
end
109
+
110
+ private
111
#Class method (self is Pileup here): returns the VCF v3.3 header that matches
#the lines produced by to_vcf, as one string whose lines are joined with "\n"
#(no trailing newline).
def self.vcf_header
  [
    '##fileformat=VCFv3.3',
    '##INFO=DP,1,Integer,"Total Depth"',
    '##FORMAT=GT,1,String,"Genotype"',
    '##FORMAT=GQ,1,Integer,"Genotype Quality"',
    '##FORMAT=DP,1,Integer,"Read Depth"',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA"
  ].join("\n")
end
120
+
121
+
122
+
123
#returns the genotype of the indel as [alt, genotype]:
#* "undef" when the consensus is an Array
#* nil when neither allele is an insertion or deletion
#* genotype '0/1' when only one allele is an indel, '1/1' when both alleles
#  are the same indel and '1/2' (alt "first,second") when they differ
#The alleles come from the consensus ("allele1/allele2"); when both are '*'
#the indel_1 and indel_2 columns are used instead.
def indel_gt
  return "undef" if self.consensus.instance_of?(Array)
  al1, al2 = self.consensus.split(/\//)
  if al1 == al2 && al1 == '*'
    al1 = self.indel_1
    al2 = self.indel_2
  end
  alt1 = parse_indel(al1)
  alt2 = parse_indel(al2)

  return nil if !alt1 and !alt2
  if !alt1
    [alt2, '0/1']
  elsif !alt2
    #heterozygous with the indel on the first allele
    [alt1, '0/1']
  elsif alt1 == alt2
    [alt1, '1/1']
  else
    ["#{alt1},#{alt2}", '1/2']
  end
end
99
- @consensus
100
- end
101
-
102
- #returns basic VCF string as per samtools/misc sam2vcf.pl except that it scrimps on the ref for indels, returning a '*' instead of the reference allele
103
- def to_vcf
152
#returns the genotype of the snp as [alt, genotype]:
#* ['.', '0/0'] when the consensus equals the reference base
#* [other base, '0/1'] when one of the two consensus bases is the reference
#* ["first,second", '1/2'] otherwise
#The consensus is expanded with Pileup.iupac_to_base.
def snp_gt
  return ['.','0/0'] if self.ref_base == self.consensus

  first, second = Pileup.iupac_to_base(self.consensus)
  return [second,'0/1'] if first == self.ref_base
  return [first,'0/1'] if second == self.ref_base
  ["#{first},#{second}",'1/2']
end
164
+
165
#picks the genotype call that fits the position: the indel genotype when the
#reference base is '*', the snp genotype otherwise
public
def genotype_list
  self.ref_base == '*' ? indel_gt : snp_gt
end
174
+
175
#Class method (self is Pileup here): returns the two bases for the two-base
#iupac codes K, M, S, R, W and Y; any other string is split into its characters
public
def self.iupac_to_base(alt_base)
  two_base_codes = {
    'K' => ['G','T'],
    'M' => ['A','C'],
    'S' => ['C','G'],
    'R' => ['A','G'],
    'W' => ['A','T'],
    'Y' => ['C','T']
  }
  two_base_codes.fetch(alt_base) { alt_base.split(//) }
end
188
+
189
#identifies if the indel is an insertion or a deletion:
#* "-XYZ" gives "D" followed by the number of deleted characters
#* "+XYZ" gives "I" followed by the inserted characters
#* anything else (including '*') gives nil
def parse_indel(alt)
  if alt =~ /^-/
    "D#{Regexp.last_match.post_match.length}"
  elsif alt =~ /^\+/
    "I#{Regexp.last_match.post_match}"
  end
end
198
+
199
+
200
#returns pileup format line, tab separated, in the layout the object holds:
#6 columns when there are read qualities but no consensus quality, 13 columns
#when indel_1 is set and 10 columns otherwise
def to_s
  columns =
    if @read_quals and !@consensus_quality #6col
      [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals]
    else
      called = [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i]
      if @indel_1 #13 cols
        called + [@indel_1, @indel_2, @ar1, @ar2, @ar3]
      else #10 cols
        called + [@read_bases, @read_quals]
      end
    end
  columns.join("\t")
end
211
+
212
+
213
#returns the count of every base at this position: the A, C, G and T counts
#of non_refs, with the count of reads matching the reference stored under the
#(upper-cased) reference base. The hash is built once.
def bases
  return @bases if @bases
  #work on a copy so the hash memoized by non_refs keeps the pure non-reference counts
  @bases = self.non_refs.dup
  @bases[self.ref_base.upcase.to_sym] = self.ref_count
  @bases
end
104
220
 
105
- alt,g = self.genotype_list
106
- alt = self.consensus.split(//).join(',') unless self.ref_base == '*'
107
- alt = '.' if alt == self.ref_base
108
- [self.ref_name, self.pos, '.', self.ref_base, alt, self.snp_quality.to_i, "0", "DP=#{self.coverage.to_i}", "GT:GQ:DP", "#{g}:#{self.consensus_quality.to_i}:#{self.coverage.to_i}" ].join("\t")
109
- end
110
-
111
- private
112
- def Pileup.vcf_header
113
- %{##fileformat=VCFv3.3
114
- ##INFO=DP,1,Integer,"Total Depth"
115
- ##FORMAT=GT,1,String,"Genotype"
116
- ##FORMAT=GQ,1,Integer,"Genotype Quality"
117
- ##FORMAT=DP,1,Integer,"Read Depth"
118
- #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDATA
119
- }.join("\n")
120
- end
121
-
122
- def parse_indel(alt)
221
#returns the sum of all the counts in bases. The counts are obtained through
#the bases method, so this also works when bases has not been called before.
def base_coverage
  self.bases.values.inject(0) { |total, count| total + count }
end
123
228
 
124
- return "D#{$'.length}" if alt =~/^-/
125
- if alt=~/^\+/
126
- return "I#{$'}"
127
- elsif alt == '*'
128
- return nil
129
- end
130
- end
131
-
132
- def indel_gt
133
- return "undef" if self.consensus.instance_of?(Array)
134
- al1, al2 = self.consensus.split(/\//)
135
- if al1 == al2 && al1 == '*'
136
- al1=self.indel_1
137
- al2=self.indel_2
138
- end
139
- alt1 = parse_indel(al1)
140
- alt2 = parse_indel(al2)
141
- alt,gt = nil,nil
142
-
143
- return nil if !alt1 and !alt2
144
- if !alt1
145
- alt = alt2
146
- gt = '0/1'
147
- elsif !alt2
148
- alt = alt1
149
- gt - '0/1'
150
- elsif alt1 == alt2
151
- alt = alt1
152
- gt = '1/1'
153
- else
154
- alt="#{alt1},#{alt2}"
155
- gt= '1/2'
156
- end
157
- return [alt, gt]
158
-
159
- end
160
-
161
- def snp_gt
162
- return ['.','0/0'] if self.ref_base == self.consensus
163
- bases = Pileup.iupac_to_base(self.consensus)
164
- if bases[0] == self.ref_base
165
- return [bases[1],'0/1']
166
- elsif bases[1] == self.ref_base
167
- return [bases[0],'0/1']
168
- else
169
- return ["#{bases[0]},#{bases[1]}",'1/2']
170
- end
171
- end
172
-
173
- public
174
- def genotype_list
175
- if self.ref_base == '*'
176
- return indel_gt
177
- else
178
- return snp_gt
179
- end
180
- end
181
-
182
- public
183
- #returns
184
- def Pileup.iupac_to_base(alt_base)
185
- case alt_base
186
- when 'K' then ['G','T']
187
- when 'M' then ['A','C']
188
- when 'S' then ['C','G']
189
- when 'R' then ['A','G']
190
- when 'W' then ['A','T']
191
- when 'Y' then ['C','T']
192
- else alt_base.split(//)
193
- end
194
- end
195
-
196
- #returns pileup format line
197
- def to_s
198
- if @read_quals and !@consensus_quality #6col
199
- [@ref_name, @pos, @ref_base, @coverage.to_i, @read_bases, @read_quals].join("\t")
200
- elsif @indel_1 #13 cols
201
- [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @indel_1, @indel_2, @ar1, @ar2, @ar3].join("\t")
202
- else #10 cols
203
- [@ref_name, @pos, @ref_base, @consensus, @consensus_quality.to_i, @snp_quality.to_i, @rms_mapq.to_i, @coverage.to_i, @read_bases, @read_quals].join("\t")
229
#returns, for every base in bases, its count divided by base_coverage as a
#float. The hash is built once and reused on later calls.
def base_ratios
  return @base_ratios if @base_ratios

  @base_ratios = {}
  self.bases.each_pair do |base, count|
    @base_ratios[base] = count.to_f / self.base_coverage.to_f
  end
  @base_ratios
end
238
+
239
# returns the consensus at this position as an ambiguity code: every base of
# bases whose count divided by the coverage is greater than
# minumum_ratio_for_iup_consensus is collected and the set is translated with
# Bio::NucleicAcid.to_IUAPC. When no base reaches the ratio the lower-cased
# reference base is returned. Results are cached per ratio, so asking again
# with a different ratio gives the answer for that ratio.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= {}
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] ||= begin
    called = String.new
    self.bases.each do |base, count|
      called << base[0].to_s if count / self.coverage.to_f > minumum_ratio_for_iup_consensus
    end
    called.length > 0 ? Bio::NucleicAcid.to_IUAPC(called) : self.ref_base.downcase
  end
end
255
+
256
+
204
257
  end
205
-
206
258
  end
207
-
208
- end
209
- end
210
259
  end