bio-polyploid-tools 0.1.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -3
- data/Gemfile.lock +8 -8
- data/README.md +45 -0
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -7
- data/bin/count_variations.rb +1 -1
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +1 -1
- data/bin/polymarker.rb +2 -2
- data/bin/snps_between_bams.rb +37 -7
- data/bio-polyploid-tools.gemspec +17 -13
- data/lib/bio/BFRTools.rb +27 -261
- data/lib/bio/BIOExtensions.rb +0 -124
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +1 -1
- data/lib/bio/PolyploidTools/ExonContainer.rb +6 -5
- data/lib/bio/PolyploidTools/Marker.rb +2 -2
- data/lib/bio/PolyploidTools/SNP.rb +5 -4
- data/lib/bio/db/exonerate.rb +1 -1
- data/test/test_bfr.rb +101 -9
- metadata +28 -12
- data/lib/bio/SAMToolsExtensions.rb +0 -284
- data/lib/bio/db/fastadb.rb +0 -164
@@ -1,284 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'pathname'
|
3
|
-
#require_relative 'db/fasta.rb'
|
4
|
-
require 'bio'
|
5
|
-
|
6
|
-
require_relative 'db/fastadb.rb'
|
7
|
-
|
8
|
-
#require "set"
|
9
|
-
#require 'systemu'
|
10
|
-
#require 'json'
|
11
|
-
|
12
|
-
=begin
|
13
|
-
|
14
|
-
Extends the methods to be able to calculate the BFR and a consensus from the pileup
|
15
|
-
|
16
|
-
=end
|
17
|
-
|
18
|
-
# Extends Bio::DB::Pileup with per-base counts, per-base ratios and an IUPAC
# consensus for a single pileup column.
#
# NOTE(review): relies on +non_refs+, +ref_base+, +ref_count+ and +coverage+
# being provided by the Pileup class defined elsewhere (presumably
# bio-samtools) -- confirm against the library version in use.
class Bio::DB::Pileup

  # Returns a hash with the count of each base in this column.
  #
  # The hash starts as the value of +non_refs+ and the reference count is
  # stored under the upper-cased reference base symbol. The result is
  # memoized in @bases.
  def bases
    return @bases if @bases
    @bases = non_refs
    @bases[ref_base.upcase.to_sym] = ref_count
    @bases
  end

  # Returns the sum of all the counts returned by #bases.
  #
  # Goes through #bases instead of reading @bases directly, so it also works
  # when it is the first method called on the object (@bases would be nil).
  def base_coverage
    bases.values.inject(0) { |total, count| total + count }
  end

  # Returns a hash mapping each base to its fraction of #base_coverage.
  # The result is memoized in @base_ratios.
  def base_ratios
    return @base_ratios if @base_ratios
    # The total is loop invariant, so it is computed only once.
    total = base_coverage.to_f
    @base_ratios = Hash.new
    bases.each do |base, count|
      @base_ratios[base] = count.to_f / total
    end
    @base_ratios
  end

  # Returns the consensus for this column as an IUPAC code.
  #
  # Every base whose share of +coverage+ is above
  # +minumum_ratio_for_iup_consensus+ is collected and the set is translated
  # with Bio::NucleicAcid.to_IUAPC. When no base passes the threshold the
  # lower-cased reference base is returned.
  #
  # The result is memoized on the first call: later calls return the cached
  # value even if a different threshold is given.
  def consensus_iuap(minumum_ratio_for_iup_consensus)
    return @consensus_iuap unless @consensus_iuap.nil?

    @consensus_iuap = ref_base.downcase
    selected = String.new
    bases.each do |base, count|
      # to_f guards against integer division truncating the ratio to 0 when
      # both the count and the coverage are integers.
      selected << base[0].to_s if count.to_f / coverage > minumum_ratio_for_iup_consensus
    end
    @consensus_iuap = Bio::NucleicAcid.to_IUAPC(selected) if selected.length > 0
    @consensus_iuap
  end
end
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
# Helpers to translate between sets of bases and IUPAC ambiguity codes.
#
# NOTE(review): IUPAC_CODES is not defined in this file. to_IUAPC looks it up
# by a sorted string of bases while is_valid looks it up by code -- confirm
# the table defined elsewhere supports both kinds of keys.
class Bio::NucleicAcid

  # Translates a collection of bases into a single upper-cased code.
  #
  # +bases+ is converted with to_s, lower-cased, sorted and de-duplicated
  # before it is looked up in IUPAC_CODES. When there is no entry for the
  # resulting key, a diagnostic is printed and 'N' is returned.
  def self.to_IUAPC(bases)
    key = bases.to_s.downcase.chars.sort.uniq.join
    base = IUPAC_CODES[key]
    if base.nil?
      # Report the offending input; the looked-up value is always nil here.
      p "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  # Returns true when +base+ is one of the characters stored in IUPAC_CODES
  # for +code+ (both compared in lower case). Returns false when +code+ has
  # no entry in the table instead of failing on nil.
  def self.is_valid(code, base)
    expansion = IUPAC_CODES[code.downcase]
    return false if expansion.nil?
    expansion.chars.include? base.downcase
  end

end
|
90
|
-
|
91
|
-
|
92
|
-
#class Bio::DB::Sam::SAMException < RuntimeError
|
93
|
-
|
94
|
-
#end
|
95
|
-
|
96
|
-
# Extends Bio::DB::Sam with a per-region pileup cache and with statistics
# (coverage, base counts, base ratios and consensus) computed from it.
#
# NOTE(review): relies on +mpileup+, +fetch_reference+ and
# +fetch_with_function+ from the Sam class defined elsewhere, and on the
# constants SAMException and BASE_COUNT_ZERO, none of which are defined in
# this file -- confirm they are loaded before these methods are used.
class Bio::DB::Sam

  attr_writer :minumum_ratio_for_iup_consensus
  attr_reader :cached_regions

  # Minimum ratio a base needs to be included in the consensus.
  #
  # Defaults to 0.20. The default is resolved per instance: assigning the
  # instance variable in the class body only sets it on the class object, so
  # the reader of a new instance would otherwise return nil.
  def minumum_ratio_for_iup_consensus
    @minumum_ratio_for_iup_consensus ||= 0.20
  end

  # Same as mpileup, but the pileup of each region is cached, so several
  # operations on the same region do not execute the mpileup command again.
  # Call mpileup_clear_cache to free the cache when a region is not needed.
  #
  # opts[:r] or opts[:region] is required, as its string form is the key of
  # the underlying hash. Both options are overwritten with that string and
  # opts[:A] is forced to true. The options are assumed to be constant for a
  # region; if they are not, the cache may be inconsistent.
  #
  # Yields each pileup line, either while it is being loaded or from the
  # cache.
  #
  # TODO: It may be good to load partially the pileup
  def mpileup_cached(opts={})
    raise SAMException, "A region must be provided" unless opts[:r] or opts[:region]
    @pileup_cache = Hash.new unless @pileup_cache
    @cached_regions = Hash.new unless @cached_regions

    region = opts[:r] ? opts[:r] : opts[:region]
    key = region.to_s
    opts[:r] = key.dup
    opts[:region] = key.dup
    opts[:A] = true

    cached = @cached_regions[key]
    if cached
      # The pileup array lives on the cached Region, not on the hash.
      cached.pileup.each do |pile|
        yield pile
      end
    else
      piles = Array.new
      cached = Bio::DB::Fasta::Region.parse_region(key)
      cached.pileup = piles
      @cached_regions[key] = cached
      mpileup(opts) do |pile|
        piles << pile
        yield pile
      end
    end
  end

  # Clears the pileup cache. If a region is passed as argument, just the
  # specified region is removed. If no region is passed (or it is nil), the
  # hash is emptied.
  def mpileup_clear_cache(region = nil)
    return unless @cached_regions
    if region
      @cached_regions.delete(region.to_s)
    else
      @cached_regions.clear
    end
  end

  # Gets the average coverage of opts[:region] from the pileup.
  def average_coverage_from_pileup(opts={})
    cached_region_stats(opts).average_coverage
  end

  # Gets the array with the coverage of each position of opts[:region].
  def coverages_from_pileup(opts={})
    cached_region_stats(opts).coverages
  end

  # Gets the consensus sequence of opts[:region].
  def consensus_with_ambiguities(opts={})
    cached_region_stats(opts).consensus
  end

  # Runs the (cached) pileup of opts[:region] and stores the statistics on
  # the cached Region: coverages, base_ratios, bases, consensus and
  # average_coverage. Returns that Region.
  #
  # Only positions with coverage above opts[:min_cov] (default 20) update
  # the per-position arrays and the consensus; every position contributes
  # to the average coverage. opts[:region] is replaced by a parsed Region
  # when it is not one already.
  def calculate_stats_from_pile(opts={})
    min_cov = opts[:min_cov] || 20

    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s) unless opts[:region].class == Bio::DB::Fasta::Region
    region = opts[:region]

    reference = fetch_reference(region.entry, region.start, region.end).downcase
    # BASE_COUNT_ZERO is expected to look like {:A => 0, :C => 0, :G => 0, :T => 0}
    base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
    bases = Array.new(region.size, BASE_COUNT_ZERO)
    coverages = Array.new(region.size, 0)
    total_cov = 0

    mpileup_cached(:region => "#{region.to_s}") do |pile|
      offset = pile.pos - region.start
      if pile.coverage > min_cov
        base_ratios[offset] = pile.base_ratios
        # NOTE(review): the reference is indexed at offset - 1 while the
        # arrays use offset; kept as in the original -- confirm the
        # coordinate convention of fetch_reference and pile.pos.
        reference[offset - 1] = pile.consensus_iuap(minumum_ratio_for_iup_consensus).upcase
        coverages[offset] = pile.coverage.to_i
        bases[offset] = pile.bases
      end
      total_cov += pile.coverage
    end

    stats = @cached_regions[region.to_s]
    stats.coverages = coverages
    stats.base_ratios = base_ratios
    stats.consensus = Bio::Sequence.new(reference)
    stats.consensus.na
    stats.consensus.reverse_complement!() if stats.orientation == :reverse
    stats.average_coverage = total_cov.to_f / stats.size.to_f
    stats.bases = bases
    stats
  end

  # Gets an array with the proportions of the bases in opts[:region].
  def base_ratios_in_region(opts={})
    cached_region_stats(opts).base_ratios
  end

  # Gets an array with the bases count in opts[:region].
  def bases_in_region(opts={})
    cached_region_stats(opts).bases
  end

  # Prints the reads aligned to opts[:region] to $stdout in FASTQ format.
  #
  # opts[:region] may be a Region or a string that parse_region accepts.
  # The coordinates given to fetch_with_function are taken from the parsed
  # region. opts[:fastq] and opts[:fastq_file] are currently ignored.
  def extract_reads(opts={})
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s) unless opts[:region].class == Bio::DB::Fasta::Region
    region = opts[:region]

    out = $stdout

    print_fastq = Proc.new do |alignment|
      out.puts "@#{alignment.qname}"
      out.puts "#{alignment.seq}"
      out.puts "+#{alignment.qname}"
      out.puts "#{alignment.qual}"
    end

    fetch_with_function(region.entry, region.start, region.end, print_fastq)
  end

  private

  # Returns the cached Region for opts[:region], calculating its statistics
  # first when the region is not in the cache. A Region given in
  # opts[:region] is replaced by its string form, which is the cache key.
  def cached_region_stats(opts)
    opts[:region] = opts[:region].to_s if opts[:region].class == Bio::DB::Fasta::Region
    region = opts[:region]
    calculate_stats_from_pile(opts) if @cached_regions.nil? || @cached_regions[region].nil?
    @cached_regions[region]
  end

end
|
267
|
-
|
268
|
-
# Adds to Region the attributes used to store the pileup of the region and
# the statistics calculated from it.
class Bio::DB::Fasta::Region
  attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases

  # Returns an array with the ratio of +base+ at each position of the
  # region, read from @base_ratios. The array is memoized per base.
  #
  # TODO: Debug, as it hasnt been tested in the actual code.
  def base_ratios_for_base(base)
    @all_ratios ||= Hash.new
    # The size is the size of this region; there is no +region+ object in
    # scope inside Region itself.
    @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
  end

end
|
data/lib/bio/db/fastadb.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
#Module to hold the information about the fasta file
|
2
|
-
|
3
|
-
module Bio::DB::Fasta
|
4
|
-
# Ordered collection of the entries of a fasta index, with lookup by id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = Hash.new
  end

  # Appends +entry+ and registers it under entry.id.
  #
  # This doesnt validate if you are adding the same entry twice. A repeated
  # id replaces the previous entry in the lookup by id.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields each entry in insertion order. Without a block it returns an
  # enumerator. Enumerable is built on this method, so it has to iterate
  # (Array#entries ignores the block and never yields).
  def each(&block)
    @entries.each(&block)
  end

  # Number of entries in the index.
  def length
    @entries.length
  end

  # Returns a new Index just with the specified range, as if it was an Array.
  # The return object is of type Index. A single position gives an Index
  # with one entry and a selection outside the index gives an empty Index.
  def [](args)
    selected = @entries[args]
    selected = [] if selected.nil?
    selected = [selected] unless selected.is_a?(Array)

    subset = Index.new
    selected.each do |entry|
      subset << entry
    end
    subset
  end

  # Returns the entry registered with the id +entry+, or nil.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
|
42
|
-
|
43
|
-
# One sequence of the fasta index: its id and its length.
class Entry
  attr_reader :id, :length

  # +length+ is converted with to_i, so it can be given as a string.
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  # Returns a forward Region for this entry, from 0 to the entry length.
  def get_full_region
    Region.new.tap do |full|
      full.entry = id
      full.start = 0
      full.end = @length
      full.orientation = :forward
    end
  end

  # Same as get_full_region.
  def to_region
    get_full_region
  end
end
|
64
|
-
|
65
|
-
#Class to wrap a region of a chromosome
class Region
  attr_accessor :entry, :start, :end, :orientation

  # Formats the region as "entry:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Builds a Region from a string with the format "entry:start-end".
  # Single quotes are removed from the string before it is parsed.
  #
  # The orientation is :reverse when the end is lower than the start and
  # :forward otherwise.
  #
  # Raises FastaDBException when the string doesn't have exactly one ":"
  # separated pair, or the coordinates are not exactly one "-" separated pair.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    # Validate before the coordinates are split: without a ":" there is no
    # second field and the split would fail on nil.
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2
    fields_2 = fields_1[1].split("-")
    raise FastaDBException, "Invalid region. #{string}" if fields_2.length != 2

    reg = Region.new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Difference between the end and the start. It is negative when the end
  # is lower than the start.
  def size
    @end - @start
  end

end
|
98
|
-
|
99
|
-
# Error raised by Region.parse_region for an invalid region string and by
# FastaFile when the fasta path, its index or a sequence is not available.
class FastaDBException < StandardError; end
|
100
|
-
|
101
|
-
#Class that holds the fasta file. It is used as a database. It heavily relies on samtools.
#
# NOTE(review): uses Bio::DB::SAM::Tools (fai_load, fai_build, fai_fetch,
# fai_destroy) and FFI::MemoryPointer, which are not defined in this file.
class FastaFile

  attr_reader :index, :fasta_path

  # Finalizer registered for every instance. It currently does nothing.
  def self.finalize(id)
    #id.close()
    #puts "Finalizing #{id} at #{Time.new}"
  end

  # Loads the index of +fasta_filename+, building it first when it can't be
  # loaded.
  #
  # Raises FastaDBException when the path is nil or when the index is still
  # not available after building it.
  def initialize(fasta_filename)
    @fasta_path = fasta_filename
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
    if index_missing?
      $stderr.puts "Generating index for: " + @fasta_path
      Bio::DB::SAM::Tools.fai_build(@fasta_path)
      @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
      raise FastaDBException, "Unable to generate fasta index for: " + @fasta_path if index_missing?
    end
    ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
  end

  # Reads the ".fai" file next to the fasta file into @index, one Entry per
  # line, built from the first two tab separated fields. Returns the number
  # of entries. The file is only read on the first call.
  def load_fai_entries()
    return @index.length if @index
    @index = Index.new
    # File.foreach closes the file when the block finishes.
    File.foreach(@fasta_path + ".fai") do |line|
      fields = line.split("\t")
      @index << Entry.new(fields[0], fields[1])
    end
    @index.length
  end

  # Destroys the loaded index, if any, and forgets it.
  def close()
    Bio::DB::SAM::Tools.fai_destroy(@fasta_index) unless index_missing?
    @fasta_index = nil
  end

  #The region needs to have a method to_region or a method to_s that has the format "chromosome:start-end" as in samtools
  #
  # Returns the sequence as detected by Bio::Sequence.auto. The sequence is
  # reverse complemented when the region has a :reverse orientation; objects
  # without an orientation (for example a String) are left as fetched.
  #
  # Raises FastaDBException when the index is not loaded or when the
  # sequence can't be fetched.
  def fetch_sequence(region)
    raise FastaDBException, "No fasta index for #{@fasta_path}" if index_missing?
    query = region.respond_to?(:to_region) ? region.to_region.to_s : region.to_s

    len = FFI::MemoryPointer.new :int
    str = Bio::DB::SAM::Tools.fai_fetch(@fasta_index, query, len)
    raise FastaDBException, "Unable to get sequence for reference: " + query if str.nil?
    reference = Bio::Sequence.auto(str)

    if region.respond_to?(:orientation) && region.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private

  # True when there is no usable index: nothing was loaded or the loaded
  # object reports null?.
  def index_missing?
    @fasta_index.nil? || @fasta_index.null?
  end

end
|
164
|
-
end
|