bio-polyploid-tools 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -3
- data/Gemfile.lock +8 -8
- data/README.md +45 -0
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -7
- data/bin/count_variations.rb +1 -1
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +1 -1
- data/bin/polymarker.rb +2 -2
- data/bin/snps_between_bams.rb +37 -7
- data/bio-polyploid-tools.gemspec +17 -13
- data/lib/bio/BFRTools.rb +27 -261
- data/lib/bio/BIOExtensions.rb +0 -124
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +1 -1
- data/lib/bio/PolyploidTools/ExonContainer.rb +6 -5
- data/lib/bio/PolyploidTools/Marker.rb +2 -2
- data/lib/bio/PolyploidTools/SNP.rb +5 -4
- data/lib/bio/db/exonerate.rb +1 -1
- data/test/test_bfr.rb +101 -9
- metadata +28 -12
- data/lib/bio/SAMToolsExtensions.rb +0 -284
- data/lib/bio/db/fastadb.rb +0 -164
@@ -1,284 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'pathname'
|
3
|
-
#require_relative 'db/fasta.rb'
|
4
|
-
require 'bio'
|
5
|
-
|
6
|
-
require_relative 'db/fastadb.rb'
|
7
|
-
|
8
|
-
#require "set"
|
9
|
-
#require 'systemu'
|
10
|
-
#require 'json'
|
11
|
-
|
12
|
-
=begin
|
13
|
-
|
14
|
-
Extends the methods to be able to calculate the BFR and a consensus from the pileup
|
15
|
-
|
16
|
-
=end
|
17
|
-
|
18
|
-
class Bio::DB::Pileup
|
19
|
-
|
20
|
-
#attr_accessor :minumum_ratio_for_iup_consensus
|
21
|
-
#@minumum_ratio_for_iup_consensus = 0.20
|
22
|
-
|
23
|
-
#Returns a hash with the count of bases
|
24
|
-
|
25
|
-
# Returns the per-base counts at this pileup position, memoized in @bases.
# The hash is the one returned by +non_refs+, extended in place with an entry
# for the upper-cased reference base (as a Symbol) holding +ref_count+.
def bases
  @bases ||= begin
    counts = self.non_refs
    counts[self.ref_base.upcase.to_sym] = self.ref_count
    counts
  end
end
|
32
|
-
|
33
|
-
# Returns the sum of all base counts at this position.
# Goes through the +bases+ accessor rather than reading @bases directly, so it
# also works when +bases+ has not been called yet (@bases would still be nil).
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
|
40
|
-
|
41
|
-
# Returns a hash mapping each base to its fraction of +base_coverage+,
# memoized in @base_ratios.
def base_ratios
  return @base_ratios if @base_ratios
  # Fetch the counts first: this populates the memoized base hash before the
  # coverage is computed.
  counts = self.bases
  # The total is the same for every key, so compute it once.
  total = self.base_coverage.to_f
  @base_ratios = {}
  counts.each { |base, count| @base_ratios[base] = count.to_f / total }
  @base_ratios
end
|
50
|
-
|
51
|
-
# Returns the IUPAC ambiguity code for every base whose share of the coverage exceeds the given minimum ratio; if no base exceeds it, returns the reference base in lower case.
|
52
|
-
# Returns the consensus call for this pileup position.
#
# Every base whose count divided by +coverage+ is strictly greater than
# +minumum_ratio_for_iup_consensus+ is collected and translated with
# Bio::NucleicAcid.to_IUAPC. When no base passes the threshold, the reference
# base is returned in lower case.
#
# The division is done in floating point, so an Integer coverage no longer
# truncates every ratio to 0. Results are memoized per threshold, so calling
# again with a different ratio is not answered from a stale value.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= {}
  cached = @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus]
  return cached if cached

  total = self.coverage.to_f
  called = String.new
  self.bases.each do |base, count|
    called << base[0].to_s if count.to_f / total > minumum_ratio_for_iup_consensus
  end

  consensus = called.empty? ? self.ref_base.downcase : Bio::NucleicAcid.to_IUAPC(called)
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] = consensus
end
|
67
|
-
end
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
class Bio::NucleicAcid
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# Translates a set of bases (e.g. "AG") into its upper-cased code from
# IUPAC_CODES. The input is lower-cased, de-duplicated and sorted before the
# lookup, so "GA", "ag" and "AAG" all resolve to the same key.
#
# When the combination is not in IUPAC_CODES, a diagnostic naming the
# offending input is written to $stderr and "N" is returned
# (this fallback is a patch: one of the scripts failed here).
def self.to_IUAPC(bases)
  key = bases.to_s.downcase.chars.sort.uniq.join
  code = IUPAC_CODES[key]
  if code.nil?
    # Report the input; the failed lookup result itself is always nil.
    $stderr.puts "Invalid base! #{bases}"
    code = 'n'
  end
  code.upcase
end
|
84
|
-
|
85
|
-
# Tells whether +base+ is one of the characters that IUPAC_CODES lists for
# +code+. Both arguments are compared in lower case.
# Returns false (instead of raising on nil) when +code+ is not in IUPAC_CODES.
def self.is_valid(code, base)
  expansion = IUPAC_CODES[code.downcase]
  return false if expansion.nil?
  expansion.chars.include?(base.downcase)
end
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
|
92
|
-
#class Bio::DB::Sam::SAMException < RuntimeError
|
93
|
-
|
94
|
-
#end
|
95
|
-
|
96
|
-
class Bio::DB::Sam
|
97
|
-
|
98
|
-
|
99
|
-
attr_accessor :minumum_ratio_for_iup_consensus
|
100
|
-
attr_reader :cached_regions
|
101
|
-
#attr_accessor :pileup_cache
|
102
|
-
@minumum_ratio_for_iup_consensus = 0.20
|
103
|
-
|
104
|
-
|
105
|
-
#Same as mpileup, but it caches the pileup, so if you want to run several operations on the same set of regions
|
106
|
-
#the pileup is reused and the mpileup command won't be executed several times
|
107
|
-
#Whenever you finish using a region, call mpileup_clear_cache to free the cache
|
108
|
-
#The argument Region is required, as it will be the key for the underlying hash.
|
109
|
-
#We assume that the options are constant. If they are not, the cache mechanism may not be consistent.
|
110
|
-
#
|
111
|
-
#TODO: It may be good to load partially the pileup
|
112
|
-
# Runs +mpileup+ for a region and yields every pileup line to the block,
# keeping the lines so that later calls for the same region are replayed from
# memory instead of running mpileup again.
#
# opts - options forwarded to +mpileup+. A region must be given under :r or
#        :region; its +to_s+ is the cache key. :r, :region and :A => true are
#        set on a copy of the hash, so the caller's hash is left untouched.
#
# Raises SAMException when no region is supplied.
#
# The cache entry is a Bio::DB::Fasta::Region (stored in @cached_regions) whose
# +pileup+ attribute holds the collected lines. If the run does not finish
# (the block breaks out or an error is raised) the incomplete entry is dropped,
# so a truncated pileup is never replayed.
def mpileup_cached(opts={})
  raise SAMException, "A region must be provided" unless opts[:r] or opts[:region]
  @pileup_cache ||= {}
  @cached_regions ||= {}

  opts = opts.dup
  key = (opts[:r] || opts[:region]).to_s
  opts[:r] = key
  opts[:region] = key
  opts[:A] = true

  cached = @cached_regions[key]
  if cached
    # Cache hit: the lines live on the cached Region, not on the hash itself.
    cached.pileup.each { |pile| yield pile }
  else
    piles = []
    fresh = Bio::DB::Fasta::Region.parse_region(key)
    fresh.pileup = piles
    @cached_regions[key] = fresh
    finished = false
    begin
      mpileup(opts) do |pile|
        piles << pile
        yield pile
      end
      finished = true
    ensure
      @cached_regions.delete(key) unless finished
    end
  end
end
|
140
|
-
|
141
|
-
#Clears the pileup cache. If a region is passed as argument, just the specified region is removed
|
142
|
-
#If no region is passed, the hash is emptied
|
143
|
-
# Clears the pileup cache.
#
# region - optional; when given, only the entry keyed by +region.to_s+ is
#          removed. When omitted (or nil) the whole cache is emptied.
#
# Does nothing if nothing has been cached yet.
def mpileup_clear_cache(region = nil)
  return unless @cached_regions
  if region
    # Remove the key altogether so the entry can be garbage collected.
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
|
151
|
-
|
152
|
-
#Gets the coverage of a region from a pileup.
|
153
|
-
# Returns the +average_coverage+ stored on the cached region for opts[:region].
# A Bio::DB::Fasta::Region passed as :region is replaced by its string form
# (in opts itself); the statistics are computed through
# +calculate_stats_from_pile+ only when the region is not cached yet.
def average_coverage_from_pileup(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.class == Bio::DB::Fasta::Region
  key = opts[:region]
  uncached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if uncached
  @cached_regions[key].average_coverage
end
|
159
|
-
|
160
|
-
#
|
161
|
-
# Returns the +coverages+ stored on the cached region for opts[:region].
# A Bio::DB::Fasta::Region passed as :region is replaced by its string form
# (in opts itself); the statistics are computed through
# +calculate_stats_from_pile+ only when the region is not cached yet.
def coverages_from_pileup(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.class == Bio::DB::Fasta::Region
  key = opts[:region]
  uncached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if uncached
  @cached_regions[key].coverages
end
|
167
|
-
|
168
|
-
# Returns the +consensus+ stored on the cached region for opts[:region].
# A Bio::DB::Fasta::Region passed as :region is replaced by its string form
# (in opts itself); the statistics are computed through
# +calculate_stats_from_pile+ only when the region is not cached yet.
def consensus_with_ambiguities(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.class == Bio::DB::Fasta::Region
  key = opts[:region]
  uncached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if uncached
  @cached_regions[key].consensus
end
|
175
|
-
|
176
|
-
# Walks the (cached) pileup of a region and stores per-position statistics on
# the cached Bio::DB::Fasta::Region: coverages, base ratios, base counts, a
# consensus sequence and the average coverage.
#
# opts - :region  region to analyse; anything that is not already a
#                 Bio::DB::Fasta::Region is parsed from its +to_s+.
#                 opts[:region] is overwritten with the Region object.
#        :min_cov coverage a position must exceed to be recorded (default 20).
#        :case    read into +mark_case+, which is not used afterwards.
#
# Returns the cached region, now carrying the computed attributes.
def calculate_stats_from_pile(opts={})
  min_cov = opts[:min_cov] ? opts[:min_cov] : 20

  opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
  region = opts[:region]

  mark_case = true if opts[:case]
  # puts "Marcase: #{mark_case}"
  # The reference is lower-cased; positions with enough coverage are later
  # overwritten with an upper-cased consensus call.
  reference = self.fetch_reference(region.entry, region.start, region.end).downcase
  # p "calculationg from pile..." << region.to_s
  # NOTE(review): BASE_COUNT_ZERO is only present as a commented-out line in
  # this file - presumably it is defined elsewhere; confirm.
  # Array.new(size, obj) puts the very same object in every slot.
  base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
  bases = Array.new(region.size, BASE_COUNT_ZERO)
  coverages = Array.new(region.size, 0)
  total_cov = 0

  self.mpileup_cached(:region=>"#{region.to_s}") do | pile |
    #puts pile
    #puts pile.coverage
    # Previous reference character at this position (only used by the
    # commented-out debug output below).
    bef=reference[pile.pos - region.start - 1 ]
    if pile.coverage > min_cov
      # NOTE(review): the reference string is indexed with an extra "- 1"
      # while the three arrays are not; for pile.pos == region.start the
      # reference index is -1 (the last character). Confirm which offset is
      # intended.
      base_ratios[pile.pos - region.start ] = pile.base_ratios
      reference[pile.pos - region.start - 1 ] = pile.consensus_iuap(0.20).upcase
      coverages[pile.pos - region.start ] = pile.coverage.to_i
      bases[pile.pos - region.start ] = pile.bases
    end
    #puts "#{pile.pos}\t#{bef}\t#{reference[pile.pos - region.start - 1 ]} "
    # Every pileup line counts towards the average, including those at or
    # below min_cov.
    total_cov += pile.coverage
  end

  #puts ">Ref\n#{reference}"
  #puts ">Original\n#{r}"
  # From here on +region+ is the object that mpileup_cached stored in the cache.
  region = @cached_regions[region.to_s]
  region.coverages = coverages
  region.base_ratios = base_ratios
  region.consensus = Bio::Sequence.new(reference)
  region.consensus.na
  if region.orientation == :reverse
    region.consensus.reverse_complement!()
  end
  region.average_coverage = total_cov.to_f/region.size.to_f
  region.bases = bases
  region
end
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
#BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
|
228
|
-
|
229
|
-
#Gets an array with the proportions of the bases in the region. Positions whose coverage does not exceed the minimum keep BASE_COUNT_ZERO.
|
230
|
-
# Returns the +base_ratios+ stored on the cached region for opts[:region].
# A Bio::DB::Fasta::Region passed as :region is replaced by its string form
# (in opts itself); the statistics are computed through
# +calculate_stats_from_pile+ only when the region is not cached yet.
def base_ratios_in_region(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.class == Bio::DB::Fasta::Region
  key = opts[:region]
  uncached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if uncached
  @cached_regions[key].base_ratios
end
|
236
|
-
|
237
|
-
#Gets an array with the base counts in the region. Positions whose coverage does not exceed the minimum keep BASE_COUNT_ZERO.
|
238
|
-
# Returns the +bases+ stored on the cached region for opts[:region].
# A Bio::DB::Fasta::Region passed as :region is replaced by its string form
# (in opts itself); the statistics are computed through
# +calculate_stats_from_pile+ only when the region is not cached yet.
def bases_in_region(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.class == Bio::DB::Fasta::Region
  key = opts[:region]
  uncached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if uncached
  @cached_regions[key].bases
end
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
# Prints every alignment found in a region to $stdout as a FASTQ record
# (@name, sequence, +name, qualities).
#
# opts - :region  region to extract; anything that is not already a
#                 Bio::DB::Fasta::Region is parsed from its +to_s+.
#                 opts[:region] is overwritten with the Region object.
#
# The alignments are obtained with +fetch_with_function+ over the entry, start
# and end of that region (the previous code passed undefined variables).
# NOTE(review): the :fastq and :fastq_file options were read but never used;
# output still always goes to $stdout.
def extract_reads(opts={})
  opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
  region = opts[:region]

  out = $stdout

  print_fastq = Proc.new do |alignment|
    out.puts "@#{alignment.qname}"
    out.puts "#{alignment.seq}"
    out.puts "+#{alignment.qname}"
    out.puts "#{alignment.qual}"
  end

  fetch_with_function(region.entry, region.start, region.end, print_fastq)
end
|
265
|
-
|
266
|
-
end
|
267
|
-
|
268
|
-
class Bio::DB::Fasta::Region
|
269
|
-
attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
|
270
|
-
|
271
|
-
#TODO: Debug, as it hasnt been tested in the actual code.
|
272
|
-
# Returns an array with the ratio of +base+ at each of the first +size+
# positions of @base_ratios. The result is memoized per base in @all_ratios.
#
# The positions are counted with this region's own +size+; the previous code
# referenced an undefined local named +region+.
def base_ratios_for_base(base)
  @all_ratios ||= {}
  @all_ratios[base] ||= (0...size).map { |position| @base_ratios[position][base] }
end
|
283
|
-
|
284
|
-
end
|
data/lib/bio/db/fastadb.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
#Module to hold the information about the fasta file
|
2
|
-
|
3
|
-
module Bio::DB::Fasta
|
4
|
-
# Ordered collection of fasta index entries that can also be looked up by id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends an entry and registers it under its +id+.
  # This doesn't validate whether the same entry is added twice; a later entry
  # with the same id replaces the earlier one in the lookup map only.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields each entry in insertion order (required by Enumerable).
  # Returns an Enumerator when no block is given.
  def each(&block)
    return enum_for(:each) unless block
    @entries.each(&block)
    self
  end

  # Number of entries in the index.
  def length
    @entries.length
  end

  # Returns a new Index holding the entries selected by +args+, which is
  # interpreted as it would be by Array#[] (an Integer or a Range).
  # A single position yields an Index with one entry; a position that does not
  # exist yields an empty Index.
  def [](args)
    selected = @entries[args]
    selected = [selected].compact unless selected.is_a?(Array)
    subset = Index.new
    selected.each { |entry| subset << entry }
    subset
  end

  # Returns the entry registered under the id +entry+, or nil if there is none.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
|
42
|
-
|
43
|
-
# One sequence of the fasta index: its identifier and its length.
class Entry
  attr_reader :id, :length

  # id     - identifier of the sequence.
  # length - length of the sequence; converted with +to_i+.
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  # Builds a forward Region named after this entry that starts at 0 and ends
  # at the entry length.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 0
      whole.end = @length
      whole.orientation = :forward
    end
  end

  # Same as get_full_region.
  def to_region
    get_full_region
  end
end
|
64
|
-
|
65
|
-
#Class to wrap a region of a chromosome
|
66
|
-
# Class to wrap a region of a chromosome.
class Region
  attr_accessor :entry, :start, :end, :orientation

  # Formats the region as "entry:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Parses a string of the form "entry:start-end" (single quotes are dropped)
  # into a Region. The orientation is :reverse when the end is smaller than
  # the start and :forward otherwise.
  #
  # Raises FastaDBException when the string does not have exactly one ":" part
  # split and one "-" split range. The shape is checked before the range is
  # read, so a string without ":" is reported as an invalid region instead of
  # failing on nil.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    fields_2 = fields_1.length == 2 ? fields_1[1].split("-") : []
    raise FastaDBException, "Invalid region. #{string}" if fields_1.length != 2 || fields_2.length != 2

    reg = new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Difference between end and start (negative for reverse regions).
  def size
    @end - @start
  end
end
|
98
|
-
|
99
|
-
class FastaDBException < StandardError; end
|
100
|
-
|
101
|
-
#Class that holds the fasta file. It is used as a database. It heavily relies on samtools.
|
102
|
-
# Wraps a fasta file and its samtools (.fai) index so sequences can be fetched
# by region.
class FastaFile

  attr_reader :index, :fasta_path

  # Finalizer hook registered in +initialize+; currently does nothing.
  def FastaFile.finalize(id)
    #id.close()
    #puts "Finalizing #{id} at #{Time.new}"
  end

  # Loads the fasta index for +fasta_filename+, building it first when it
  # cannot be loaded.
  #
  # Raises FastaDBException when no path is given or when the index can be
  # neither loaded nor generated.
  def initialize(fasta_filename)
    @fasta_path = fasta_filename
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
    if @fasta_index.null? then
      $stderr.puts "Generating index for: " + @fasta_path
      Bio::DB::SAM::Tools.fai_build(@fasta_path)
      @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
      raise FastaDBException, "Unable to generate fasta index for: " + @fasta_path if @fasta_index.nil? || @fasta_index.null?
    end
    ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
  end

  # Reads "<fasta_path>.fai" into +index+, one Entry per line built from the
  # first two tab-separated fields. Only reads the file the first time.
  # Returns the number of entries.
  def load_fai_entries()
    return @index.length if @index
    @index = Index.new
    fai_file = @fasta_path + ".fai"
    # File.foreach closes the file when done; File.open(...).each left the
    # handle open.
    File.foreach(fai_file) do |line|
      fields = line.split("\t")
      @index << Entry.new(fields[0], fields[1])
    end
    @index.length
  end

  # Releases the fasta index handle, if any.
  def close()
    Bio::DB::SAM::Tools.fai_destroy(@fasta_index) unless @fasta_index.nil? || @fasta_index.null?
    @fasta_index = nil
  end

  # Fetches the sequence of a region.
  #
  # region - an object with a +to_region+ method, or one whose +to_s+ has the
  #          format "chromosome:start-end" as in samtools.
  #
  # The sequence is reverse-complemented when the (resolved) region reports a
  # :reverse orientation. Objects without an +orientation+ method, such as a
  # plain String, are treated as forward.
  #
  # Raises FastaDBException when there is no index or nothing could be fetched.
  def fetch_sequence(region)
    raise FastaDBException, "No fasta index for " if @fasta_index.nil? || @fasta_index.null?
    # Resolve once, so the query and the orientation come from the same object.
    region = region.to_region if region.respond_to?(:to_region)
    query = region.to_s

    len = FFI::MemoryPointer.new :int
    str = Bio::DB::SAM::Tools.fai_fetch(@fasta_index, query, len)
    raise FastaDBException, "Unable to get sequence for reference: " + query if str.nil?
    reference = Bio::Sequence.auto(str)

    if region.respond_to?(:orientation) && region.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

end
|
164
|
-
end
|