bio-polyploid-tools 0.1.0 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,284 +0,0 @@
1
- require 'rubygems'
2
- require 'pathname'
3
- #require_relative 'db/fasta.rb'
4
- require 'bio'
5
-
6
- require_relative 'db/fastadb.rb'
7
-
8
- #require "set"
9
- #require 'systemu'
10
- #require 'json'
11
-
12
- =begin
13
-
14
- Extends the methods to be able to calculate the BFR and a consensus from the pileup
15
-
16
- =end
17
-
18
- class Bio::DB::Pileup
19
-
20
- #attr_accessor :minumum_ratio_for_iup_consensus
21
- #@minumum_ratio_for_iup_consensus = 0.20
22
-
23
# Returns a hash with the count of every base at this position: the counts
# reported by +non_refs+ plus the reference base (upper-cased symbol) mapped
# to +ref_count+. The result is memoized in @bases.
def bases
  return @bases if @bases

  # Work on a copy so the hash handed back by non_refs is never modified;
  # the original wrote the reference count straight into that object.
  counts = non_refs.dup
  counts[ref_base.upcase.to_sym] = ref_count
  @bases = counts
end
32
-
33
# Returns the sum of the per-base counts reported by +bases+.
#
# Goes through the +bases+ reader rather than @bases so the method also
# works when it is the first thing called on the object.
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
40
-
41
# Returns a hash mapping each base in +bases+ to its fraction of
# +base_coverage+ (as a Float). The result is memoized in @base_ratios.
def base_ratios
  return @base_ratios if @base_ratios

  counts = bases
  # Loop-invariant: computed once instead of once per base.
  depth = base_coverage.to_f
  ratios = {}
  counts.each { |base, count| ratios[base] = count.to_f / depth }
  # Only publish the memo once it is complete.
  @base_ratios = ratios
end
50
-
51
# Returns the consensus for this position as an ambiguity code.
#
# Every base in +bases+ whose share of +coverage+ is strictly greater than
# +minumum_ratio_for_iup_consensus+ is collected and the collection is
# translated with Bio::NucleicAcid.to_IUAPC. When no base passes the
# threshold the lower-cased reference base is returned instead.
#
# Results are memoized per threshold, so asking for a different ratio does
# not return the answer computed for an earlier one.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= {}
  cached = @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus]
  return cached if cached

  # Float division: with Integer counts and an Integer coverage the original
  # expression truncated every ratio below 1 to 0.
  depth = coverage.to_f
  passing = String.new
  bases.each do |base, count|
    passing << base.to_s[0] if count.to_f / depth > minumum_ratio_for_iup_consensus
  end

  result = passing.empty? ? ref_base.downcase : Bio::NucleicAcid.to_IUAPC(passing)
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] = result
end
67
- end
68
-
69
-
70
-
71
- class Bio::NucleicAcid
72
-
73
-
74
-
75
# Translates a set of bases into the matching entry of IUPAC_CODES.
#
# +bases+ is converted to a string, lower-cased, and its characters are
# sorted and de-duplicated to build the lookup key. The code found is
# returned upper-cased. When the key is not in the table a warning naming
# the offending input is written to $stderr and 'N' is returned.
def self.to_IUAPC(bases)
  key = bases.to_s.downcase.chars.sort.uniq.join
  code = IUPAC_CODES[key]
  if code.nil?
    # Report the input: the looked-up value is always nil at this point.
    $stderr.puts "Invalid base! #{bases}"
    code = 'n' # Fallback kept from the original patch: one of the scripts failed here.
  end
  code.upcase
end
84
-
85
# Tells whether +base+ is one of the characters that IUPAC_CODES lists for
# +code+. Both arguments are compared lower-cased. Returns false, instead
# of raising, when +code+ has no entry in the table.
def self.is_valid(code, base)
  expansion = IUPAC_CODES[code.to_s.downcase]
  return false if expansion.nil?

  expansion.chars.include?(base.to_s.downcase)
end
88
-
89
- end
90
-
91
-
92
- #class Bio::DB::Sam::SAMException < RuntimeError
93
-
94
- #end
95
-
96
- class Bio::DB::Sam
97
-
98
-
99
attr_writer :minumum_ratio_for_iup_consensus
attr_reader :cached_regions
#attr_accessor :pileup_cache

# Minimum ratio used for the ambiguity consensus; defaults to 0.20.
#
# The default lives in the reader because an assignment in the class body
# sets an instance variable of the class object, not of its instances, so
# every instance used to report nil here.
def minumum_ratio_for_iup_consensus
  @minumum_ratio_for_iup_consensus ||= 0.20
end
103
-
104
-
105
# Same as mpileup, but the piles of a region are kept in memory, so running
# several operations over the same region executes mpileup only once.
#
# opts must carry the region under :r or :region; its string form is the
# cache key. The options are assumed to be constant between calls for the
# same region: they are not part of the key. Call mpileup_clear_cache when
# the region is no longer needed.
#
# Each pile is yielded to the block, if one is given. The caller's hash is
# left untouched; a copy with :r, :region and :A set is passed to mpileup.
#
# Raises SAMException when no region is provided.
# NOTE(review): the SAMException definition in this file is commented out,
# so it has to be defined somewhere else for this raise to work.
#
# TODO: It may be good to load the pileup partially.
def mpileup_cached(opts={})
  region = opts[:r] || opts[:region]
  raise SAMException.new(), "A region must be provided" unless region
  @pileup_cache ||= {}
  @cached_regions ||= {}

  key = region.to_s
  cached = @cached_regions[key]
  if cached
    # Replay the stored piles (the original indexed the hash the wrong way
    # round here and always raised).
    cached.pileup.each { |pile| yield pile if block_given? }
  else
    piles = []
    fresh = Bio::DB::Fasta::Region.parse_region(key)
    fresh.pileup = piles
    mpileup(opts.merge(:r => key, :region => key, :A => true)) do |pile|
      piles << pile
      yield pile if block_given?
    end
    # Registered only after the whole region was read, so an aborted pass
    # does not leave a truncated pileup behind.
    @cached_regions[key] = fresh
  end
end
140
-
141
# Clears the pileup cache. If a region is passed as argument, just that
# region is removed; if no region is passed, the whole cache is emptied.
# Does nothing when nothing has been cached yet.
def mpileup_clear_cache(region = nil)
  return unless @cached_regions

  if region
    # Remove the key itself rather than leaving a nil placeholder behind.
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
151
-
152
# Gets the average coverage of a region from its pileup.
#
# opts[:region] may be anything whose to_s is the region string; that string
# is the cache key. The statistics are calculated with
# calculate_stats_from_pile when the region is not cached yet, or when it
# is cached without an average coverage. The caller's hash is not modified.
def average_coverage_from_pileup(opts={})
  key = opts[:region].to_s
  stats = @cached_regions && @cached_regions[key]
  if stats.nil? || stats.average_coverage.nil?
    stats = calculate_stats_from_pile(opts.merge(:region => key))
  end
  stats.average_coverage
end
159
-
160
- #
161
# Gets the coverages stored for a region by calculate_stats_from_pile.
#
# opts[:region] may be anything whose to_s is the region string; that string
# is the cache key. The statistics are calculated when the region is not
# cached yet, or when it is cached without coverages. The caller's hash is
# not modified.
def coverages_from_pileup(opts={})
  key = opts[:region].to_s
  stats = @cached_regions && @cached_regions[key]
  if stats.nil? || stats.coverages.nil?
    stats = calculate_stats_from_pile(opts.merge(:region => key))
  end
  stats.coverages
end
167
-
168
# Gets the consensus stored for a region by calculate_stats_from_pile.
#
# opts[:region] may be anything whose to_s is the region string; that string
# is the cache key. The statistics are calculated when the region is not
# cached yet, or when it is cached without a consensus. The caller's hash
# is not modified.
def consensus_with_ambiguities(opts={})
  key = opts[:region].to_s
  stats = @cached_regions && @cached_regions[key]
  if stats.nil? || stats.consensus.nil?
    stats = calculate_stats_from_pile(opts.merge(:region => key))
  end
  stats.consensus
end
175
-
176
# Walks the (cached) pileup of a region and stores per-position statistics
# on the cached region object: coverages, base_ratios, bases, the consensus
# sequence and the average coverage. Returns that region object.
#
# Options read from opts:
#   :region  - a Bio::DB::Fasta::Region, or anything whose to_s can be
#              parsed by Bio::DB::Fasta::Region.parse_region
#   :min_cov - positions are only recorded when their coverage is strictly
#              greater than this value (default 20)
#
# Positions that are not recorded keep a zero count hash / zero coverage and
# the lower-cased reference base. The caller's hash is not modified.
def calculate_stats_from_pile(opts={})
  min_cov = opts[:min_cov] || 20

  region = opts[:region]
  unless region.is_a?(Bio::DB::Fasta::Region)
    region = Bio::DB::Fasta::Region.parse_region(region.to_s)
  end

  reference = fetch_reference(region.entry, region.start, region.end).downcase

  # The constant is commented out in this file; use it when some other file
  # defines it, otherwise fall back to the value shown in that comment.
  zero_counts = defined?(BASE_COUNT_ZERO) ? BASE_COUNT_ZERO : { :A => 0, :C => 0, :G => 0, :T => 0 }
  # One hash per position: Array.new(n, obj) would make every slot share
  # (and co-mutate) a single object.
  base_ratios = Array.new(region.size) { zero_counts.dup }
  bases = Array.new(region.size) { zero_counts.dup }
  coverages = Array.new(region.size, 0)
  total_cov = 0

  mpileup_cached(:region => "#{region.to_s}") do |pile|
    offset = pile.pos - region.start
    if pile.coverage > min_cov
      base_ratios[offset] = pile.base_ratios
      # NOTE(review): the consensus string is written at offset - 1 while the
      # per-position arrays are written at offset. Kept exactly as in the
      # original; confirm the coordinate convention of fetch_reference and
      # pile.pos before changing either index.
      reference[offset - 1] = pile.consensus_iuap(0.20).upcase
      coverages[offset] = pile.coverage.to_i
      bases[offset] = pile.bases
    end
    total_cov += pile.coverage
  end

  # mpileup_cached has registered the region under its string form.
  cached = @cached_regions[region.to_s]
  cached.coverages = coverages
  cached.base_ratios = base_ratios
  cached.consensus = Bio::Sequence.new(reference)
  cached.consensus.na
  cached.consensus.reverse_complement!() if cached.orientation == :reverse
  cached.average_coverage = total_cov.to_f / cached.size.to_f
  cached.bases = bases
  cached
end
224
-
225
-
226
-
227
- #BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
228
-
229
# Gets an array with the proportions of the bases at each position of the
# region, as stored by calculate_stats_from_pile.
#
# opts[:region] may be anything whose to_s is the region string; that string
# is the cache key. The statistics are calculated when the region is not
# cached yet, or when it is cached without base ratios. The caller's hash
# is not modified.
def base_ratios_in_region(opts={})
  key = opts[:region].to_s
  stats = @cached_regions && @cached_regions[key]
  if stats.nil? || stats.base_ratios.nil?
    stats = calculate_stats_from_pile(opts.merge(:region => key))
  end
  stats.base_ratios
end
236
-
237
# Gets an array with the base counts at each position of the region, as
# stored by calculate_stats_from_pile.
#
# opts[:region] may be anything whose to_s is the region string; that string
# is the cache key. The statistics are calculated when the region is not
# cached yet, or when it is cached without base counts. The caller's hash
# is not modified.
def bases_in_region(opts={})
  key = opts[:region].to_s
  stats = @cached_regions && @cached_regions[key]
  if stats.nil? || stats.bases.nil?
    stats = calculate_stats_from_pile(opts.merge(:region => key))
  end
  stats.bases
end
244
-
245
-
246
-
247
# Writes the alignments of a region as FASTQ records (name, sequence,
# "+name", qualities), fetching them with fetch_with_function.
#
# Options read from opts:
#   :region     - a Bio::DB::Fasta::Region, or anything whose to_s can be
#                 parsed by Bio::DB::Fasta::Region.parse_region
#   :fastq_file - object to write to (anything responding to puts)
#   :fastq      - path of a file to create and write to
# Output goes to $stdout when neither output option is given.
#
# NOTE(review): the original read :fastq and :fastq_file but never used
# them, and called fetch_with_function with undefined locals, so it always
# raised NameError. The meaning given to the two output options here is
# inferred from their names; confirm against the callers.
def extract_reads(opts={})
  region = opts[:region]
  unless region.is_a?(Bio::DB::Fasta::Region)
    region = Bio::DB::Fasta::Region.parse_region(region.to_s)
  end

  dump = lambda do |out|
    print_fastq = Proc.new do |alignment|
      out.puts "@#{alignment.qname}"
      out.puts "#{alignment.seq}"
      out.puts "+#{alignment.qname}"
      out.puts "#{alignment.qual}"
    end
    fetch_with_function(region.entry, region.start, region.end, print_fastq)
  end

  if opts[:fastq_file]
    dump.call(opts[:fastq_file])
  elsif opts[:fastq]
    # Block form so the file is closed even if fetching fails.
    File.open(opts[:fastq], "w") { |file| dump.call(file) }
  else
    dump.call($stdout)
  end
end
265
-
266
- end
267
-
268
- class Bio::DB::Fasta::Region
269
- attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
270
-
271
# Returns an array with the ratio of +base+ at each of the first +size+
# positions of @base_ratios. The array is memoized per base in @all_ratios.
#
# TODO: Debug, as it hasn't been tested in the actual code.
def base_ratios_for_base(base)
  @all_ratios ||= {}
  # The length comes from this region's own size; the original asked an
  # undefined +region+ local for it.
  @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
end
283
-
284
- end
@@ -1,164 +0,0 @@
1
- #Module to hold the information about the fasta file
2
-
3
- module Bio::DB::Fasta
4
# Ordered collection of entries that can also be looked up by entry id.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends +entry+ and registers it under entry.id.
  # This doesn't validate if you are adding the same entry twice: a repeated
  # id is listed twice and the lookup keeps the last one added.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields every entry in insertion order (this is what Enumerable builds
  # on). Returns an enumerator when no block is given.
  def each(&block)
    return enum_for(:each) unless block

    @entries.each(&block)
    self
  end

  # Number of entries in the index.
  def length
    @entries.length
  end

  # Returns a new Index holding just the selected entries. Accepts the same
  # arguments as Array#[] (an index, a range, or start and length); a single
  # index gives an Index with one entry, an out-of-range one an empty Index.
  def [](*args)
    selected = @entries[*args]
    selected = [selected].compact unless selected.is_a?(Array)
    subset = Index.new
    selected.each { |entry| subset << entry }
    subset
  end

  # Returns the object registered under the id +entry+, or nil.
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
42
-
43
# One named sequence together with its length.
class Entry
  attr_reader :id, :length

  # +length+ is converted with to_i, so a numeric string is accepted.
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  # Builds a forward Region for this entry that starts at 0 and ends at the
  # entry length.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 0
      whole.end = @length
      whole.orientation = :forward
    end
  end

  # Same region as get_full_region.
  def to_region
    get_full_region
  end
end
64
-
65
# Class to wrap a region of a chromosome: an entry name, a start, an end and
# an orientation (:forward or :reverse).
class Region
  attr_accessor :entry, :start, :end, :orientation

  # Formats the region as "entry:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Parses a string of the form "entry:start-end" (single quotes are
  # dropped) into a Region. The orientation is :reverse when the end is
  # smaller than the start and :forward otherwise.
  #
  # Raises FastaDBException when the string does not have exactly one ":"
  # separated pair whose second part is exactly one "-" separated pair.
  def self.parse_region(reg_str)
    string = reg_str.to_s.delete("'")
    name_and_range = string.split(":")
    # Only split the range once we know it is there; the original
    # dereferenced it first and died with NoMethodError on input without ":".
    bounds = name_and_range.length == 2 ? name_and_range[1].split("-") : []
    raise FastaDBException, "Invalid region. #{string}" unless bounds.length == 2

    reg = Region.new
    reg.entry = name_and_range[0]
    reg.start = bounds[0].to_i
    reg.end = bounds[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Distance between start and end. Taken as an absolute value so a reverse
  # region (end smaller than start) does not report a negative size.
  def size
    (@end - @start).abs
  end
end
98
-
99
# Error type raised by the fasta database classes.
class FastaDBException < StandardError
end
100
-
101
# Class that holds the fasta file. It is used as a database. It relies
# heavily on the samtools bindings (Bio::DB::SAM::Tools) for the index.
class FastaFile

  attr_reader :index, :fasta_path

  # Finalizer hook registered by initialize; currently does nothing.
  def self.finalize(id)
    #id.close()
    #puts "Finalizing #{id} at #{Time.new}"
  end

  # Loads the fasta index for +fasta_filename+, building it first when it
  # cannot be loaded.
  #
  # Raises FastaDBException when no path is given or when the index is still
  # unavailable after building it.
  def initialize(fasta_filename)
    @fasta_path = fasta_filename
    raise FastaDBException, "No path for the refernce fasta file. " if @fasta_path.nil?
    @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
    # Same nil-or-null test everywhere; the first check used to call null?
    # on a possibly nil value.
    if index_missing?
      $stderr.puts "Generating index for: " + @fasta_path
      Bio::DB::SAM::Tools.fai_build(@fasta_path)
      @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
      raise FastaDBException, "Unable to generate fasta index for: " + @fasta_path if index_missing?
    end
    ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
  end

  # Reads "<fasta_path>.fai" into an Index, one Entry per line built from
  # the first two tab-separated fields. Returns the number of entries.
  # The index is read only once; later calls return the stored length.
  def load_fai_entries()
    return @index.length if @index

    loaded = Index.new
    # File.foreach closes the file when done; File.open(...).each left the
    # handle open.
    File.foreach(@fasta_path + ".fai") do |line|
      next if line.strip.empty?

      fields = line.split("\t")
      loaded << Entry.new(fields[0], fields[1])
    end
    # Stored only after a complete read, so a failed read can be retried.
    @index = loaded
    @index.length
  end

  # Releases the fasta index, if there is one.
  def close()
    Bio::DB::SAM::Tools.fai_destroy(@fasta_index) unless index_missing?
    @fasta_index = nil
  end

  # Fetches the sequence of +region+ from the indexed fasta file.
  #
  # The region needs to have a method to_region, or a method to_s in the
  # format "chromosome:start-end" as in samtools. The sequence is reverse
  # complemented when the resolved region reports a :reverse orientation;
  # plain strings and other objects without an orientation are returned
  # as fetched.
  #
  # Raises FastaDBException when there is no index or nothing is fetched.
  def fetch_sequence(region)
    raise FastaDBException, "No fasta index for #{@fasta_path}" if index_missing?

    target = region.respond_to?(:to_region) ? region.to_region : region
    query = target.to_s

    len = FFI::MemoryPointer.new :int
    str = Bio::DB::SAM::Tools.fai_fetch(@fasta_index, query, len)
    raise FastaDBException, "Unable to get sequence for reference: " + query if str.nil?
    reference = Bio::Sequence.auto(str)

    if target.respond_to?(:orientation) && target.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end

  private

  # True when the index was never loaded or is a null handle.
  def index_missing?
    @fasta_index.nil? || @fasta_index.null?
  end

end
164
- end