bio-polyploid-tools 0.1.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,284 +0,0 @@
1
- require 'rubygems'
2
- require 'pathname'
3
- #require_relative 'db/fasta.rb'
4
- require 'bio'
5
-
6
- require_relative 'db/fastadb.rb'
7
-
8
- #require "set"
9
- #require 'systemu'
10
- #require 'json'
11
-
12
- =begin
13
-
14
- Extends the methods to be able to calculate the BFR and a consensus from the pileup
15
-
16
- =end
17
-
18
- class Bio::DB::Pileup
19
-
20
- #attr_accessor :minumum_ratio_for_iup_consensus
21
- #@minumum_ratio_for_iup_consensus = 0.20
22
-
23
# Returns a hash with the count of each base at this pileup position.
#
# The hash is the one handed back by +non_refs+, extended in place with an
# entry for the upper-cased reference base (as a symbol) holding +ref_count+.
# The result is memoized in @bases, so the work is done only once.
# NOTE(review): +non_refs+, +ref_base+ and +ref_count+ are not defined in this
# file; presumably they come from the pileup class being extended.
def bases
  unless @bases
    @bases = non_refs
    @bases[ref_base.upcase.to_sym] = ref_count
  end
  @bases
end
32
-
33
# Total number of base calls at this position: the sum of every count in the
# hash returned by +bases+.
#
# The counts are obtained through the +bases+ accessor rather than by reading
# @bases directly, so the method also works when +bases+ has not been called
# (and therefore memoized) beforehand.
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
40
-
41
# Returns a hash mapping each base in +bases+ to its share of +base_coverage+
# (count / total, as a Float). The result is memoized in @base_ratios.
#
# The total is computed once up front instead of once per base, since it does
# not change while the ratios are being filled in.
def base_ratios
  return @base_ratios if @base_ratios

  total = base_coverage.to_f
  @base_ratios = {}
  bases.each do |base, count|
    @base_ratios[base] = count.to_f / total
  end
  @base_ratios
end
50
-
51
# Returns the consensus code for this pileup position.
#
# Every base in +bases+ whose share of +coverage+ is strictly greater than
# +minumum_ratio_for_iup_consensus+ is collected, and the collected bases are
# translated with Bio::NucleicAcid.to_IUAPC. When no base passes the threshold
# the lower-cased reference base is returned instead.
#
# The ratio is computed in floating point, so integer counts divided by an
# integer coverage are not truncated to 0. Results are memoized per threshold:
# a later call with a different ratio is computed afresh rather than answered
# from the first call's value. @consensus_iuap keeps the most recent result.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  @consensus_iuap_by_ratio ||= {}
  if @consensus_iuap_by_ratio.key?(minumum_ratio_for_iup_consensus)
    return @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus]
  end

  total = coverage.to_f
  passing = String.new
  bases.each do |base, count|
    passing << base[0].to_s if count.to_f / total > minumum_ratio_for_iup_consensus
  end

  @consensus_iuap = passing.empty? ? ref_base.downcase : Bio::NucleicAcid.to_IUAPC(passing)
  @consensus_iuap_by_ratio[minumum_ratio_for_iup_consensus] = @consensus_iuap
end
67
- end
68
-
69
-
70
-
71
- class Bio::NucleicAcid
72
-
73
-
74
-
75
# Translates a set of bases into a single upper-cased code by looking up the
# lower-cased, sorted, de-duplicated characters of +bases+ in IUPAC_CODES.
#
# When the combination is not in the table, a diagnostic naming the offending
# input is written to $stderr and 'N' is returned, so callers keep running.
# NOTE(review): IUPAC_CODES is not defined in this file; it is assumed to map
# strings such as "ag" to a one-letter code - confirm where it is declared.
def self.to_IUAPC(bases)
  key = bases.to_s.downcase.chars.sort.uniq.join
  code = IUPAC_CODES[key]
  if code.nil?
    # Report the input that failed; the lookup result itself is always nil here.
    $stderr.puts "Invalid base! #{bases}"
    code = 'n' # This is a patch... as one of the scripts failed here.
  end
  code.upcase
end
84
-
85
# Tells whether +base+ is one of the characters listed for +code+ in
# IUPAC_CODES. Both arguments are compared in lower case.
#
# Returns false, instead of failing on nil, when +code+ has no entry in the
# table.
# NOTE(review): IUPAC_CODES is declared outside this file; it is assumed to map
# a code to the string of bases it stands for - confirm.
def self.is_valid(code, base)
  expansion = IUPAC_CODES[code.downcase]
  return false if expansion.nil?

  expansion.chars.include?(base.downcase)
end
88
-
89
- end
90
-
91
-
92
- #class Bio::DB::Sam::SAMException < RuntimeError
93
-
94
- #end
95
-
96
- class Bio::DB::Sam
97
-
98
-
99
- attr_accessor :minumum_ratio_for_iup_consensus
100
- attr_reader :cached_regions
101
- #attr_accessor :pileup_cache
102
- @minumum_ratio_for_iup_consensus = 0.20
103
-
104
-
105
# Same as +mpileup+, but the pileup of each region is cached, so several
# operations over the same region do not run the mpileup command again.
# Call +mpileup_clear_cache+ when a region is no longer needed.
#
# opts[:r] or opts[:region] is required: its string form is the cache key.
# Both keys are overwritten in +opts+ with that string and opts[:A] is forced
# to true before +mpileup+ is invoked. The options are assumed to be constant
# per region; the cache is keyed by region only.
#
# Each pileup is yielded to the block, if one is given. On the first call for
# a region the pileups come from +mpileup+ and are stored on a freshly parsed
# Bio::DB::Fasta::Region kept in @cached_regions; on later calls they are
# replayed from that stored array.
#
# Raises SAMException when no region is supplied.
# NOTE(review): SAMException is not defined in this file (its definition is
# commented out); confirm it is provided by the extended class.
#
# TODO: It may be good to load partially the pileup
def mpileup_cached(opts={})
  raise SAMException.new(), "A region must be provided" unless opts[:r] || opts[:region]
  @pileup_cache ||= Hash.new
  @cached_regions ||= Hash.new

  key = (opts[:r] ? opts[:r] : opts[:region]).to_s
  opts[:r] = key.dup
  opts[:region] = key.dup
  opts[:A] = true

  cached = @cached_regions[key]
  if cached
    # Cache hit: the pileups live on the cached region object, not on the hash.
    cached.pileup.each do |pile|
      yield pile if block_given?
    end
  else
    piles = Array.new
    fresh = Bio::DB::Fasta::Region.parse_region(key)
    fresh.pileup = piles
    @cached_regions[key] = fresh
    mpileup(opts) do |pile|
      piles << pile
      yield pile if block_given?
    end
  end
end
140
-
141
# Clears the pileup cache.
#
# When +region+ is given, only the entry stored under its string form is
# removed from @cached_regions. When it is omitted (or nil), every cached
# region is dropped. Does nothing when nothing has been cached yet.
def mpileup_clear_cache(region = nil)
  return unless @cached_regions

  if region
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
151
-
152
# Gets the average coverage of a region from its pileup statistics.
#
# opts[:region] may be a region string or a Bio::DB::Fasta::Region; a Region
# is replaced in +opts+ by its string form, which is also the cache key.
# +calculate_stats_from_pile+ is run first when the region has no entry in
# @cached_regions yet.
def average_coverage_from_pileup(opts={})
  requested = opts[:region]
  opts[:region] = requested.to_s if requested.instance_of?(Bio::DB::Fasta::Region)
  key = opts[:region]

  if @cached_regions.nil? || @cached_regions[key].nil?
    calculate_stats_from_pile(opts)
  end
  @cached_regions[key].average_coverage
end
159
-
160
- #
161
# Returns the +coverages+ value stored on the cached region.
#
# opts[:region] may be a region string or a Bio::DB::Fasta::Region (converted
# to its string form inside +opts+). The statistics are computed through
# +calculate_stats_from_pile+ when @cached_regions has no entry for the region.
def coverages_from_pileup(opts={})
  if opts[:region].instance_of?(Bio::DB::Fasta::Region)
    opts[:region] = opts[:region].to_s
  end
  key = opts[:region]

  missing = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if missing

  @cached_regions[key].coverages
end
167
-
168
# Returns the +consensus+ value stored on the cached region.
#
# A Bio::DB::Fasta::Region passed as opts[:region] is first replaced by its
# string form; that string is the key into @cached_regions. When the region
# has not been processed yet, +calculate_stats_from_pile+ is called to fill
# the cache before the consensus is read.
def consensus_with_ambiguities(opts={})
  region_arg = opts[:region]
  opts[:region] = region_arg.to_s if region_arg.instance_of?(Bio::DB::Fasta::Region)
  key = opts[:region]

  unless !@cached_regions.nil? && !@cached_regions[key].nil?
    calculate_stats_from_pile(opts)
  end
  @cached_regions[key].consensus
end
175
-
176
# Walks the (cached) pileup of a region and stores per-position statistics on
# the region object kept in @cached_regions.
#
# Options:
#   :region  - region string or Bio::DB::Fasta::Region; a string is parsed and
#              the parsed Region is written back into +opts+.
#   :min_cov - a position is only recorded when its coverage is strictly
#              greater than this value (default 20).
#
# For every recorded position the base ratios, base counts, integer coverage
# and the upper-cased consensus code (threshold 0.20) are stored; the consensus
# overwrites the corresponding character of the lower-cased reference. All
# other positions keep a zeroed count hash / coverage 0 and the reference
# base. The average coverage is the sum of the coverage of every pileup line
# divided by region.size. The consensus is reverse-complemented for regions
# whose orientation is :reverse.
#
# Each position gets its own zero-count hash, so changing one position's
# default can not leak into the others. BASE_COUNT_ZERO is used as the
# template when it is defined; otherwise the A/C/G/T zero table is used.
#
# Returns the cached region object carrying the statistics.
def calculate_stats_from_pile(opts={})
  min_cov = opts[:min_cov] ? opts[:min_cov] : 20

  unless opts[:region].instance_of?(Bio::DB::Fasta::Region)
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]

  reference = fetch_reference(region.entry, region.start, region.end).downcase

  zero_counts = defined?(BASE_COUNT_ZERO) ? BASE_COUNT_ZERO : { :A => 0, :C => 0, :G => 0, :T => 0 }
  base_ratios = Array.new(region.size) { zero_counts.dup }
  bases = Array.new(region.size) { zero_counts.dup }
  coverages = Array.new(region.size, 0)
  total_cov = 0

  mpileup_cached(:region => region.to_s) do |pile|
    if pile.coverage > min_cov
      offset = pile.pos - region.start
      base_ratios[offset] = pile.base_ratios
      # NOTE(review): the reference string is indexed one position to the left
      # of the arrays (kept as in the original) - confirm the coordinate
      # convention of fetch_reference against pile.pos.
      reference[offset - 1] = pile.consensus_iuap(0.20).upcase
      coverages[offset] = pile.coverage.to_i
      bases[offset] = pile.bases
    end
    total_cov += pile.coverage
  end

  # mpileup_cached registered the region under its string form.
  cached = @cached_regions[region.to_s]
  cached.coverages = coverages
  cached.base_ratios = base_ratios
  cached.consensus = Bio::Sequence.new(reference)
  cached.consensus.na
  cached.consensus.reverse_complement!() if cached.orientation == :reverse
  cached.average_coverage = total_cov.to_f / cached.size.to_f
  cached.bases = bases
  cached
end
224
-
225
-
226
-
227
- #BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
228
-
229
# Gets the array with the proportions of the bases for each position of the
# region, as stored in +base_ratios+ on the cached region.
#
# opts[:region] may be a region string or a Bio::DB::Fasta::Region (converted
# to a string in +opts+). +calculate_stats_from_pile+ fills the cache when the
# region is not in @cached_regions yet.
def base_ratios_in_region(opts={})
  wanted = opts[:region]
  opts[:region] = wanted.to_s if wanted.instance_of?(Bio::DB::Fasta::Region)
  key = opts[:region]

  calculate_stats_from_pile(opts) if @cached_regions.nil? || @cached_regions[key].nil?

  @cached_regions[key].base_ratios
end
236
-
237
# Gets the array with the base counts for each position of the region, as
# stored in +bases+ on the cached region.
#
# A Bio::DB::Fasta::Region given as opts[:region] is swapped for its string
# form, which is the key into @cached_regions. The statistics are computed
# with +calculate_stats_from_pile+ when that key has no cached region.
def bases_in_region(opts={})
  if opts[:region].instance_of?(Bio::DB::Fasta::Region)
    opts[:region] = opts[:region].to_s
  end
  key = opts[:region]

  not_cached = @cached_regions.nil? || @cached_regions[key].nil?
  calculate_stats_from_pile(opts) if not_cached

  @cached_regions[key].bases
end
244
-
245
-
246
-
247
# Writes every alignment found in a region as a four-line FASTQ record
# (@name, sequence, +name, qualities).
#
# Options:
#   :region     - region string or Bio::DB::Fasta::Region (a string is parsed
#                 and the parsed Region is written back into +opts+).
#   :fastq_file - an already opened IO-like object to write to.
#   :fastq      - path of a file to create; it is closed before returning.
# Without :fastq_file or :fastq the records go to $stdout.
#
# The region's entry and its coordinates (smallest first) are handed to
# +fetch_with_function+ together with the printing callback.
# NOTE(review): +fetch_with_function+ is not defined in this file; it is
# assumed to take (chromosome, start, end, callable) and to call the callable
# with each alignment - confirm against the extended class.
def extract_reads(opts={})
  unless opts[:region].instance_of?(Bio::DB::Fasta::Region)
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]

  owned = nil
  out = opts[:fastq_file]
  if out.nil? && opts[:fastq]
    owned = File.open(opts[:fastq], "w")
    out = owned
  end
  out ||= $stdout

  print_fastq = Proc.new do |alignment|
    out.puts "@#{alignment.qname}"
    out.puts "#{alignment.seq}"
    out.puts "+#{alignment.qname}"
    out.puts "#{alignment.qual}"
  end

  first, last = [region.start, region.end].sort
  begin
    fetch_with_function(region.entry, first, last, print_fastq)
  ensure
    # Only close what this method opened itself.
    owned.close if owned
  end
end
265
-
266
- end
267
-
268
- class Bio::DB::Fasta::Region
269
- attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
270
-
271
# Returns, for one +base+, the array of its ratio at every position of this
# region: element i is @base_ratios[i][base] for i from 0 up to size - 1.
# The array is computed once per base and memoized in @all_ratios.
#
# The positions are counted with this region's own +size+.
# TODO: still needs testing against real data.
def base_ratios_for_base(base)
  @all_ratios ||= {}
  @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
end
283
-
284
- end
@@ -1,164 +0,0 @@
1
- #Module to hold the information about the fasta file
2
-
3
- module Bio::DB::Fasta
4
# Ordered collection of the entries of a fasta index. Entries are kept both in
# insertion order (@entries) and in a lookup table keyed by entry.id
# (@entries_map). Being Enumerable, it can be iterated, mapped, etc.
class Index
  include Enumerable
  attr_reader :entries

  def initialize
    @entries = []
    @entries_map = {}
  end

  # Appends +entry+ and registers it under entry.id.
  # Adding the same id twice is not validated: the entry is listed twice and
  # the lookup table keeps the last one.
  def <<(entry)
    @entries << entry
    @entries_map[entry.id] = entry
  end

  # Yields every entry in insertion order.
  def each(&block)
    @entries.each(&block)
  end

  # Number of entries.
  def length
    @entries.length
  end

  # Returns a new Index holding only the selected entries, as if the index
  # were an Array. +args+ may be a range or a single position; a selection
  # outside the index yields an empty Index.
  def [](args)
    picked = @entries[args]
    picked = picked.nil? ? [] : [picked] unless picked.is_a?(Array)

    subset = Index.new
    picked.each { |entry| subset << entry }
    subset
  end

  # Looks up the object registered under the id +entry+ (nil when unknown).
  def region_for_entry(entry)
    @entries_map[entry]
  end
end
42
-
43
# One sequence of the fasta index: its identifier and its length.
class Entry
  attr_reader :id, :length

  # +length+ is converted with to_i, so it may be given as a string.
  def initialize(id, length)
    @id = id
    @length = length.to_i
  end

  # Builds a forward Region spanning the whole entry: it starts at 0 and ends
  # at the entry length.
  def get_full_region
    Region.new.tap do |whole|
      whole.entry = id
      whole.start = 0
      whole.end = @length
      whole.orientation = :forward
    end
  end

  # Conversion hook used by code that accepts anything region-like.
  def to_region
    get_full_region
  end
end
64
-
65
# Class to wrap a region of a chromosome: the entry name, the start and end
# coordinates and the orientation (:forward or :reverse).
class Region
  attr_accessor :entry, :start, :end, :orientation

  # Formats the region as "entry:start-end".
  def to_s
    "#{@entry}:#{@start}-#{@end}"
  end

  # Parses a string of the form "entry:start-end" (single quotes are dropped)
  # into a Region. The orientation is :reverse when end < start, :forward
  # otherwise.
  #
  # Raises FastaDBException for anything that does not consist of exactly one
  # ":"-separated name and one "-"-separated coordinate pair, including
  # strings with no ":" at all.
  def self.parse_region(reg_str)
    string = reg_str.delete("'")
    fields_1 = string.split(":")
    # Validate the outer split first: fields_1[1] is nil when ":" is missing.
    fields_2 = fields_1.length == 2 ? fields_1[1].split("-") : []
    raise FastaDBException.new(), "Invalid region. #{string}" if fields_1.length != 2 || fields_2.length != 2

    reg = Region.new
    reg.entry = fields_1[0]
    reg.start = fields_2[0].to_i
    reg.end = fields_2[1].to_i
    reg.orientation = reg.end < reg.start ? :reverse : :forward
    reg
  end

  # Difference between end and start.
  # NOTE(review): this is negative for :reverse regions and does not count
  # both end points - confirm that callers expect exactly this.
  def size
    @end - @start
  end
end
98
-
99
# Error raised by the fasta database classes (invalid regions, missing or
# unusable fasta index, sequences that can not be fetched).
class FastaDBException < StandardError
end
100
-
101
# Class that holds the fasta file. It is used as a database and relies on the
# faidx functions exposed through Bio::DB::SAM::Tools.
class FastaFile
  attr_reader :index, :fasta_path

  # Finalizer hook registered for every instance; currently does nothing.
  def self.finalize(id)
    #id.close()
    #puts "Finalizing #{id} at #{Time.new}"
  end

  # Loads the faidx index of +fasta_filename+, building it first when loading
  # returns a null handle.
  #
  # Raises FastaDBException when no path is given or when the index can not
  # be generated.
  def initialize(fasta_filename)
    @fasta_path = fasta_filename
    raise FastaDBException.new(), "No path for the refernce fasta file. " if @fasta_path.nil?

    @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
    if @fasta_index.null?
      $stderr.puts "Generating index for: " + @fasta_path
      Bio::DB::SAM::Tools.fai_build(@fasta_path)
      @fasta_index = Bio::DB::SAM::Tools.fai_load(@fasta_path)
      raise FastaDBException.new(), "Unable to generate fasta index for: " + @fasta_path if @fasta_index.nil? || @fasta_index.null?
    end
    ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
  end

  # Reads "<fasta_path>.fai" into an Index (one Entry per line, taking the
  # first two tab-separated columns as id and length) and returns the number
  # of entries. The index is built only once; later calls just return its
  # length. The file is read with File.foreach, which closes it when done.
  def load_fai_entries()
    return @index.length if @index

    @index = Index.new
    File.foreach(@fasta_path + ".fai") do |line|
      fields = line.split("\t")
      @index << Entry.new(fields[0], fields[1])
    end
    @index.length
  end

  # Releases the faidx handle, if any.
  def close()
    Bio::DB::SAM::Tools.fai_destroy(@fasta_index) unless @fasta_index.nil? || @fasta_index.null?
    @fasta_index = nil
  end

  # Fetches the sequence of a region.
  #
  # +region+ needs to have a method to_region, or a method to_s that has the
  # format "chromosome:start-end" as in samtools. When the resolved region
  # exposes an orientation and it is :reverse, the sequence is
  # reverse-complemented; plain strings are returned as fetched.
  #
  # Raises FastaDBException when there is no index or the fetch returns nil.
  def fetch_sequence(region)
    raise FastaDBException.new(), "No fasta index for " if @fasta_index.nil? || @fasta_index.null?

    # Resolve once, so the query string and the orientation come from the
    # same object (an entry, for instance, has no orientation of its own).
    target = region.respond_to?(:to_region) ? region.to_region : region
    query = target.to_s

    len = FFI::MemoryPointer.new :int
    str = Bio::DB::SAM::Tools.fai_fetch(@fasta_index, query, len)
    raise FastaDBException.new(), "Unable to get sequence for reference: " + query if str.nil?

    reference = Bio::Sequence.auto(str)
    if target.respond_to?(:orientation) && target.orientation == :reverse
      reference.reverse_complement!()
    end
    reference
  end
end
164
- end