bio-polyploid-tools 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +16 -0
  3. data/Gemfile.lock +67 -0
  4. data/README +21 -0
  5. data/Rakefile +61 -0
  6. data/VERSION +1 -0
  7. data/bin/bfr.rb +133 -0
  8. data/bin/count_variations.rb +36 -0
  9. data/bin/filter_blat_by_target_coverage.rb +15 -0
  10. data/bin/find_best_blat_hit.rb +32 -0
  11. data/bin/hexaploid_primers.rb +168 -0
  12. data/bin/homokaryot_primers.rb +155 -0
  13. data/bin/map_markers_to_contigs.rb +66 -0
  14. data/bin/markers_in_region.rb +42 -0
  15. data/bin/polymarker.rb +219 -0
  16. data/bin/snps_between_bams.rb +106 -0
  17. data/bio-polyploid-tools.gemspec +139 -0
  18. data/conf/defaults.rb +1 -0
  19. data/conf/primer3_config/dangle.dh +128 -0
  20. data/conf/primer3_config/dangle.ds +128 -0
  21. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  22. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  23. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  24. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  25. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  26. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  27. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  28. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  29. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  30. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  31. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  32. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  33. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  34. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  35. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  36. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  37. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  38. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  39. data/conf/primer3_config/loops.dh +30 -0
  40. data/conf/primer3_config/loops.ds +30 -0
  41. data/conf/primer3_config/stack.dh +256 -0
  42. data/conf/primer3_config/stack.ds +256 -0
  43. data/conf/primer3_config/stackmm.dh +256 -0
  44. data/conf/primer3_config/stackmm.ds +256 -0
  45. data/conf/primer3_config/tetraloop.dh +77 -0
  46. data/conf/primer3_config/tetraloop.ds +77 -0
  47. data/conf/primer3_config/triloop.dh +16 -0
  48. data/conf/primer3_config/triloop.ds +16 -0
  49. data/conf/primer3_config/tstack.dh +256 -0
  50. data/conf/primer3_config/tstack2.dh +256 -0
  51. data/conf/primer3_config/tstack2.ds +256 -0
  52. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  53. data/lib/bio/BFRTools.rb +698 -0
  54. data/lib/bio/BIOExtensions.rb +186 -0
  55. data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
  56. data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
  57. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  58. data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
  59. data/lib/bio/PolyploidTools/SNP.rb +681 -0
  60. data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
  61. data/lib/bio/SAMToolsExtensions.rb +284 -0
  62. data/lib/bio/db/exonerate.rb +272 -0
  63. data/lib/bio/db/fastadb.rb +164 -0
  64. data/lib/bio/db/primer3.rb +673 -0
  65. data/lib/bioruby-polyploid-tools.rb +25 -0
  66. data/test/data/BS00068396_51.fa +2 -0
  67. data/test/data/BS00068396_51_contigs.aln +1412 -0
  68. data/test/data/BS00068396_51_contigs.dnd +7 -0
  69. data/test/data/BS00068396_51_contigs.fa +8 -0
  70. data/test/data/BS00068396_51_exonerate.tab +6 -0
  71. data/test/data/BS00068396_51_genes.txt +14 -0
  72. data/test/data/LIB1716.bam +0 -0
  73. data/test/data/LIB1716.bam.bai +0 -0
  74. data/test/data/LIB1719.bam +0 -0
  75. data/test/data/LIB1719.bam.bai +0 -0
  76. data/test/data/LIB1721.bam +0 -0
  77. data/test/data/LIB1721.bam.bai +0 -0
  78. data/test/data/LIB1722.bam +0 -0
  79. data/test/data/LIB1722.bam.bai +0 -0
  80. data/test/data/S22380157.fa +16 -0
  81. data/test/data/S22380157.fa.fai +1 -0
  82. data/test/data/Test3Aspecific.csv +1 -0
  83. data/test/data/Test3Aspecific_contigs.fa +6 -0
  84. data/test/data/patological_cases5D.csv +1 -0
  85. data/test/data/short_primer_design_test.csv +10 -0
  86. data/test/data/test_primer3_error.csv +4 -0
  87. data/test/data/test_primer3_error_contigs.fa +10 -0
  88. data/test/test_bfr.rb +51 -0
  89. data/test/test_exon_container.rb +17 -0
  90. data/test/test_exonearate.rb +53 -0
  91. data/test/test_snp_parsing.rb +40 -0
  92. metadata +201 -0
@@ -0,0 +1,698 @@
1
+ require 'rubygems'
2
+ #require 'extensions/all'
3
+ #require 'bio-samtools'
4
+ #require 'bio/db/pileup'
5
+ #require 'bio/db/vcf'
6
+ require 'pathname'
7
+ #require_relative 'BIOExtensions.rb'
8
+ require_relative 'db/fastadb.rb'
9
+
10
+ require 'bio'
11
+ require "set"
12
+ require 'systemu'
13
+ require 'json'
14
+ #require 'strmask'
15
+
16
+ =begin
17
+
18
+ Extends the methods to be able to calculate the BFR and a consensus from the pileup
19
+
20
+ =end
21
+
22
+ class Bio::DB::Pileup
23
+
24
+ #attr_accessor :minumum_ratio_for_iup_consensus
25
+ #@minumum_ratio_for_iup_consensus = 0.20
26
+
27
+ #Returns a hash with the count of bases
28
+
29
# Returns a hash with the count of every base seen at this position.
# The non-reference counts come from +non_refs+; the reference base is added
# under its upper-case symbol. The hash is built once and then reused.
def bases
  unless @bases
    @bases = non_refs
    @bases[ref_base.upcase.to_sym] = ref_count
  end
  @bases
end
36
+
37
# Returns the sum of all base counts at this position.
#
# Goes through +bases+ rather than the raw @bases instance variable, so the
# method also works when it is the first thing called on the pileup.
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
44
+
45
# Returns a hash mapping each base to its share of +base_coverage+.
# The result is computed on the first call and cached afterwards.
def base_ratios
  @base_ratios ||= bases.each_with_object({}) do |(base, count), ratios|
    ratios[base] = count.to_f / base_coverage.to_f
  end
end
54
+
55
+ # returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
56
# Returns the consensus for this position as an ambiguity code.
#
# Every base whose share of +coverage+ is strictly above
# +minumum_ratio_for_iup_consensus+ is collected and handed to
# Bio::NucleicAcid.to_IUAPC. When no base passes the threshold the
# lower-cased reference base is returned instead.
#
# The share is computed in floating point, so an Integer coverage no longer
# truncates every ratio to zero.
#
# NOTE: the result is cached on the first call; later calls return the cached
# value even if a different threshold is passed.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  if @consensus_iuap.nil?
    @consensus_iuap = ref_base.downcase
    depth = coverage.to_f
    frequent = String.new
    bases.each do |base, count|
      frequent << base.to_s[0] if count / depth > minumum_ratio_for_iup_consensus
    end
    @consensus_iuap = Bio::NucleicAcid.to_IUAPC(frequent) unless frequent.empty?
  end
  @consensus_iuap
end
71
+ end
72
+
73
+ class Bio::DB::Fasta::Region
74
+ attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
75
+
76
+ #TODO: Debug, as it hasnt been tested in the actual code.
77
# Returns, for +base+, the ratio stored at every position of @base_ratios.
# The extracted array is cached per base in @all_ratios.
#
# The original loop indexed through an undefined +region+; iterating
# @base_ratios directly yields one entry per stored position.
def base_ratios_for_base(base)
  @all_ratios ||= {}
  @all_ratios[base] ||= @base_ratios.map { |ratios| ratios[base] }
end
88
+
89
+ end
90
+
91
# Error type used by the caching helpers this file adds to Bio::DB::Sam.
class Bio::DB::Sam::SAMException < RuntimeError; end
94
+
95
+ class Bio::DB::Sam
96
+
97
+
98
+ attr_accessor :minumum_ratio_for_iup_consensus
99
+ attr_reader :cached_regions
100
+ #attr_accessor :pileup_cache
101
+ @minumum_ratio_for_iup_consensus = 0.20
102
+
103
+
104
+ #Same as mpileup, but it caches the pileup, so that several operations on the same set of regions
105
+ #can reuse the pile without executing the mpileup command several times.
106
+ #Whenever you finish using a region, call mpileup_clear_cache to free the cache
107
+ #The argument Region is required, as it will be the key for the underlying hash.
108
+ #We assume that the options are constant. If they are not, the cache mechanism may not be consistent.
109
+ #
110
+ #TODO: It may be good to load partially the pileup
111
# Runs +mpileup+ for a region and yields every pile, remembering the piles so
# that later calls for the same region are served from memory.
#
# opts:: options forwarded to +mpileup+. Either :r or :region is required
#        (:r wins when both are given); its +to_s+ is the cache key.
#
# Raises SAMException when no region is supplied.
#
# The caller's hash is left untouched: the quoted region and the :A flag are
# set on a copy. The cached branch reads the piles from the cached region
# object (the original called +pileup+ on the Hash itself and crashed).
def mpileup_cached(opts = {})
  region = opts[:r] || opts[:region]
  raise SAMException, "A region must be provided" unless region

  @pileup_cache ||= {}
  @cached_regions ||= {}

  key = region.to_s
  cached = @cached_regions[key]
  if cached
    cached.pileup.each { |pile| yield pile }
  else
    piles = []
    cached = Bio::DB::Fasta::Region.parse_region(key)
    cached.pileup = piles
    @cached_regions[key] = cached
    # The region is quoted before being passed on, as in the original code.
    mpileup_opts = opts.merge(:r => "'#{key}'", :region => "'#{key}'", :A => true)
    mpileup(mpileup_opts) do |pile|
      piles << pile
      yield pile
    end
  end
end
139
+
140
+ #Clears the pileup cache. If a region is passed as argument, just the specified region is removed
141
+ #If no region is passed, the hash is emptied
142
# Clears the pileup cache.
#
# region:: when given, only the entry stored under +region.to_s+ is removed;
#          when omitted (or nil) the whole cache is emptied.
#
# Does nothing if no region has been cached yet. Entries are deleted rather
# than set to nil, so the key does not linger in +cached_regions+.
def mpileup_clear_cache(region = nil)
  return unless @cached_regions

  if region
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
150
+
151
+ #Gets the coverage of a region from a pileup.
152
# Returns the average coverage of opts[:region], computing the region's
# statistics with +calculate_stats_from_pile+ if they are not cached yet.
#
# The cache key is always +opts[:region].to_s+, so strings, Region objects and
# Region subclasses all resolve to the same entry (the original only
# stringified exact Region instances and then looked up nil for anything else).
def average_coverage_from_pileup(opts = {})
  region = opts[:region].to_s
  calculate_stats_from_pile(opts) unless @cached_regions && @cached_regions[region]
  @cached_regions[region].average_coverage
end
158
+
159
+ #
160
# Returns the per-position coverages of opts[:region], computing the region's
# statistics with +calculate_stats_from_pile+ if they are not cached yet.
#
# The cache key is always +opts[:region].to_s+, so strings, Region objects and
# Region subclasses all resolve to the same entry.
def coverages_from_pileup(opts = {})
  region = opts[:region].to_s
  calculate_stats_from_pile(opts) unless @cached_regions && @cached_regions[region]
  @cached_regions[region].coverages
end
166
+
167
# Returns the consensus sequence of opts[:region], computing the region's
# statistics with +calculate_stats_from_pile+ if they are not cached yet.
#
# The cache key is always +opts[:region].to_s+, so strings, Region objects and
# Region subclasses all resolve to the same entry.
def consensus_with_ambiguities(opts = {})
  region = opts[:region].to_s
  calculate_stats_from_pile(opts) unless @cached_regions && @cached_regions[region]
  @cached_regions[region].consensus
end
174
+
175
# Walks the pileup of opts[:region] once and stores the derived statistics on
# the cached region object, which is returned.
#
# opts[:region]::  a Region or anything whose +to_s+ parses as one. As before,
#                  opts[:region] is replaced by the parsed Region.
# opts[:min_cov]:: a position is only recorded when its coverage is strictly
#                  above this value (default 20).
#
# For recorded positions the base ratios, base counts, coverage and consensus
# code are stored; every other position keeps the shared BASE_COUNT_ZERO hash,
# a coverage of 0 and the lower-cased reference base.
#
# The consensus threshold is taken from +minumum_ratio_for_iup_consensus+ when
# it has been set on this object; otherwise the previous fixed 0.20 is used.
def calculate_stats_from_pile(opts = {})
  min_cov = opts[:min_cov] || 20
  min_ratio = @minumum_ratio_for_iup_consensus || 0.20

  unless opts[:region].class == Bio::DB::Fasta::Region
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]
  reference = fetch_reference(region.entry, region.start, region.end).downcase

  # Every slot starts out pointing at the same BASE_COUNT_ZERO hash; slots are
  # replaced, never mutated in place.
  base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
  bases = Array.new(region.size, BASE_COUNT_ZERO)
  coverages = Array.new(region.size, 0)
  total_cov = 0

  mpileup_cached(:region => region.to_s) do |pile|
    if pile.coverage > min_cov
      offset = pile.pos - region.start
      base_ratios[offset] = pile.base_ratios
      reference[offset] = pile.consensus_iuap(min_ratio)
      coverages[offset] = pile.coverage.to_i
      bases[offset] = pile.bases
    end
    # The average counts every pile, including those below min_cov.
    total_cov += pile.coverage
  end

  cached = @cached_regions[region.to_s]
  cached.coverages = coverages
  cached.base_ratios = base_ratios
  cached.consensus = reference
  cached.average_coverage = total_cov.to_f / cached.size.to_f
  cached.bases = bases
  cached
end
209
+
210
+
211
+
212
+ BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
213
+
214
+ #Gets an array with the proportions of the bases in the region. Positions without enough coverage hold BASE_COUNT_ZERO.
215
# Returns the per-position base ratios of opts[:region], computing the
# region's statistics with +calculate_stats_from_pile+ if they are not cached.
#
# The cache key is always +opts[:region].to_s+, so strings, Region objects and
# Region subclasses all resolve to the same entry.
def base_ratios_in_region(opts = {})
  region = opts[:region].to_s
  calculate_stats_from_pile(opts) unless @cached_regions && @cached_regions[region]
  @cached_regions[region].base_ratios
end
221
+
222
+ #Gets an array with the base counts in the region. Positions without enough coverage hold BASE_COUNT_ZERO.
223
# Returns the per-position base counts of opts[:region], computing the
# region's statistics with +calculate_stats_from_pile+ if they are not cached.
#
# The cache key is always +opts[:region].to_s+, so strings, Region objects and
# Region subclasses all resolve to the same entry.
def bases_in_region(opts = {})
  region = opts[:region].to_s
  calculate_stats_from_pile(opts) unless @cached_regions && @cached_regions[region]
  @cached_regions[region].bases
end
229
+
230
+
231
+
232
# Writes every alignment overlapping opts[:region] as a four-line FASTQ record.
#
# opts[:region]::     a Region or anything whose +to_s+ parses as one. As
#                     before, opts[:region] is replaced by the parsed Region.
# opts[:fastq_file]:: an open IO-like object to write to (takes precedence).
# opts[:fastq]::      path of a file to create and write to.
# Without either option the records go to $stdout, as they always did.
#
# The original body passed undefined locals (+chromosome+, +qstart+, +len+) to
# +fetch_with_function+; the coordinates now come from the region itself.
# NOTE(review): assumes fetch_with_function(chromosome, start, end, proc) calls
# the proc once per alignment - confirm against the installed bio-samtools.
def extract_reads(opts = {})
  unless opts[:region].class == Bio::DB::Fasta::Region
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]

  dump = lambda do |out|
    print_fastq = Proc.new do |alignment|
      out.puts "@#{alignment.qname}"
      out.puts "#{alignment.seq}"
      out.puts "+#{alignment.qname}"
      out.puts "#{alignment.qual}"
    end
    fetch_with_function(region.entry, region.start, region.end, print_fastq)
  end

  if opts[:fastq_file]
    dump.call(opts[:fastq_file])
  elsif opts[:fastq]
    File.open(opts[:fastq], "w") { |file| dump.call(file) }
  else
    dump.call($stdout)
  end
end
250
+
251
+
252
+
253
+ end
254
+
255
+ module Bio::BFRTools
256
+
257
+
258
+
259
# Error raised by the BFR tools for missing or unusable inputs and arguments.
class BFRToolsException < StandardError
end
260
+
261
+
262
+ class Container
263
+
264
+ attr_reader :putative_snps, :processed_regions, :total_length, :parental_1_sam, :parental_2_sam, :bulk_1_sam, :bulk_2_sam
265
+ attr_reader :parental_1_name, :parental_2_name, :bulk_1_name, :bulk_2_name, :reference_db
266
+
267
+ BASES = [:A, :C, :G, :T]
268
+ #Sets the reference file
269
# Sets the reference: opens +path+ as a FastaFile and remembers the path so it
# can be handed to the BAM readers created afterwards.
def reference(path)
  fasta = Bio::DB::Fasta::FastaFile.new(path)
  @reference_db = fasta
  @reference_path = path
end
273
+
274
# Fetches the sequence of +region+ from the reference set with +reference+.
def reference_sequence(region)
  fasta = @reference_db
  fasta.fetch_sequence(region)
end
277
+
278
+ #Sets the sorted BAM file of the first parental
279
+ #It accepts the following arguments
280
+ #:name=>A name for the parental 1 (optional). If not provided, the file name without the .bam extension is used.
281
+ #:path=>Path to the sorted BAM file (required).
282
# Sets the sorted BAM file of the first parental.
#
# opts[:path]:: path to the BAM file (required).
# opts[:name]:: display name; defaults to the file name without ".bam".
#
# Raises BFRToolsException when the path is missing, unreadable or a
# directory. The original guard (+unless readable? or directory?+) let any
# directory through to the BAM reader.
def parental_1(opts)
  raise BFRToolsException.new("Missing path for parental 1") if opts[:path].nil?

  path = Pathname.new(opts[:path])
  unless path.readable? && !path.directory?
    raise BFRToolsException.new("Unable to open #{path}")
  end

  @parental_1_name = opts[:name] || path.basename(".bam").to_s
  @parental_1_sam = Bio::DB::Sam.new({:fasta => @reference_path, :bam => path.realpath.to_s})
  @parental_1_path = path
end
292
+
293
+ #Sets the sorted BAM file of the second parental
294
# Sets the sorted BAM file of the second parental.
#
# opts[:path]:: path to the BAM file (required).
# opts[:name]:: display name; defaults to the file name without ".bam".
#
# Raises BFRToolsException when the path is missing, unreadable or a
# directory. The original guard (+unless readable? or directory?+) let any
# directory through to the BAM reader. The stray +@name+ assignment, which no
# sibling setter makes and nothing in this file reads, has been dropped.
def parental_2(opts)
  raise BFRToolsException.new("Missing path for parental 2") if opts[:path].nil?

  path = Pathname.new(opts[:path])
  unless path.readable? && !path.directory?
    raise BFRToolsException.new("Unable to open #{path}")
  end

  @parental_2_name = opts[:name] || path.basename(".bam").to_s
  @parental_2_sam = Bio::DB::Sam.new({:fasta => @reference_path, :bam => path.realpath.to_s})
  @parental_2_path = path
end
303
+
304
+ #Sets the sorted BAM file of the first bulk
305
# Sets the sorted BAM file of the first bulk.
#
# opts[:path]:: path to the BAM file (required).
# opts[:name]:: display name; defaults to the file name without ".bam".
#
# Raises BFRToolsException when the path is missing, unreadable or a
# directory. The original guard (+unless readable? or directory?+) let any
# directory through to the BAM reader.
def bulk_1(opts)
  raise BFRToolsException.new("Missing path for bulk 1") if opts[:path].nil?

  path = Pathname.new(opts[:path])
  unless path.readable? && !path.directory?
    raise BFRToolsException.new("Unable to open #{path}")
  end

  @bulk_1_name = opts[:name] || path.basename(".bam").to_s
  @bulk_1_sam = Bio::DB::Sam.new({:fasta => @reference_path, :bam => path.realpath.to_s})
  @bulk_1_path = path
end
314
+
315
+ #Sets the sorted BAM file of the second bulk
316
# Sets the sorted BAM file of the second bulk.
#
# opts[:path]:: path to the BAM file (required).
# opts[:name]:: display name; defaults to the file name without ".bam".
#
# Raises BFRToolsException when the path is missing, unreadable or a
# directory. The original guard (+unless readable? or directory?+) let any
# directory through to the BAM reader.
def bulk_2(opts)
  raise BFRToolsException.new("Missing path for bulk 2") if opts[:path].nil?

  path = Pathname.new(opts[:path])
  unless path.readable? && !path.directory?
    raise BFRToolsException.new("Unable to open #{path}")
  end

  @bulk_2_name = opts[:name] || path.basename(".bam").to_s
  @bulk_2_sam = Bio::DB::Sam.new({:fasta => @reference_path, :bam => path.realpath.to_s})
  @bulk_2_path = path
end
325
+
326
+
327
# Counts the positions of +seq1+ at which +seq2+ holds a different character.
#
# Only the indices of +seq1+ are compared. The original inclusive range also
# visited index +seq1.size+, which reported a spurious difference whenever
# +seq2+ was longer than +seq1+.
def self.snps_between(seq1, seq2)
  (0...seq1.size).count { |i| seq1[i] != seq2[i] }
end
334
+ end
335
+
336
# Read-only record describing one BFR line. Only the readers are declared
# here; nothing in this class assigns the values.
class BFRLine
  attr_reader :original_base, :variation_base, :position
  attr_reader :bulk_1_ratio, :bulk_2_ratio, :bfr
end
341
+
342
+ class BFRRegion < Bio::DB::Fasta::Region
343
+ BASES = [:A, :C, :G, :T]
344
+ attr_reader :parental_1_sequence, :parental_2_sequence, :bulk_1_sequence, :bulk_2_sequence, :snp_count
345
+ attr_reader :ratios_bulk_1, :ratios_bulk_2, :avg_cov_bulk_1, :avg_cov_bulk_2, :coverages_1, :coverages_2, :bases_bulk_1, :bases_bulk_2
346
+
347
# Builds the region described by opts[:region] and collects everything the
# BFR report needs from the four alignments held by opts[:container].
#
# opts[:region]::    region string handed to Region.parse_region.
# opts[:container]:: object exposing parental_1_sam, parental_2_sam,
#                    bulk_1_sam and bulk_2_sam.
# :min_cov (20) and :max_snp_1kbp (5) are filled in when absent; the merged
# options are passed unchanged to every alignment query.
def initialize(opts)
  opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge!(opts)

  parsed = Bio::DB::Fasta::Region.parse_region(opts[:region])
  self.entry = parsed.entry
  self.start = parsed.start
  self.end = parsed.end

  @container = opts[:container]
  first_parent = @container.parental_1_sam
  second_parent = @container.parental_2_sam
  first_bulk = @container.bulk_1_sam
  second_bulk = @container.bulk_2_sam

  # Consensus sequences: parentals first, then bulks.
  @parental_1_sequence = first_parent.consensus_with_ambiguities(opts)
  @parental_2_sequence = second_parent.consensus_with_ambiguities(opts)
  @bulk_1_sequence = first_bulk.consensus_with_ambiguities(opts)
  @bulk_2_sequence = second_bulk.consensus_with_ambiguities(opts)

  @snp_count = Container.snps_between(@parental_1_sequence, @parental_2_sequence)

  # Per-position statistics of the two bulks.
  @ratios_bulk_1 = first_bulk.base_ratios_in_region(opts)
  @ratios_bulk_2 = second_bulk.base_ratios_in_region(opts)
  @bases_bulk_1 = first_bulk.bases_in_region(opts)
  @bases_bulk_2 = second_bulk.bases_in_region(opts)
  @avg_cov_bulk_1 = first_bulk.average_coverage_from_pileup(opts)
  @avg_cov_bulk_2 = second_bulk.average_coverage_from_pileup(opts)
  @coverages_1 = first_bulk.coverages_from_pileup(opts)
  @coverages_2 = second_bulk.coverages_from_pileup(opts)
end
382
+
383
# Returns the BFR report lines of this region as one newline-terminated string.
#
# opts[:min_cov]:: a position is reported only when both bulks have a coverage
#                  strictly above this value (default 20).
#
# For every such position and every base in BASES, a line is produced for each
# parental in which the base is valid (per Bio::NucleicAcid.is_valid) while it
# is not valid in the other parental.
#
# Unlike the original, nothing is written to stdout (the +p+/+puts+ calls were
# debugging output), and each line ends in exactly one newline: the old
# +puts l << "\n"+ mutated the line before it was appended again.
def get_bfr_lines(opts = {})
  opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge(opts)
  lines = String.new

  0.upto(size - 1) do |i|
    next unless coverages_1[i] > opts[:min_cov] && coverages_2[i] > opts[:min_cov]

    first_code = parental_1_sequence[i]
    second_code = parental_2_sequence[i]
    BASES.each do |base|
      in_first = Bio::NucleicAcid.is_valid(first_code, base.to_s)
      in_second = Bio::NucleicAcid.is_valid(second_code, base.to_s)

      informative = []
      informative << :first if in_first && !in_second
      informative << :second if in_second && !in_first

      informative.each do |parent|
        lines << get_bfr_line(i, base, parent) << "\n"
      end
    end
  end
  lines
end
420
+
421
+
422
# Returns the number of SNPs between the parentals, scaled to 1000 bases.
def snp_1kbp
  region_length = size.to_f
  @snp_count.to_f * 1000 / region_length
end
425
+
426
# Returns the bulk frequency ratios of the region, computed once and cached:
#
#   bfrs[:first][base][i]  = ratio in bulk 1 / ratio in bulk 2
#   bfrs[:second][base][i] = ratio in bulk 2 / ratio in bulk 1
#
# When both ratios are zero both values are 0; when only the divisor is zero
# the value is Float::INFINITY and its counterpart is 0.
def bfrs
  return @BFRs if @BFRs

  @BFRs = {}
  [:first, :second].each do |parent|
    @BFRs[parent] = Hash[BASES.map { |base| [base, []] }]
  end

  0.upto(size - 1) do |index|
    bulk_1 = @ratios_bulk_1[index]
    bulk_2 = @ratios_bulk_2[index]
    BASES.each do |base|
      one = bulk_1[base]
      two = bulk_2[base]
      forward, backward =
        if one == 0 && two == 0
          [0, 0]
        elsif one == 0
          [0, Float::INFINITY]
        elsif two == 0
          [Float::INFINITY, 0]
        else
          [one / two, two / one]
        end
      @BFRs[:first][base] << forward
      @BFRs[:second][base] << backward
    end
  end
  @BFRs
end
463
+
464
# Builds one tab-separated report line for +base+ at the zero-based +position+.
#
# reference:: :first or :second - the parental for which the base is
#             informative; the reported reference base is taken from the
#             other parental's sequence.
#
# Columns: the four sample names, entry, reference base, start + position + 1,
# base, BFR, the two bulk coverages, the informative parental's name, the two
# bulk ratios and the two bulk base counts. BFR and ratios are rounded to two
# decimals.
#
# Raises BFRToolsException for any other +reference+.
def get_bfr_line(position, base, reference)
  case reference
  when :first
    informative = @container.parental_1_name
    ref_base = @parental_2_sequence[position]
  when :second
    informative = @container.parental_2_name
    ref_base = @parental_1_sequence[position]
  else
    raise BFRToolsException.new("The reference for the line should be :first or :second, but was " + reference.to_s)
  end

  relative_position = start + position + 1
  bfr = bfrs[reference][base][position]

  columns = [
    @container.parental_1_name,
    @container.parental_2_name,
    @container.bulk_1_name,
    @container.bulk_2_name,
    entry,
    ref_base,
    relative_position.to_s,
    base.to_s,
    bfr.round(2).to_s,
    @coverages_1[position].to_s,
    @coverages_2[position].to_s,
    informative,
    @ratios_bulk_1[position][base].round(2).to_s,
    @ratios_bulk_2[position][base].round(2).to_s,
    @bases_bulk_1[position][base.to_sym].to_s,
    @bases_bulk_2[position][base.to_sym].to_s
  ]

  line = String.new
  columns.each_with_index do |column, index|
    line << "\t" unless index.zero?
    line << column
  end
  line
end
497
+
498
# Returns the four consensus sequences (parental 1, parental 2, bulk 1,
# bulk 2) as FASTA records. Each header is ">" + region + ":" + sample name.
def to_multi_fasta
  records = [
    [@container.parental_1_name, @parental_1_sequence],
    [@container.parental_2_name, @parental_2_sequence],
    [@container.bulk_1_name, @bulk_1_sequence],
    [@container.bulk_2_name, @bulk_2_sequence]
  ]
  records.inject(String.new) do |fasta, (name, sequence)|
    fasta << ">" << to_s << ":" << name << "\n" << sequence << "\n"
  end
end
506
+
507
# Serialises the region as a JSON object.
#
# opts:: accepted for compatibility with callers (and JSON generators) that
#        pass an argument; it is not used.
#
# The member names are the ones the original emitted ("Parental_1",
# "Parental 2", "Bulk 1", "Bulk 2", "Positions", the consensus and coverage
# members, and "Bases_Bulk_<n><base>", "Ratios_Bulk_<n><base>", "BFR<base>"
# for every base in BASES). The original concatenated those members without
# separators and joined some arrays as bare text, so its output could not be
# parsed; the document is now built as a Hash and generated with the JSON
# library.
#
# Non-finite floats (Infinity/NaN, which +bfrs+ can produce) are not
# representable in JSON and are written as their string form.
def to_json(opts = nil)
  printable = lambda do |values|
    values.map { |value| value.is_a?(Float) && !value.finite? ? value.to_s : value }
  end

  doc = {}
  doc["Parental_1"] = @container.parental_1_name
  doc["Parental 2"] = @container.parental_2_name
  doc["Bulk 1"] = @container.bulk_1_name
  doc["Bulk 2"] = @container.bulk_2_name
  doc["Positions"] = (1..size).to_a
  doc["Parental_1_consensus"] = @parental_1_sequence.split(//)
  doc["Parental_2_consensus"] = @parental_2_sequence.split(//)

  doc["Bulk_1_consensus"] = @bulk_1_sequence.split(//)
  doc["Bulk_1_coverage"] = @coverages_1
  BASES.each do |base|
    doc["Bases_Bulk_1#{base}"] = base_count_for_base(base, @bases_bulk_1)
    doc["Ratios_Bulk_1#{base}"] = printable.call(base_ratios_for_base(base, @ratios_bulk_1))
  end

  doc["Bulk_2_consensus"] = @bulk_2_sequence.split(//)
  doc["Bulk_2_coverage"] = @coverages_2
  BASES.each do |base|
    doc["Bases_Bulk_2#{base}"] = base_count_for_base(base, @bases_bulk_2)
    doc["Ratios_Bulk_2#{base}"] = printable.call(base_ratios_for_base(base, @ratios_bulk_2))
  end

  BASES.each do |base|
    doc["BFR#{base}"] = printable.call(bfrs[:first][base])
  end

  JSON.generate(doc)
end
543
+
544
# Returns the region as CSV text, one labelled row per line: sample names,
# positions (1..size), the parental and bulk consensus sequences split per
# base, bulk coverages, per-base counts and ratios for each bulk, and the
# BFRs relative to the first parental.
def to_csv
  out = String.new
  row = lambda { |label, cells| out << label << cells << "\n" }
  per_base = lambda { |sequence| sequence.split(//).join(",") }

  row.call("Parental 1,", @container.parental_1_name)
  row.call("Parental 2,", @container.parental_2_name)
  row.call("Bulk 1, ", @container.bulk_1_name)
  row.call("Bulk 2,", @container.bulk_2_name)
  row.call("Positions,", (1..size).to_a.join(","))
  row.call("Parental 1 consensus,", per_base.call(@parental_1_sequence))
  row.call("Parental 2 consensus,", per_base.call(@parental_2_sequence))

  row.call("Bulk 1 consensus,", per_base.call(@bulk_1_sequence))
  row.call("Bulk 1 coverage,", @coverages_1.join(","))
  BASES.each do |base|
    row.call("Bases Bulk 1" + base.to_s + ",", base_count_for_base(base, @bases_bulk_1).join(","))
    row.call("Ratios Bulk 1 " + base.to_s + ",", base_ratios_for_base(base, @ratios_bulk_1).join(","))
  end

  row.call("Bulk 2 consensus,", per_base.call(@bulk_2_sequence))
  row.call("Bulk 2 coverage,", @coverages_2.join(","))
  BASES.each do |base|
    row.call("Bases Bulk 2 " + base.to_s + ",", base_count_for_base(base, @bases_bulk_2).join(","))
    row.call("Ratios Bulk 2 " + base.to_s + ",", base_ratios_for_base(base, @ratios_bulk_2).join(","))
  end

  BASES.each do |base|
    row.call("BFRs" + base.to_s + ",", bfrs[:first][base].join(","))
  end
  out
end
573
+
574
# Extracts the ratio stored for +base+ at every position of +ratios_matrix+
# (an array with one base-to-ratio hash per position).
def base_ratios_for_base(base, ratios_matrix)
  ratios_matrix.map { |position_ratios| position_ratios[base] }
end
581
+
582
# Extracts the count stored for +base+ at every position of +base_matrix+
# (an array with one base-to-count hash per position).
def base_count_for_base(base, base_matrix)
  base_matrix.map { |position_counts| position_counts[base] }
end
589
+
590
+ end
591
+
592
+
593
+ class BFRContainer < Container
594
+
595
# Resets every running total used by +process_region+ and +print_stats+:
# the integer counters to 0 and the accumulated averages to 0.0.
#
# NOTE(review): the counter is stored as @proccesed_regions, while the parent
# class declares attr_reader :processed_regions - the spellings differ, so
# that reader does not see this value.
def init_counters
  @total_avg_coverage_bulk_1 = @total_avg_coverage_bulk_2 = @total_snp_1kbp = 0.0
  @putative_snps = @proccesed_regions = @not_enogh_coverage = @no_snps = @too_many_snps = 0
end
606
# Writes the column header of the statistics table to
# opts[:output_file_stats], or to $stderr when that option is absent.
def print_header(opts = {})
  header = "#bulk_1\tbulk_2\tProcessed_regions\tputative_snps\tno_snps\ttoo_many_snps\tno_enough_coverage\tavg_cov_bulk_1\tavg_cov_bulk_2\tavg_snp_1kbp\n"
  sink = opts[:output_file_stats] || $stderr
  sink.print header
end
610
+
611
# Writes one tab-separated statistics row to opts[:output_file_stats], or to
# $stderr when that option is absent. The columns follow +print_header+: bulk
# names, the counters, then the accumulated coverages and SNP density divided
# by the number of processed regions.
def print_stats(opts = {})
  sink = opts[:output_file_stats] || $stderr
  regions = @proccesed_regions
  fields = [
    @bulk_1_name, @bulk_2_name,
    regions, @putative_snps, @no_snps, @too_many_snps, @not_enogh_coverage,
    @total_avg_coverage_bulk_1 / regions,
    @total_avg_coverage_bulk_2 / regions,
    @total_snp_1kbp / regions
  ]
  sink.print fields.join("\t"), "\n"
end
618
+
619
# Builds a BFRRegion for +opts+ with this container attached. The container
# is stored in the given hash under :container before the region is created.
def get_region(opts = {})
  opts[:container] = self
  BFRRegion.new(opts)
end
623
+
624
# Processes one region: builds its BFRRegion, updates the running statistics
# and, when the region qualifies, writes its BFR lines.
#
# opts[:region]::       region to process (passed on to BFRRegion.new).
# opts[:output_file]::  IO to write the lines to. Without it nothing is
#                       printed; only the statistics are updated.
# opts[:min_cov]::      minimum coverage (default 20), applied both to the
#                       bulks' average coverage and to each position.
# opts[:max_snp_1kbp]:: regions with a higher SNP density are skipped
#                       (default 10).
#
# Lines are suppressed when the parentals show no SNP, when the SNP density is
# too high, or when either bulk's average coverage is below :min_cov.
#
# Returns the BFRRegion. The pileup caches of the four alignments are cleared
# for the region before returning.
#
# Fixes: the line for position +i+ is now requested with index +i+ (the
# original asked for +i+1+, reporting the neighbouring base and running past
# the end of the region at the last position); the counters are initialised
# on first use if +init_counters+ has not been called.
def process_region(opts = {})
  opts = { :min_cov => 20, :max_snp_1kbp => 10 }.merge!(opts)
  init_counters if @proccesed_regions.nil?

  @proccesed_regions += 1
  output = opts[:output_file] ? opts[:output_file] : $stdout
  print_output = opts[:output_file] ? true : false
  opts[:container] = self

  region = BFRRegion.new(opts)

  @total_snp_1kbp += region.snp_1kbp
  if region.snp_count == 0
    @no_snps += 1
    print_output = false
  end

  if region.snp_1kbp > opts[:max_snp_1kbp]
    @too_many_snps += 1
    print_output = false
  end

  @total_avg_coverage_bulk_2 += region.avg_cov_bulk_2
  @total_avg_coverage_bulk_1 += region.avg_cov_bulk_1

  if region.avg_cov_bulk_2 < opts[:min_cov] || region.avg_cov_bulk_1 < opts[:min_cov]
    @not_enogh_coverage += 1
    print_output = false
  end

  if print_output
    0.upto(region.size - 1) do |i|
      next unless region.coverages_1[i] > opts[:min_cov] && region.coverages_2[i] > opts[:min_cov]

      BASES.each do |base|
        in_first = Bio::NucleicAcid.is_valid(region.parental_1_sequence[i], base.to_s)
        in_second = Bio::NucleicAcid.is_valid(region.parental_2_sequence[i], base.to_s)

        # A base is informative for a parental when only that parental has it.
        informative = []
        informative << :first if in_first && !in_second
        informative << :second if in_second && !in_first

        informative.each do |parent|
          output.print region.get_bfr_line(i, base, parent), "\n"
        end
      end
    end
  end

  [@parental_1_sam, @parental_2_sam, @bulk_2_sam, @bulk_1_sam].each do |sam|
    sam.mpileup_clear_cache region
  end
  region
end
693
+
694
+ end
695
+
696
+
697
+ end
698
+