bio-polyploid-tools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +16 -0
  3. data/Gemfile.lock +67 -0
  4. data/README +21 -0
  5. data/Rakefile +61 -0
  6. data/VERSION +1 -0
  7. data/bin/bfr.rb +133 -0
  8. data/bin/count_variations.rb +36 -0
  9. data/bin/filter_blat_by_target_coverage.rb +15 -0
  10. data/bin/find_best_blat_hit.rb +32 -0
  11. data/bin/hexaploid_primers.rb +168 -0
  12. data/bin/homokaryot_primers.rb +155 -0
  13. data/bin/map_markers_to_contigs.rb +66 -0
  14. data/bin/markers_in_region.rb +42 -0
  15. data/bin/polymarker.rb +219 -0
  16. data/bin/snps_between_bams.rb +106 -0
  17. data/bio-polyploid-tools.gemspec +139 -0
  18. data/conf/defaults.rb +1 -0
  19. data/conf/primer3_config/dangle.dh +128 -0
  20. data/conf/primer3_config/dangle.ds +128 -0
  21. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  22. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  23. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  24. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  25. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  26. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  27. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  28. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  29. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  30. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  31. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  32. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  33. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  34. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  35. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  36. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  37. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  38. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  39. data/conf/primer3_config/loops.dh +30 -0
  40. data/conf/primer3_config/loops.ds +30 -0
  41. data/conf/primer3_config/stack.dh +256 -0
  42. data/conf/primer3_config/stack.ds +256 -0
  43. data/conf/primer3_config/stackmm.dh +256 -0
  44. data/conf/primer3_config/stackmm.ds +256 -0
  45. data/conf/primer3_config/tetraloop.dh +77 -0
  46. data/conf/primer3_config/tetraloop.ds +77 -0
  47. data/conf/primer3_config/triloop.dh +16 -0
  48. data/conf/primer3_config/triloop.ds +16 -0
  49. data/conf/primer3_config/tstack.dh +256 -0
  50. data/conf/primer3_config/tstack2.dh +256 -0
  51. data/conf/primer3_config/tstack2.ds +256 -0
  52. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  53. data/lib/bio/BFRTools.rb +698 -0
  54. data/lib/bio/BIOExtensions.rb +186 -0
  55. data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
  56. data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
  57. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  58. data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
  59. data/lib/bio/PolyploidTools/SNP.rb +681 -0
  60. data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
  61. data/lib/bio/SAMToolsExtensions.rb +284 -0
  62. data/lib/bio/db/exonerate.rb +272 -0
  63. data/lib/bio/db/fastadb.rb +164 -0
  64. data/lib/bio/db/primer3.rb +673 -0
  65. data/lib/bioruby-polyploid-tools.rb +25 -0
  66. data/test/data/BS00068396_51.fa +2 -0
  67. data/test/data/BS00068396_51_contigs.aln +1412 -0
  68. data/test/data/BS00068396_51_contigs.dnd +7 -0
  69. data/test/data/BS00068396_51_contigs.fa +8 -0
  70. data/test/data/BS00068396_51_exonerate.tab +6 -0
  71. data/test/data/BS00068396_51_genes.txt +14 -0
  72. data/test/data/LIB1716.bam +0 -0
  73. data/test/data/LIB1716.bam.bai +0 -0
  74. data/test/data/LIB1719.bam +0 -0
  75. data/test/data/LIB1719.bam.bai +0 -0
  76. data/test/data/LIB1721.bam +0 -0
  77. data/test/data/LIB1721.bam.bai +0 -0
  78. data/test/data/LIB1722.bam +0 -0
  79. data/test/data/LIB1722.bam.bai +0 -0
  80. data/test/data/S22380157.fa +16 -0
  81. data/test/data/S22380157.fa.fai +1 -0
  82. data/test/data/Test3Aspecific.csv +1 -0
  83. data/test/data/Test3Aspecific_contigs.fa +6 -0
  84. data/test/data/patological_cases5D.csv +1 -0
  85. data/test/data/short_primer_design_test.csv +10 -0
  86. data/test/data/test_primer3_error.csv +4 -0
  87. data/test/data/test_primer3_error_contigs.fa +10 -0
  88. data/test/test_bfr.rb +51 -0
  89. data/test/test_exon_container.rb +17 -0
  90. data/test/test_exonearate.rb +53 -0
  91. data/test/test_snp_parsing.rb +40 -0
  92. metadata +201 -0
@@ -0,0 +1,698 @@
1
+ require 'rubygems'
2
+ #require 'extensions/all'
3
+ #require 'bio-samtools'
4
+ #require 'bio/db/pileup'
5
+ #require 'bio/db/vcf'
6
+ require 'pathname'
7
+ #require_relative 'BIOExtensions.rb'
8
+ require_relative 'db/fastadb.rb'
9
+
10
+ require 'bio'
11
+ require "set"
12
+ require 'systemu'
13
+ require 'json'
14
+ #require 'strmask'
15
+
16
+ =begin
17
+
18
+ Extends the methods to be able to calculate the BFR and a consensus from the pileup
19
+
20
+ =end
21
+
22
+ class Bio::DB::Pileup
23
+
24
+ #attr_accessor :minumum_ratio_for_iup_consensus
25
+ #@minumum_ratio_for_iup_consensus = 0.20
26
+
27
+ #Returns a hash with the count of bases
28
+
29
# Returns the hash of base counts for this pileup position.
#
# The hash returned by +non_refs+ is extended in place with the reference
# base (upper-cased, as a symbol) mapped to +ref_count+. The result is
# memoized in @bases.
def bases
  unless @bases
    @bases = non_refs
    @bases[ref_base.upcase.to_sym] = ref_count
  end
  @bases
end
36
+
37
# Returns the sum of all base counts at this position.
#
# Goes through the +bases+ accessor instead of reading @bases directly, so
# the method also works when +bases+ has not been called yet (previously
# @bases was nil in that case and the call failed).
def base_coverage
  bases.values.inject(0) { |total, count| total + count }
end
44
+
45
# Returns a hash mapping each base to its share of +base_coverage+ (Float).
# The result is memoized in @base_ratios.
def base_ratios
  return @base_ratios if @base_ratios

  counts = bases
  depth = base_coverage.to_f
  @base_ratios = counts.each_with_object({}) do |(base, count), ratios|
    ratios[base] = count.to_f / depth
  end
end
54
+
55
+ # Returns the consensus for this pileup position as an IUPAC ambiguity code covering every base whose share of the coverage exceeds the given minimum ratio; if no base passes the threshold, the lower-cased reference base is returned.
56
# Returns the consensus call for this pileup position.
#
# Every base whose count divided by +coverage+ exceeds
# +minumum_ratio_for_iup_consensus+ is collected and the collection is turned
# into a single code with Bio::NucleicAcid.to_IUAPC. When no base passes the
# threshold, the lower-cased reference base is returned.
#
# The ratio is computed in floating point so an Integer coverage cannot
# truncate it to zero.
#
# NOTE: the result is memoized on first call; later calls return the cached
# value even if a different threshold is passed.
def consensus_iuap(minumum_ratio_for_iup_consensus)
  return @consensus_iuap unless @consensus_iuap.nil?

  depth = coverage.to_f
  supported = String.new
  bases.each do |base, count|
    supported << base[0].to_s if count.to_f / depth > minumum_ratio_for_iup_consensus
  end

  @consensus_iuap = supported.empty? ? ref_base.downcase : Bio::NucleicAcid.to_IUAPC(supported)
end
71
+ end
72
+
73
+ class Bio::DB::Fasta::Region
74
+ attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
75
+
76
+ #TODO: Debug, as it hasnt been tested in the actual code.
77
# Returns, for the given base, the array of its ratios at every position of
# this region, read from @base_ratios. Results are memoized per base in
# @all_ratios.
#
# Uses the region's own +size+; the previous code referenced an undefined
# local +region+.
def base_ratios_for_base(base)
  @all_ratios ||= {}
  @all_ratios[base] ||= (0...size).map { |i| @base_ratios[i][base] }
end
88
+
89
+ end
90
+
91
# Error type used by the Bio::DB::Sam extensions in this file
# (raised by mpileup_cached when no region is given).
class Bio::DB::Sam::SAMException < RuntimeError; end
94
+
95
+ class Bio::DB::Sam
96
+
97
+
98
attr_writer :minumum_ratio_for_iup_consensus
attr_reader :cached_regions
#attr_accessor :pileup_cache

# Minimum share of the coverage a base needs to be part of the IUPAC
# consensus. Defaults to 0.20 until a value is assigned.
#
# The previous class-body assignment set an instance variable on the class
# object, so instances always read nil here.
def minumum_ratio_for_iup_consensus
  @minumum_ratio_for_iup_consensus.nil? ? 0.20 : @minumum_ratio_for_iup_consensus
end
102
+
103
+
104
+ #Same as mpileup, but it caches the pileup, so if you run several operations on the same set of regions
105
+ #the cached piles are reused and the mpileup command is not executed several times
106
+ #Whenever you finish using a region, call mpileup_clear_cache to free the cache
107
+ #The argument Region is required, as it will be the key for the underlying hash.
108
+ #We assume that the options are constant. If they are not, the cache mechanism may not be consistent.
109
+ #
110
+ #TODO: It may be good to load partially the pileup
111
# Runs mpileup for a region and yields every pile, keeping the piles in
# memory so later calls for the same region are served from the cache.
#
# opts - options forwarded to +mpileup+. One of opts[:r] / opts[:region] is
#        required; its +to_s+ is the cache key. The hash is modified:
#        :r and :region are replaced by the single-quoted region string and
#        :A is set to true.
#
# Yields each pile. Raises SAMException when no region is provided.
#
# The cached branch now reads the piles from the cached Region object; the
# previous code called +pileup+ on the cache hash itself and failed.
def mpileup_cached(opts = {})
  raise SAMException, "A region must be provided" unless opts[:r] || opts[:region]

  @pileup_cache ||= {}
  @cached_regions ||= {}

  region = opts[:r] || opts[:region]
  key = region.to_s
  opts[:r] = "'#{key}'"
  opts[:region] = "'#{key}'"
  opts[:A] = true

  cached = @cached_regions[key]
  if cached
    cached.pileup.each { |pile| yield pile }
  else
    cached = Bio::DB::Fasta::Region.parse_region(key)
    piles = []
    cached.pileup = piles
    @cached_regions[key] = cached
    mpileup(opts) do |pile|
      piles << pile
      yield pile
    end
  end
end
139
+
140
+ #Clears the pileup cache. If a region is passed as argument, just the specified region is removed
141
+ #If no region is passed, the hash is emptied
142
# Clears the region cache.
#
# region - optional; when given, only the entry stored under region.to_s is
#          removed. When omitted (or nil), the whole cache is emptied.
#
# Does nothing if the cache has never been created.
def mpileup_clear_cache(region = nil)
  return unless @cached_regions

  if region
    # Remove the key entirely instead of leaving a nil placeholder behind.
    @cached_regions.delete(region.to_s)
  else
    @cached_regions.clear
  end
end
150
+
151
+ #Gets the coverage of a region from a pileup.
152
# Returns the average coverage stored for opts[:region], computing the
# region statistics first if they are not cached yet.
# A Bio::DB::Fasta::Region passed as opts[:region] is replaced by its string form.
def average_coverage_from_pileup(opts = {})
  wanted = opts[:region]
  wanted = opts[:region] = wanted.to_s if wanted.class == Bio::DB::Fasta::Region
  stats_missing = @cached_regions.nil? || @cached_regions[wanted].nil?
  calculate_stats_from_pile(opts) if stats_missing
  @cached_regions[wanted].average_coverage
end
158
+
159
+ #
160
# Returns the per-position coverages stored for opts[:region], computing the
# region statistics first if they are not cached yet.
# A Bio::DB::Fasta::Region passed as opts[:region] is replaced by its string form.
def coverages_from_pileup(opts = {})
  wanted = opts[:region]
  wanted = opts[:region] = wanted.to_s if wanted.class == Bio::DB::Fasta::Region
  stats_missing = @cached_regions.nil? || @cached_regions[wanted].nil?
  calculate_stats_from_pile(opts) if stats_missing
  @cached_regions[wanted].coverages
end
166
+
167
# Returns the consensus sequence stored for opts[:region], computing the
# region statistics first if they are not cached yet.
# A Bio::DB::Fasta::Region passed as opts[:region] is replaced by its string form.
def consensus_with_ambiguities(opts = {})
  wanted = opts[:region]
  wanted = opts[:region] = wanted.to_s if wanted.class == Bio::DB::Fasta::Region
  stats_missing = @cached_regions.nil? || @cached_regions[wanted].nil?
  calculate_stats_from_pile(opts) if stats_missing
  @cached_regions[wanted].consensus
end
174
+
175
# Computes per-position statistics for a region from its pileup and stores
# them on the cached Region object.
#
# opts[:region]  - a Bio::DB::Fasta::Region or something whose to_s can be
#                  parsed into one; opts[:region] is replaced by the Region.
# opts[:min_cov] - a position is only evaluated when its coverage is
#                  strictly greater than this value (default 20).
#
# Positions that do not pass min_cov keep the lower-cased reference base in
# the consensus, BASE_COUNT_ZERO for ratios/bases and 0 for coverage.
# The average coverage sums the coverage of every pile (including those
# below min_cov) and divides by the region size.
#
# The IUPAC threshold now honours +minumum_ratio_for_iup_consensus+ when it
# has been set on this object, falling back to 0.20 otherwise (it used to be
# hard-coded to 0.20).
#
# Returns the cached Region carrying coverages, base_ratios, consensus,
# average_coverage and bases.
def calculate_stats_from_pile(opts = {})
  min_cov = opts[:min_cov] || 20
  min_ratio = minumum_ratio_for_iup_consensus || 0.20

  unless opts[:region].class == Bio::DB::Fasta::Region
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]
  reference = fetch_reference(region.entry, region.start, region.end).downcase

  base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
  bases = Array.new(region.size, BASE_COUNT_ZERO)
  coverages = Array.new(region.size, 0)
  total_cov = 0

  mpileup_cached(:region => region.to_s) do |pile|
    if pile.coverage > min_cov
      offset = pile.pos - region.start
      base_ratios[offset] = pile.base_ratios
      reference[offset] = pile.consensus_iuap(min_ratio)
      coverages[offset] = pile.coverage.to_i
      bases[offset] = pile.bases
    end
    total_cov += pile.coverage
  end

  # mpileup_cached registered the Region used as cache entry under region.to_s.
  cached = @cached_regions[region.to_s]
  cached.coverages = coverages
  cached.base_ratios = base_ratios
  cached.consensus = reference
  cached.average_coverage = total_cov.to_f / cached.size.to_f
  cached.bases = bases
  cached
end
209
+
210
+
211
+
212
# Placeholder entry for positions without usable coverage. The same object
# fills every slot of the per-position arrays, so it is frozen to prevent an
# accidental in-place change from affecting all positions at once.
BASE_COUNT_ZERO = { :A => 0, :C => 0, :G => 0, :T => 0 }.freeze
213
+
214
+ #Gets an array with the proportions of the bases in the region. If a position does not exceed the minimum coverage, its entry is BASE_COUNT_ZERO.
215
# Returns the per-position base ratios stored for opts[:region], computing
# the region statistics first if they are not cached yet.
# A Bio::DB::Fasta::Region passed as opts[:region] is replaced by its string form.
def base_ratios_in_region(opts = {})
  wanted = opts[:region]
  wanted = opts[:region] = wanted.to_s if wanted.class == Bio::DB::Fasta::Region
  stats_missing = @cached_regions.nil? || @cached_regions[wanted].nil?
  calculate_stats_from_pile(opts) if stats_missing
  @cached_regions[wanted].base_ratios
end
221
+
222
+ #Gets an array with the base counts in the region. If a position does not exceed the minimum coverage, its entry is BASE_COUNT_ZERO.
223
# Returns the per-position base counts stored for opts[:region], computing
# the region statistics first if they are not cached yet.
# A Bio::DB::Fasta::Region passed as opts[:region] is replaced by its string form.
def bases_in_region(opts = {})
  wanted = opts[:region]
  wanted = opts[:region] = wanted.to_s if wanted.class == Bio::DB::Fasta::Region
  stats_missing = @cached_regions.nil? || @cached_regions[wanted].nil?
  calculate_stats_from_pile(opts) if stats_missing
  @cached_regions[wanted].bases
end
229
+
230
+
231
+
232
# Prints the alignments of a region to $stdout, four lines per alignment:
# "@qname", the sequence, "+qname" and the quality string.
#
# opts[:region] - a Bio::DB::Fasta::Region or something whose to_s can be
#                 parsed into one; opts[:region] is replaced by the Region.
#
# The coordinates handed to fetch_with_function now come from the parsed
# region; the previous code referenced undefined locals (chromosome, qstart,
# len) and could not run.
# NOTE(review): opts[:fastq] and opts[:fastq_file] were read but never used
# by the original code; output still always goes to $stdout — confirm whether
# they were meant to select the destination.
def extract_reads(opts = {})
  unless opts[:region].class == Bio::DB::Fasta::Region
    opts[:region] = Bio::DB::Fasta::Region.parse_region(opts[:region].to_s)
  end
  region = opts[:region]
  out = $stdout

  print_fastq = Proc.new do |alignment|
    out.puts "@#{alignment.qname}"
    out.puts "#{alignment.seq}"
    out.puts "+#{alignment.qname}"
    out.puts "#{alignment.qual}"
  end

  fetch_with_function(region.entry, region.start, region.end, print_fastq)
end
250
+
251
+
252
+
253
+ end
254
+
255
+ module Bio::BFRTools
256
+
257
+
258
+
259
# Error raised by the BFR tools (missing or unreadable input paths,
# invalid reference selector).
class BFRToolsException < StandardError
end
260
+
261
+
262
# Holds the reference FASTA and the four alignment sets (two parentals, two
# bulks) used by the BFR calculations.
class Container
  attr_reader :putative_snps, :processed_regions, :total_length,
              :parental_1_sam, :parental_2_sam, :bulk_1_sam, :bulk_2_sam
  attr_reader :parental_1_name, :parental_2_name, :bulk_1_name, :bulk_2_name, :reference_db

  BASES = [:A, :C, :G, :T].freeze

  # Sets the reference file and opens it as a Bio::DB::Fasta::FastaFile.
  def reference(path)
    @reference_db = Bio::DB::Fasta::FastaFile.new(path)
    @reference_path = path
  end

  # Fetches the sequence of +region+ from the reference database.
  def reference_sequence(region)
    @reference_db.fetch_sequence(region)
  end

  # Sets the sorted BAM file of the first parental.
  # opts[:path] - path to the BAM file (required).
  # opts[:name] - optional name; defaults to the file name without ".bam".
  # Raises BFRToolsException if the path is missing or cannot be opened.
  def parental_1(opts)
    @parental_1_name, path = sample_name_and_path(opts, "parental 1")
    @parental_1_sam = open_sam(path)
    @parental_1_path = path
  end

  # Sets the sorted BAM file of the second parental (same options as parental_1).
  def parental_2(opts)
    name, path = sample_name_and_path(opts, "parental 2")
    # NOTE(review): unlike the other setters this one has always assigned
    # @name as well; kept for compatibility, looks like a leftover.
    @parental_2_name = @name = name
    @parental_2_sam = open_sam(path)
    @parental_2_path = path
  end

  # Sets the sorted BAM file of the first bulk (same options as parental_1).
  def bulk_1(opts)
    @bulk_1_name, path = sample_name_and_path(opts, "bulk 1")
    @bulk_1_sam = open_sam(path)
    @bulk_1_path = path
  end

  # Sets the sorted BAM file of the second bulk (same options as parental_1).
  def bulk_2(opts)
    @bulk_2_name, path = sample_name_and_path(opts, "bulk 2")
    @bulk_2_sam = open_sam(path)
    @bulk_2_path = path
  end

  # Counts the positions of seq1 at which seq2 holds a different character.
  # Only the positions 0...seq1.size are compared; the former inclusive range
  # looked one position past the end of seq1 and counted an extra difference
  # whenever seq2 was longer.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |i| seq1[i] != seq2[i] }
  end

  private

  # Validates opts[:path] and returns [name, Pathname] for a sample.
  def sample_name_and_path(opts, label)
    raise BFRToolsException.new("Missing path for #{label}") if opts[:path].nil?

    path = Pathname.new(opts[:path])
    # NOTE(review): a directory passes this check even though a BAM file is
    # expected; behaviour kept as it was — confirm whether that is intended.
    raise BFRToolsException.new("Unable to open #{path}") unless path.readable? || path.directory?

    name = opts[:name] ? opts[:name] : path.basename(".bam").to_s
    [name, path]
  end

  # Opens the BAM at +path+ against the configured reference.
  def open_sam(path)
    Bio::DB::Sam.new({ :fasta => @reference_path, :bam => path.realpath.to_s })
  end
end
335
+
336
# Read-only record of one BFR result line. The class only declares readers;
# nothing in this class assigns the underlying instance variables.
class BFRLine
  attr_reader :original_base, :variation_base, :position
  attr_reader :bulk_1_ratio, :bulk_2_ratio, :bfr
end
341
+
342
+ class BFRRegion < Bio::DB::Fasta::Region
343
+ BASES = [:A, :C, :G, :T]
344
+ attr_reader :parental_1_sequence, :parental_2_sequence, :bulk_1_sequence, :bulk_2_sequence, :snp_count
345
+ attr_reader :ratios_bulk_1, :ratios_bulk_2, :avg_cov_bulk_1, :avg_cov_bulk_2, :coverages_1, :coverages_2, :bases_bulk_1, :bases_bulk_2
346
+
347
# Builds the region and pulls all per-position statistics from the four
# alignment sets of the container.
#
# opts[:region]    - region string, parsed with Bio::DB::Fasta::Region.parse_region.
# opts[:container] - object exposing parental_1_sam, parental_2_sam,
#                    bulk_1_sam and bulk_2_sam.
# Defaults :min_cov => 20 and :max_snp_1kbp => 5 are applied when absent.
# The same opts hash is handed to every query below, in this order.
def initialize(opts)
  opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge(opts)
  parsed = Bio::DB::Fasta::Region.parse_region(opts[:region])
  self.entry = parsed.entry
  self.start = parsed.start
  self.end = parsed.end

  @container = opts[:container]
  first_parent = @container.parental_1_sam
  second_parent = @container.parental_2_sam
  first_bulk = @container.bulk_1_sam
  second_bulk = @container.bulk_2_sam

  @parental_1_sequence = first_parent.consensus_with_ambiguities(opts)
  @parental_2_sequence = second_parent.consensus_with_ambiguities(opts)
  @bulk_1_sequence = first_bulk.consensus_with_ambiguities(opts)
  @bulk_2_sequence = second_bulk.consensus_with_ambiguities(opts)

  @snp_count = Container.snps_between(@parental_1_sequence, @parental_2_sequence)

  @ratios_bulk_1 = first_bulk.base_ratios_in_region(opts)
  @ratios_bulk_2 = second_bulk.base_ratios_in_region(opts)
  @bases_bulk_1 = first_bulk.bases_in_region(opts)
  @bases_bulk_2 = second_bulk.bases_in_region(opts)
  @avg_cov_bulk_1 = first_bulk.average_coverage_from_pileup(opts)
  @avg_cov_bulk_2 = second_bulk.average_coverage_from_pileup(opts)
  @coverages_1 = first_bulk.coverages_from_pileup(opts)
  @coverages_2 = second_bulk.coverages_from_pileup(opts)
end
382
+
383
# Builds the BFR report lines for every informative base of the region.
#
# A position is examined only when both bulk coverages are strictly greater
# than opts[:min_cov] (default 20). For each base, a line is produced for
# :first when the base is valid for the parental 1 consensus but not for
# parental 2, and for :second in the opposite case.
#
# Each line is echoed to stdout (as before) and appended to the returned
# string followed by exactly one newline. The former code appended the
# newline to the line itself before adding another one, so the result
# contained blank lines; the debug dump of the options has been removed.
#
# Returns the concatenated lines as a String.
def get_bfr_lines(opts = {})
  opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge(opts)
  lines = String.new

  (0...size).each do |i|
    next unless coverages_1[i] > opts[:min_cov] && coverages_2[i] > opts[:min_cov]

    BASES.each do |base|
      in_first = Bio::NucleicAcid.is_valid(parental_1_sequence[i], base.to_s)
      in_second = Bio::NucleicAcid.is_valid(parental_2_sequence[i], base.to_s)

      informative = []
      informative << :first if in_first && !in_second
      informative << :second if in_second && !in_first

      informative.each do |parent|
        l = get_bfr_line(i, base, parent)
        puts l
        lines << l << "\n"
      end
    end
  end
  lines
end
420
+
421
+
422
# Number of SNPs between the parentals, scaled to SNPs per 1000 positions
# of this region.
def snp_1kbp
  region_length = size.to_f
  (@snp_count.to_f * 1000) / region_length
end
425
+
426
# Returns the bulk frequency ratios for every base and position, memoized
# in @BFRs.
#
# The result is { :first => { base => [...] }, :second => { base => [...] } }.
# For :first the value is ratio in bulk 1 divided by ratio in bulk 2, for
# :second the inverse. When both ratios are 0 both values are 0; when only
# one ratio is 0 the side dividing by it is Float::INFINITY and the other 0.
def bfrs
  return @BFRs if @BFRs

  @BFRs = [:first, :second].each_with_object({}) do |parent, table|
    table[parent] = BASES.each_with_object({}) { |base, by_base| by_base[base] = [] }
  end

  size.times do |i|
    in_bulk_1 = @ratios_bulk_1[i]
    in_bulk_2 = @ratios_bulk_2[i]
    BASES.each do |base|
      r1 = in_bulk_1[base]
      r2 = in_bulk_2[base]
      first_ratio, second_ratio =
        if r1 == 0 && r2 == 0
          [0, 0]
        elsif r1 == 0
          [0, Float::INFINITY]
        elsif r2 == 0
          [Float::INFINITY, 0]
        else
          [r1 / r2, r2 / r1]
        end
      @BFRs[:first][base] << first_ratio
      @BFRs[:second][base] << second_ratio
    end
  end
  @BFRs
end
463
+
464
# Formats one tab-separated BFR report line.
#
# position  - index into the per-position arrays of this region.
# base      - base symbol the line is about.
# reference - :first or :second; selects which parental is reported as the
#             informative one. The reference base column is taken from the
#             other parental's consensus.
#
# Columns: parental 1 name, parental 2 name, bulk 1 name, bulk 2 name, entry,
# reference base, position, base, BFR (2 decimals), coverage bulk 1,
# coverage bulk 2, informative parental, ratio bulk 1, ratio bulk 2
# (2 decimals each), base count bulk 1, base count bulk 2.
#
# Raises BFRToolsException for any other value of +reference+.
def get_bfr_line(position, base, reference)
  case reference
  when :first
    informative = @container.parental_1_name
    ref_base = @parental_2_sequence[position]
  when :second
    informative = @container.parental_2_name
    ref_base = @parental_1_sequence[position]
  else
    raise BFRToolsException.new("The reference for the line should be :first or :second, but was " + reference.to_s)
  end

  # NOTE(review): reported coordinate is start + index + 1 — confirm this
  # matches the coordinate convention of the region start.
  reported_position = start + position + 1

  columns = [
    @container.parental_1_name, @container.parental_2_name,
    @container.bulk_1_name, @container.bulk_2_name,
    entry, ref_base, reported_position.to_s, base.to_s,
    bfrs[reference][base][position].round(2).to_s,
    @coverages_1[position].to_s, @coverages_2[position].to_s,
    informative,
    @ratios_bulk_1[position][base].round(2).to_s,
    @ratios_bulk_2[position][base].round(2).to_s,
    @bases_bulk_1[position][base.to_sym].to_s,
    @bases_bulk_2[position][base.to_sym].to_s
  ]

  row = String.new
  columns.each_with_index do |column, idx|
    row << "\t" unless idx.zero?
    row << column
  end
  row
end
497
+
498
# Returns the four consensus sequences (parental 1, parental 2, bulk 1,
# bulk 2) as FASTA records. Each header is ">" + region string + ":" + the
# sample name.
def to_multi_fasta
  records = [
    [@container.parental_1_name, @parental_1_sequence],
    [@container.parental_2_name, @parental_2_sequence],
    [@container.bulk_1_name, @bulk_1_sequence],
    [@container.bulk_2_name, @bulk_2_sequence]
  ]

  records.inject(String.new) do |fasta, (sample, sequence)|
    fasta << ">" << to_s << ":" << sample << "\n" << sequence << "\n"
  end
end
506
+
507
# Serializes the region statistics as a JSON object.
#
# opts - ignored; optional so the method can be called without arguments and
#        also through the JSON generator, which passes a state argument.
#
# The previous implementation concatenated the members without separating
# commas and wrote several arrays as bare comma-joined text, so the result
# could not be parsed. The document is now built as a Hash and encoded with
# JSON.generate. Member names are unchanged.
#
# Non-finite numbers (the Float::INFINITY values produced by +bfrs+) cannot
# be represented in JSON and are written as null.
#
# Returns the JSON text as a String.
def to_json(opts = nil)
  column = lambda { |matrix, base| matrix.map { |entry| entry[base] } }
  json_safe = lambda do |values|
    values.map { |v| v.is_a?(Float) && !v.finite? ? nil : v }
  end

  doc = {}
  doc["Parental_1"] = @container.parental_1_name
  doc["Parental 2"] = @container.parental_2_name
  doc["Bulk 1"] = @container.bulk_1_name
  doc["Bulk 2"] = @container.bulk_2_name
  doc["Positions"] = (1..size).to_a
  doc["Parental_1_consensus"] = @parental_1_sequence.split(//)
  doc["Parental_2_consensus"] = @parental_2_sequence.split(//)
  doc["Bulk_1_consensus"] = @bulk_1_sequence.split(//)
  doc["Bulk_1_coverage"] = @coverages_1

  BASES.each do |base|
    doc["Bases_Bulk_1#{base}"] = column.call(@bases_bulk_1, base)
    doc["Ratios_Bulk_1#{base}"] = json_safe.call(column.call(@ratios_bulk_1, base))
  end

  doc["Bulk_2_consensus"] = @bulk_2_sequence.split(//)
  doc["Bulk_2_coverage"] = @coverages_2

  BASES.each do |base|
    doc["Bases_Bulk_2#{base}"] = column.call(@bases_bulk_2, base)
    doc["Ratios_Bulk_2#{base}"] = json_safe.call(column.call(@ratios_bulk_2, base))
  end

  BASES.each do |base|
    doc["BFR#{base}"] = json_safe.call(bfrs[:first][base])
  end

  JSON.generate(doc)
end
543
+
544
# Returns the region statistics as CSV text, one labelled row per line:
# sample names, positions (1..size), consensus sequences, coverages, the
# per-base counts and ratios of both bulks, and the :first BFRs per base.
# Row labels are emitted exactly as before, including their spacing.
def to_csv
  csv = String.new

  [
    ["Parental 1,", @container.parental_1_name],
    ["Parental 2,", @container.parental_2_name],
    ["Bulk 1, ", @container.bulk_1_name],
    ["Bulk 2,", @container.bulk_2_name]
  ].each { |label, sample| csv << label << sample << "\n" }

  add_row = lambda { |label, cells| csv << label << cells.join(",") << "\n" }

  add_row.call("Positions,", (1..size).to_a)
  add_row.call("Parental 1 consensus,", @parental_1_sequence.split(//))
  add_row.call("Parental 2 consensus,", @parental_2_sequence.split(//))
  add_row.call("Bulk 1 consensus,", @bulk_1_sequence.split(//))
  add_row.call("Bulk 1 coverage,", @coverages_1)

  BASES.each do |base|
    add_row.call("Bases Bulk 1#{base},", base_count_for_base(base, @bases_bulk_1))
    add_row.call("Ratios Bulk 1 #{base},", base_ratios_for_base(base, @ratios_bulk_1))
  end

  add_row.call("Bulk 2 consensus,", @bulk_2_sequence.split(//))
  add_row.call("Bulk 2 coverage,", @coverages_2)

  BASES.each do |base|
    add_row.call("Bases Bulk 2 #{base},", base_count_for_base(base, @bases_bulk_2))
    add_row.call("Ratios Bulk 2 #{base},", base_ratios_for_base(base, @ratios_bulk_2))
  end

  BASES.each do |base|
    add_row.call("BFRs#{base},", bfrs[:first][base])
  end

  csv
end
573
+
574
# Extracts, for one base, its ratio at every position of +ratios_matrix+
# (an array of per-position hashes keyed by base).
def base_ratios_for_base(base, ratios_matrix)
  ratios_matrix.map { |position_ratios| position_ratios[base] }
end
581
+
582
# Extracts, for one base, its count at every position of +base_matrix+
# (an array of per-position hashes keyed by base).
def base_count_for_base(base, base_matrix)
  base_matrix.map { |position_counts| position_counts[base] }
end
589
+
590
+ end
591
+
592
+
593
# Container that processes regions one by one, prints the BFR lines of the
# informative positions and keeps running statistics.
class BFRContainer < Container

  # Resets all running counters to zero.
  # The processed-region counter is stored in @processed_regions so that the
  # inherited +processed_regions+ reader reports it (it was previously kept
  # under a misspelled name the reader never saw).
  def init_counters
    @putative_snps = 0
    @processed_regions = 0
    @not_enogh_coverage = 0
    @total_avg_coverage_bulk_1 = 0.0
    @total_avg_coverage_bulk_2 = 0.0
    @total_snp_1kbp = 0.0
    @no_snps = 0
    @too_many_snps = 0
  end

  # Prints the header of the statistics table to opts[:output_file_stats],
  # or to $stderr when it is not given.
  def print_header(opts = {})
    output = opts[:output_file_stats] || $stderr
    output.print "#bulk_1\tbulk_2\tProcessed_regions\tputative_snps\tno_snps\ttoo_many_snps\tno_enough_coverage\tavg_cov_bulk_1\tavg_cov_bulk_2\tavg_snp_1kbp\n"
  end

  # Prints one row of statistics (same columns as print_header) to
  # opts[:output_file_stats], or to $stderr when it is not given.
  # Averages are the accumulated totals divided by the processed regions.
  def print_stats(opts = {})
    output = opts[:output_file_stats] || $stderr
    output.print @bulk_1_name, "\t", @bulk_2_name, "\t"
    output.print @processed_regions, "\t", @putative_snps, "\t", @no_snps, "\t", @too_many_snps, "\t", @not_enogh_coverage, "\t"
    output.print @total_avg_coverage_bulk_1 / @processed_regions, "\t", @total_avg_coverage_bulk_2 / @processed_regions, "\t"
    output.print @total_snp_1kbp / @processed_regions, "\n"
  end

  # Builds a BFRRegion for opts[:region] bound to this container.
  def get_region(opts = {})
    opts[:container] = self
    BFRRegion.new(opts)
  end

  # Processes one region: updates the counters and, when the region
  # qualifies, prints one BFR line per informative base.
  #
  # opts[:region]       - region to process.
  # opts[:min_cov]      - minimum coverage (default 20).
  # opts[:max_snp_1kbp] - maximum SNPs per kbp between parentals (default 10).
  # opts[:output_file]  - IO receiving the lines; nothing is printed when it
  #                       is not given.
  #
  # Lines are suppressed when the region has no SNPs, exceeds max_snp_1kbp,
  # or either bulk's average coverage is below min_cov. The pileup caches of
  # the four alignment sets are cleared for the region afterwards.
  #
  # Returns the BFRRegion.
  def process_region(opts = {})
    opts = { :min_cov => 20, :max_snp_1kbp => 10 }.merge(opts)

    # Allow calling process_region without an explicit init_counters first.
    init_counters if @processed_regions.nil?

    @processed_regions += 1
    output = opts[:output_file] || $stdout
    print_output = opts[:output_file] ? true : false
    opts[:container] = self

    region = BFRRegion.new(opts)

    @total_snp_1kbp += region.snp_1kbp
    if region.snp_count == 0
      @no_snps += 1
      print_output = false
    end

    if region.snp_1kbp > opts[:max_snp_1kbp]
      @too_many_snps += 1
      print_output = false
    end

    @total_avg_coverage_bulk_2 += region.avg_cov_bulk_2
    @total_avg_coverage_bulk_1 += region.avg_cov_bulk_1

    if region.avg_cov_bulk_2 < opts[:min_cov] || region.avg_cov_bulk_1 < opts[:min_cov]
      @not_enogh_coverage += 1
      print_output = false
    end

    print_informative_lines(region, output, opts[:min_cov]) if print_output

    @parental_1_sam.mpileup_clear_cache region
    @parental_2_sam.mpileup_clear_cache region
    @bulk_2_sam.mpileup_clear_cache region
    @bulk_1_sam.mpileup_clear_cache region
    region
  end

  private

  # Prints a BFR line for every base that is valid in exactly one parental
  # consensus, at positions where both bulk coverages exceed min_cov.
  # The line is requested for the same index that was tested; the former code
  # asked for index + 1, reporting the data of the following position and
  # running past the end of the arrays at the last position.
  def print_informative_lines(region, output, min_cov)
    (0...region.size).each do |i|
      next unless region.coverages_1[i] > min_cov && region.coverages_2[i] > min_cov

      BASES.each do |base|
        in_first = Bio::NucleicAcid.is_valid(region.parental_1_sequence[i], base.to_s)
        in_second = Bio::NucleicAcid.is_valid(region.parental_2_sequence[i], base.to_s)

        informative = []
        informative << :first if in_first && !in_second
        informative << :second if in_second && !in_first

        informative.each do |parent|
          output.print region.get_bfr_line(i, base, parent), "\n"
        end
      end
    end
  end

end
695
+
696
+
697
+ end
698
+