bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,256 @@
1
+ inf
2
+ inf
3
+ inf
4
+ -6.3
5
+ inf
6
+ inf
7
+ inf
8
+ -7.0
9
+ inf
10
+ inf
11
+ inf
12
+ -5.8
13
+ -7.8
14
+ -4.0
15
+ -4.4
16
+ inf
17
+ inf
18
+ inf
19
+ -22.5
20
+ inf
21
+ inf
22
+ inf
23
+ -7.1
24
+ inf
25
+ inf
26
+ inf
27
+ -11.4
28
+ inf
29
+ -3.8
30
+ -0.5
31
+ inf
32
+ -1.7
33
+ inf
34
+ -10.7
35
+ inf
36
+ inf
37
+ inf
38
+ -6.0
39
+ inf
40
+ inf
41
+ inf
42
+ -15.5
43
+ inf
44
+ inf
45
+ -5.9
46
+ inf
47
+ -2.1
48
+ -8.7
49
+ -7.8
50
+ inf
51
+ inf
52
+ inf
53
+ -3.8
54
+ inf
55
+ inf
56
+ inf
57
+ -5.9
58
+ inf
59
+ inf
60
+ inf
61
+ inf
62
+ -6.3
63
+ -9.4
64
+ -6.5
65
+ inf
66
+ inf
67
+ inf
68
+ -5.9
69
+ inf
70
+ inf
71
+ inf
72
+ -1.3
73
+ -10.7
74
+ -5.9
75
+ -9.6
76
+ inf
77
+ inf
78
+ inf
79
+ inf
80
+ -1.2
81
+ inf
82
+ inf
83
+ -13.8
84
+ inf
85
+ inf
86
+ inf
87
+ -10.6
88
+ inf
89
+ -6.0
90
+ -5.1
91
+ inf
92
+ -8.0
93
+ inf
94
+ inf
95
+ -7.8
96
+ inf
97
+ inf
98
+ -5.9
99
+ inf
100
+ inf
101
+ inf
102
+ -5.1
103
+ inf
104
+ inf
105
+ -15.5
106
+ inf
107
+ -9.5
108
+ -9.0
109
+ inf
110
+ -10.6
111
+ inf
112
+ inf
113
+ -4.0
114
+ inf
115
+ inf
116
+ inf
117
+ -0.5
118
+ inf
119
+ inf
120
+ inf
121
+ inf
122
+ -10.6
123
+ -18.7
124
+ -16.9
125
+ -6.3
126
+ inf
127
+ inf
128
+ inf
129
+ inf
130
+ inf
131
+ inf
132
+ -4.7
133
+ -22.5
134
+ -13.8
135
+ -11.1
136
+ inf
137
+ inf
138
+ inf
139
+ inf
140
+ -2.7
141
+ inf
142
+ inf
143
+ inf
144
+ -9.8
145
+ inf
146
+ inf
147
+ -11.1
148
+ inf
149
+ -7.1
150
+ -10.6
151
+ inf
152
+ -13.5
153
+ inf
154
+ inf
155
+ -19.2
156
+ inf
157
+ inf
158
+ inf
159
+ -16.1
160
+ inf
161
+ inf
162
+ -9.6
163
+ inf
164
+ inf
165
+ -11.4
166
+ inf
167
+ -19.2
168
+ -15.9
169
+ inf
170
+ -9.5
171
+ inf
172
+ inf
173
+ inf
174
+ -18.7
175
+ inf
176
+ inf
177
+ -4.4
178
+ inf
179
+ inf
180
+ inf
181
+ inf
182
+ -7.8
183
+ -16.1
184
+ -21.2
185
+ -2.1
186
+ inf
187
+ inf
188
+ inf
189
+ -9.4
190
+ inf
191
+ inf
192
+ inf
193
+ -6.3
194
+ -5.9
195
+ -4.7
196
+ inf
197
+ inf
198
+ inf
199
+ inf
200
+ -6.3
201
+ inf
202
+ inf
203
+ inf
204
+ -10.5
205
+ inf
206
+ inf
207
+ inf
208
+ -8.9
209
+ -7.0
210
+ -1.3
211
+ inf
212
+ -6.3
213
+ inf
214
+ inf
215
+ -13.5
216
+ inf
217
+ inf
218
+ inf
219
+ -15.9
220
+ inf
221
+ inf
222
+ inf
223
+ -21.2
224
+ inf
225
+ -5.8
226
+ inf
227
+ -2.7
228
+ -10.5
229
+ inf
230
+ -8.0
231
+ inf
232
+ inf
233
+ inf
234
+ -9.0
235
+ inf
236
+ inf
237
+ inf
238
+ -16.9
239
+ inf
240
+ inf
241
+ inf
242
+ -1.2
243
+ -9.8
244
+ -8.9
245
+ -1.7
246
+ inf
247
+ inf
248
+ inf
249
+ -8.7
250
+ inf
251
+ inf
252
+ inf
253
+ -6.5
254
+ inf
255
+ inf
256
+ inf
@@ -0,0 +1,465 @@
1
+ require 'rubygems'
2
+ #require 'extensions/all'
3
+ #require 'bio-samtools'
4
+ #require 'bio/db/pileup'
5
+ #require 'bio/db/vcf'
6
+ require 'pathname'
7
+ #require_relative 'BIOExtensions.rb'
8
+
9
+
10
+ require 'bio'
11
+ require 'bio-samtools-wrapper'
12
+
13
+ require "set"
14
+ require 'systemu'
15
+ require 'json'
16
+ #require 'strmask'
17
+
18
+
19
+ module Bio::BFRTools
20
+
21
+
22
+
23
# Error type raised by the BFR tools; the setters in this file raise it when a
# sample path is missing or cannot be opened.
class BFRToolsException < StandardError
end
24
+
25
+
26
# Holds the reference FASTA and the four alignment sets (two parentals and two
# bulks) that the BFR calculations in this file read from.
class Container
  attr_reader :putative_snps, :processed_regions, :total_length, :parental_1_sam, :parental_2_sam, :bulk_1_sam, :bulk_2_sam
  attr_reader :parental_1_name, :parental_2_name, :bulk_1_name, :bulk_2_name, :reference_db

  BASES = [:A, :C, :G, :T]

  # Sets the reference file.
  #
  # path - location of the reference FASTA. It is also remembered so that the
  #        sample setters can hand it to Bio::DB::Sam.
  #
  # Returns path.
  def reference(path)
    @reference_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
    @reference_path = path
  end

  # Fetches the sequence of +region+ from the reference set with #reference.
  def reference_sequence(region)
    @reference_db.fetch_sequence(region)
  end

  # Sets the sorted BAM file of the first parental.
  #
  # opts - Hash with:
  #        :path - location of the BAM file (required).
  #        :name - label for the sample (optional). When absent, the file
  #                name without its ".bam" extension is used.
  #
  # Raises BFRToolsException when :path is missing or cannot be opened.
  # Returns the Pathname built from :path.
  def parental_1(opts)
    @parental_1_name, @parental_1_sam, @parental_1_path = open_sample("parental 1", opts)
    @parental_1_path
  end

  # Sets the sorted BAM file of the second parental. Same options, errors and
  # return value as #parental_1.
  def parental_2(opts)
    @parental_2_name, @parental_2_sam, @parental_2_path = open_sample("parental 2", opts)
    @parental_2_path
  end

  # Sets the sorted BAM file of the first bulk. Same options, errors and
  # return value as #parental_1.
  def bulk_1(opts)
    @bulk_1_name, @bulk_1_sam, @bulk_1_path = open_sample("bulk 1", opts)
    @bulk_1_path
  end

  # Sets the sorted BAM file of the second bulk. Same options, errors and
  # return value as #parental_1.
  def bulk_2(opts)
    @bulk_2_name, @bulk_2_sam, @bulk_2_path = open_sample("bulk 2", opts)
    @bulk_2_path
  end

  # Counts the positions of seq1 at which seq2 holds a different value.
  #
  # Only the indices that exist in seq1 are compared. The range is exclusive
  # so the element one past the end of seq1 is never inspected; comparing it
  # used to report a spurious difference whenever seq2 was longer than seq1.
  #
  # Returns the number of differing positions as an Integer.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |i| seq1[i] != seq2[i] }
  end

  private

  # Shared implementation of the four sample setters: validates opts[:path],
  # resolves the sample name and opens the alignment against the reference
  # path stored by #reference.
  #
  # label - text used in the "Missing path for ..." error message.
  # opts  - the options Hash given to the public setter.
  #
  # Returns [name, sam, path].
  def open_sample(label, opts)
    raise BFRToolsException.new("Missing path for #{label}") if opts[:path].nil?
    path = Pathname.new(opts[:path])
    # A directory is let through even when it is not readable.
    raise BFRToolsException.new("Unable to open #{path}") unless path.readable? || path.directory?

    name = opts[:name] ? opts[:name] : path.basename(".bam").to_s
    sam = Bio::DB::Sam.new(fasta: @reference_path, :bam => path.realpath.to_s)
    [name, sam, path]
  end
end
99
+
100
# Read-only record describing one BFR line. The class only declares readers;
# nothing in it assigns the underlying values.
class BFRLine
  attr_reader :original_base, :variation_base, :position
  attr_reader :bulk_1_ratio, :bulk_2_ratio, :bfr
end
105
+
106
# A region of the reference together with the consensus sequences, coverages,
# base counts and base ratios fetched for it from the container's parental and
# bulk alignments. Provides the per-position bulk frequency ratios (BFR) and
# tab/CSV/FASTA style dumps of that data.
class BFRRegion < Bio::DB::Fasta::Region
  BASES = [:A, :C, :G, :T]
  attr_reader :parental_1_sequence, :parental_2_sequence, :bulk_1_sequence, :bulk_2_sequence, :snp_count
  attr_reader :ratios_bulk_1, :ratios_bulk_2, :avg_cov_bulk_1, :avg_cov_bulk_2, :coverages_1, :coverages_2, :bases_bulk_1, :bases_bulk_2

  # Builds the region and loads everything it needs from the container.
  #
  # opts - Hash with:
  #        :region    - region specification, parsed with
  #                     Bio::DB::Fasta::Region.parse_region.
  #        :container - object exposing parental_1_sam, parental_2_sam,
  #                     bulk_1_sam and bulk_2_sam.
  #        :min_cov, :max_snp_1kbp - default to 20 and 5; the whole Hash is
  #                     forwarded to each fetch_region call.
  def initialize(opts)
    # merge! runs on the fresh defaults Hash, so the caller's Hash is untouched.
    opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge!(opts)
    reg = Bio::DB::Fasta::Region.parse_region(opts[:region])
    self.entry = reg.entry
    self.start = reg.start
    self.end = reg.end
    @BFRs = nil
    opts[:region] = reg
    @container = opts[:container]

    parental_1_reg = @container.parental_1_sam.fetch_region(opts)
    parental_2_reg = @container.parental_2_sam.fetch_region(opts)
    bulk_1_reg = @container.bulk_1_sam.fetch_region(opts)
    bulk_2_reg = @container.bulk_2_sam.fetch_region(opts)

    @parental_1_sequence = parental_1_reg.consensus
    @parental_2_sequence = parental_2_reg.consensus

    @bulk_1_sequence = bulk_1_reg.consensus
    @bulk_2_sequence = bulk_2_reg.consensus

    @snp_count = Container.snps_between(@parental_1_sequence, @parental_2_sequence)

    @ratios_bulk_1 = bulk_1_reg.base_ratios
    @ratios_bulk_2 = bulk_2_reg.base_ratios

    @bases_bulk_1 = bulk_1_reg.bases
    @bases_bulk_2 = bulk_2_reg.bases

    @avg_cov_bulk_1 = bulk_1_reg.average_coverage
    @avg_cov_bulk_2 = bulk_2_reg.average_coverage

    @coverages_1 = bulk_1_reg.coverages
    @coverages_2 = bulk_2_reg.coverages
  end

  # Builds the BFR lines of every position whose coverage in both bulks is
  # above opts[:min_cov] (default 20), for each base that is informative for
  # one of the parentals.
  #
  # Returns a String with one line per record, each terminated by a single
  # "\n". The record used to be mutated by the echo statement before being
  # appended, which left a blank line after every record in the result.
  def get_bfr_lines(opts = {})
    opts = { :min_cov => 20, :max_snp_1kbp => 5 }.merge!(opts)
    # NOTE(review): this and the puts below look like leftover debugging
    # output on stdout; kept so that the visible output does not change.
    p opts.inspect
    lines = String.new

    (0..size - 1).each do |i|
      next unless coverages_1[i] > opts[:min_cov] && coverages_2[i] > opts[:min_cov]

      BASES.each do |base|
        informative_parents(i, base).each do |parent|
          record = get_bfr_line(i, base, parent)
          puts record
          lines << record << "\n"
        end
      end
    end
    lines
  end

  # Number of differences between the parental consensus sequences, scaled to
  # a region of 1000 positions.
  def snp_1kbp
    @snp_count.to_f * 1000 / self.size.to_f
  end

  # Per-position ratios between the two bulks for every base.
  #
  # Returns a Hash of the form { :first => { base => [..] }, :second => {..} }
  # where :first holds bulk 1 / bulk 2 and :second holds bulk 2 / bulk 1.
  # When both ratios are 0 both entries are 0; when only one is 0 the side
  # that would divide by it gets Float::INFINITY and the other side gets 0.
  # The table is computed once and cached; it is stored only after it has been
  # filled completely, so a failure half-way does not cache a partial table.
  def bfrs
    return @BFRs if @BFRs

    table = {}
    [:first, :second].each do |reference|
      table[reference] = {}
      BASES.each { |base| table[reference][base] = [] }
    end

    (0..size - 1).each do |i|
      ratios_1 = @ratios_bulk_1[i]
      ratios_2 = @ratios_bulk_2[i]
      BASES.each do |base|
        if ratios_1[base] == 0 and ratios_2[base] == 0
          bfr1 = 0
          bfr2 = 0
        elsif ratios_1[base] == 0
          bfr1 = 0
          bfr2 = Float::INFINITY
        elsif ratios_2[base] == 0
          bfr1 = Float::INFINITY
          bfr2 = 0
        else
          bfr1 = ratios_1[base] / ratios_2[base]
          bfr2 = ratios_2[base] / ratios_1[base]
        end
        table[:first][base] << bfr1
        table[:second][base] << bfr2
      end
    end
    @BFRs = table
  end

  # Formats one tab separated BFR record.
  #
  # position  - zero based offset inside the region.
  # base      - one of BASES.
  # reference - :first or :second, the parental the base is informative for.
  #             The reported reference base is taken from the other parental.
  #
  # Columns: parental 1 name, parental 2 name, bulk 1 name, bulk 2 name,
  # entry, reference base, start + position, base, BFR (2 decimals),
  # coverage bulk 1, coverage bulk 2, informative parental name, ratio bulk 1,
  # ratio bulk 2 (2 decimals each), base count bulk 1, base count bulk 2.
  #
  # Raises BFRToolsException for any other value of reference.
  # Returns the record as a String without a trailing newline.
  def get_bfr_line(position, base, reference)
    case reference
    when :first
      informative = @container.parental_1_name
      ref_base = @parental_2_sequence[position]
    when :second
      informative = @container.parental_2_name
      ref_base = @parental_1_sequence[position]
    else
      raise BFRToolsException.new("The reference for the line should be :first or :second, but was " + reference.to_s)
    end

    relative_position = self.start + position

    bfr = bfrs[reference][base][position]
    cov_1 = @coverages_1[position]
    cov_2 = @coverages_2[position]
    ratios_1 = @ratios_bulk_1[position][base]
    ratios_2 = @ratios_bulk_2[position][base]
    base_1_count = @bases_bulk_1[position][base.to_sym]
    base_2_count = @bases_bulk_2[position][base.to_sym]

    line = String.new
    line << @container.parental_1_name << "\t" << @container.parental_2_name << "\t"
    line << @container.bulk_1_name << "\t" << @container.bulk_2_name << "\t" << self.entry << "\t"
    line << ref_base << "\t" << relative_position.to_s
    line << "\t" << base.to_s << "\t"
    line << bfr.round(2).to_s << "\t"
    line << cov_1.to_s << "\t" << cov_2.to_s << "\t"
    line << informative
    line << "\t" << ratios_1.round(2).to_s << "\t" << ratios_2.round(2).to_s
    line << "\t" << base_1_count.to_s << "\t" << base_2_count.to_s
    line
  end

  # Returns the four consensus sequences as FASTA text. Each header is
  # "><region>:<sample name>".
  def to_multi_fasta
    samples = [
      [@container.parental_1_name, @parental_1_sequence],
      [@container.parental_2_name, @parental_2_sequence],
      [@container.bulk_1_name, @bulk_1_sequence],
      [@container.bulk_2_name, @bulk_2_sequence]
    ]
    fasta_string = String.new
    samples.each do |sample_name, sequence|
      fasta_string << ">" << self.to_s << ":" << sample_name << "\n" << sequence << "\n"
    end
    fasta_string
  end

  # Dumps the region as brace delimited text with one quoted key per line.
  #
  # NOTE(review): the result is not valid JSON (members are separated by
  # newlines instead of commas and several values are bare comma joined
  # lists). It is reproduced unchanged because the consumers of this text are
  # not visible here; confirm before tightening the format.
  #
  # opts - accepted but not used.
  def to_json(opts)
    out = String.new
    out << "{"
    out << "\"Parental_1\" : \"" << @container.parental_1_name << "\"\n"
    out << "\"Parental 2\" : \"" << @container.parental_2_name << "\"\n"
    out << "\"Bulk 1\" : \"" << @container.bulk_1_name << "\"\n"
    out << "\"Bulk 2\" : \"" << @container.bulk_2_name << "\"\n"
    # TODO: Make this for any subsection, so we can subquery in case we are working on something bigger
    out << "\"Positions\" : " << (1..self.size).to_a.to_json << "\n"
    out << "\"Parental_1_consensus\":" << @parental_1_sequence.split(//).to_json << "\n"
    out << "\"Parental_2_consensus\":" << @parental_2_sequence.split(//).to_json << "\n"
    out << "\"Bulk_1_consensus\":" << @bulk_1_sequence.split(//).to_json << "\n"
    out << "\"Bulk_1_coverage\":" << @coverages_1.to_json << "\n"

    BASES.each do |base|
      out << "\"Bases_Bulk_1" << base.to_s << "\":" << base_count_for_base(base, @bases_bulk_1).join(",") << "\n"
      out << "\"Ratios_Bulk_1" << base.to_s << "\":" << base_ratios_for_base(base, @ratios_bulk_1).join(",") << "\n"
    end
    out << "\"Bulk_2_consensus\":" << @bulk_2_sequence.split(//).join(",") << "\n"
    out << "\"Bulk_2_coverage\":" << @coverages_2.join(",") << "\n"

    BASES.each do |base|
      out << "\"Bases_Bulk_2" << base.to_s << "\":" << base_count_for_base(base, @bases_bulk_2).join(",") << "\n"
      out << "\"Ratios_Bulk_2" << base.to_s << "\":" << base_ratios_for_base(base, @ratios_bulk_2).join(",") << "\n"
    end
    BASES.each do |base|
      out << "\"BFR" << base.to_s << "\":" << bfrs[:first][base].join(",") << "\n"
    end
    out << "}"
    out
  end

  # Dumps the region as comma separated text, one labelled row per data
  # series: sample names, positions (1 based), consensus sequences, coverages,
  # base counts and ratios per bulk, and the bulk 1 / bulk 2 BFR per base.
  def to_csv
    out = String.new
    out << "Parental 1," << @container.parental_1_name << "\n"
    out << "Parental 2," << @container.parental_2_name << "\n"
    out << "Bulk 1, " << @container.bulk_1_name << "\n"
    out << "Bulk 2," << @container.bulk_2_name << "\n"
    out << "Positions," << (1..self.size).to_a.join(",") << "\n"
    out << "Parental 1 consensus," << @parental_1_sequence.split(//).join(",") << "\n"
    out << "Parental 2 consensus," << @parental_2_sequence.split(//).join(",") << "\n"
    out << "Bulk 1 consensus," << @bulk_1_sequence.split(//).join(",") << "\n"
    out << "Bulk 1 coverage," << @coverages_1.join(",") << "\n"

    BASES.each do |base|
      out << "Bases Bulk 1" << base.to_s << "," << base_count_for_base(base, @bases_bulk_1).join(",") << "\n"
      out << "Ratios Bulk 1 " << base.to_s << "," << base_ratios_for_base(base, @ratios_bulk_1).join(",") << "\n"
    end
    out << "Bulk 2 consensus," << @bulk_2_sequence.split(//).join(",") << "\n"
    out << "Bulk 2 coverage," << @coverages_2.join(",") << "\n"

    BASES.each do |base|
      out << "Bases Bulk 2 " << base.to_s << "," << base_count_for_base(base, @bases_bulk_2).join(",") << "\n"
      out << "Ratios Bulk 2 " << base.to_s << "," << base_ratios_for_base(base, @ratios_bulk_2).join(",") << "\n"
    end
    BASES.each do |base|
      out << "BFRs" << base.to_s << "," << bfrs[:first][base].join(",") << "\n"
    end
    out
  end

  # Returns the entries stored under +base+ in every row of ratios_matrix, in
  # row order.
  def base_ratios_for_base(base, ratios_matrix)
    (0...ratios_matrix.size).map { |i| ratios_matrix[i][base] }
  end

  # Returns the entries stored under +base+ in every row of base_matrix, in
  # row order.
  def base_count_for_base(base, base_matrix)
    (0...base_matrix.size).map { |i| base_matrix[i][base] }
  end

  private

  # Lists the parentals (:first and/or :second) for which +base+ is
  # informative at +position+: Bio::NucleicAcid.is_valid accepts the base for
  # that parental's consensus character and rejects it for the other one.
  def informative_parents(position, base)
    code = base.to_s
    in_first = Bio::NucleicAcid.is_valid(@parental_1_sequence[position], code)
    in_second = Bio::NucleicAcid.is_valid(@parental_2_sequence[position], code)

    parents = []
    parents << :first if in_first && !in_second
    parents << :second if in_second && !in_first
    parents
  end
end
358
+
359
+
360
# Container that processes regions one at a time, writes their BFR lines and
# keeps running statistics about the regions seen so far.
class BFRContainer < Container

  # Resets every running counter. Must be called before #process_region and
  # #print_stats, which both operate on these counters.
  #
  # The processed-region counter is stored in @processed_regions so that the
  # inherited processed_regions reader returns it; under its previous,
  # misspelled name the reader always returned nil.
  def init_counters
    @putative_snps = 0
    @processed_regions = 0
    @not_enogh_coverage = 0
    @total_avg_coverage_bulk_1 = 0.0
    @total_avg_coverage_bulk_2 = 0.0
    @total_snp_1kbp = 0.0
    @no_snps = 0
    @too_many_snps = 0
  end

  # Writes the column names of the statistics table.
  #
  # opts - Hash; :output_file_stats is the IO to write to (default $stderr).
  def print_header(opts={})
    output = opts[:output_file_stats] ? opts[:output_file_stats] : $stderr
    output.print "#bulk_1\tbulk_2\tProcessed_regions\tputative_snps\tno_snps\ttoo_many_snps\tno_enough_coverage\tavg_cov_bulk_1\tavg_cov_bulk_2\tavg_snp_1kbp\n"
  end

  # Writes one row of statistics matching the columns of #print_header. The
  # coverage and SNP density totals are divided by the number of processed
  # regions.
  #
  # opts - Hash; :output_file_stats is the IO to write to (default $stderr).
  def print_stats(opts={})
    output = opts[:output_file_stats] ? opts[:output_file_stats] : $stderr
    output.print @bulk_1_name, "\t", @bulk_2_name, "\t"
    output.print @processed_regions, "\t", @putative_snps, "\t", @no_snps, "\t", @too_many_snps, "\t", @not_enogh_coverage, "\t"
    output.print @total_avg_coverage_bulk_1 / @processed_regions, "\t", @total_avg_coverage_bulk_2 / @processed_regions, "\t"
    output.print @total_snp_1kbp / @processed_regions, "\n"
  end

  # Builds the BFRRegion described by opts without touching the counters.
  # The caller's Hash is left unmodified; this container is added to a copy.
  def get_region(opts={})
    BFRRegion.new(opts.merge(:container => self))
  end

  # Loads a region, updates the statistics and, when the region passes every
  # filter, writes its BFR lines.
  #
  # opts - Hash with:
  #        :region       - region specification for BFRRegion.
  #        :output_file  - IO receiving the BFR lines. Nothing is written when
  #                        it is absent.
  #        :min_cov      - minimum average coverage of both bulks, also used
  #                        as the per-position threshold (default 20).
  #        :max_snp_1kbp - maximum SNP density between parentals (default 10).
  #        :max_per      - default 0.20; not read in this method.
  #
  # A region is skipped for output when it has no SNPs, too many SNPs per
  # 1000 positions, or not enough average coverage in either bulk; each
  # reason increments its own counter.
  #
  # Returns the BFRRegion.
  def process_region(opts={})
    opts = { :min_cov => 20, :max_snp_1kbp => 10, :max_per => 0.20 }.merge!(opts)

    @processed_regions += 1
    output = opts[:output_file] ? opts[:output_file] : $stdout
    print_output = opts[:output_file] ? true : false
    opts[:container] = self

    region = BFRRegion.new(opts)

    @total_snp_1kbp += region.snp_1kbp
    if region.snp_count == 0
      @no_snps += 1
      print_output = false
    end

    if region.snp_1kbp > opts[:max_snp_1kbp]
      @too_many_snps += 1
      print_output = false
    end

    @total_avg_coverage_bulk_2 += region.avg_cov_bulk_2
    @total_avg_coverage_bulk_1 += region.avg_cov_bulk_1

    if region.avg_cov_bulk_2 < opts[:min_cov] or region.avg_cov_bulk_1 < opts[:min_cov]
      @not_enogh_coverage += 1
      print_output = false
    end

    print_bfr_lines(region, output, opts[:min_cov]) if print_output

    @parental_1_sam.mpileup_clear_cache region
    @parental_2_sam.mpileup_clear_cache region
    @bulk_2_sam.mpileup_clear_cache region
    @bulk_1_sam.mpileup_clear_cache region
    return region
  end

  private

  # Writes the BFR line of every informative base at each position of region
  # whose coverage in both bulks is above min_cov. A base is informative for
  # a parental when Bio::NucleicAcid.is_valid accepts it for that parental's
  # consensus character and rejects it for the other parental's.
  def print_bfr_lines(region, output, min_cov)
    (0..region.size - 1).each do |i|
      next unless region.coverages_1[i] > min_cov && region.coverages_2[i] > min_cov

      BASES.each do |base|
        code = base.to_s
        in_first = Bio::NucleicAcid.is_valid(region.parental_1_sequence[i], code)
        in_second = Bio::NucleicAcid.is_valid(region.parental_2_sequence[i], code)

        informative = []
        informative << :first if in_first && !in_second
        informative << :second if in_second && !in_first

        informative.each do |parent|
          output.print region.get_bfr_line(i, base, parent), "\n"
        end
      end
    end
  end
end
462
+
463
+
464
+ end
465
+