bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,804 @@
1
+ require 'bio'
2
+ module Bio::PolyploidTools
3
# Error type for SNP-related failures; a plain RuntimeError subclass with no
# extra behaviour of its own.
class SNPException < RuntimeError; end
5
+
6
+ class SNP
7
+ #GENE,ORIGINAL,POS,SNP
8
+ attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name
9
+ attr_accessor :contig
10
+ attr_accessor :exon_list
11
+ attr_accessor :container
12
+ attr_accessor :flanking_size, :ideal_min, :ideal_max
13
+ attr_accessor :template_sequence
14
+ attr_accessor :use_reference
15
+ attr_accessor :genomes_count
16
+ attr_accessor :primer_3_min_seq_length
17
+ attr_accessor :chromosome
18
+ attr_accessor :variation_free_region
19
+ attr_accessor :max_hits
20
+ attr_accessor :errors
21
+ attr_accessor :repetitive
22
+ attr_accessor :hit_count
23
+ attr_accessor :snp_type
24
+ attr_accessor :orientation
25
+
26
+ #Format:
27
+ #Gene_name,Original,SNP_Pos,pos,chromosome
28
+ #A_comp0_c0_seq1,C,519,A,2A
29
# Builds a SNP from one comma-separated line.
#
# Expected format (fields after the fifth are ignored):
#   Gene_name,Original,SNP_Pos,SNP,chromosome
#   A_comp0_c0_seq1,C,519,A,2A
#
# reg_str is only read. The previous implementation called `chomp!` on it,
# which altered the caller's string and raised FrozenError for frozen input.
#
# Returns a new SNP whose position is an Integer, whose alleles are stripped
# and upper-cased, whose chromosome is stripped, and whose use_reference is
# false. The gene name is stored exactly as found in the line.
# Raises NoMethodError when any of the five fields is missing.
def self.parse(reg_str)
  gene, original, position, variant, chromosome = reg_str.chomp.split(",")

  snp = SNP.new
  snp.gene = gene
  snp.original = original
  snp.position = position.strip.to_i
  snp.original = original.strip.upcase
  snp.snp = variant.strip.upcase
  snp.chromosome = chromosome.strip
  snp.use_reference = false
  snp
end
44
+
45
+ #Format:
46
+ #IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
47
# Builds a SNP from one tab-separated VCF line, e.g.
#   IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
#
# Column use: 0 = contig (also fed to chr_arm_parser to obtain the
# chromosome), 1 = position, 2 = marker name, 3 = reference allele,
# 4 = alternative allele, 7 = optional INFO field.
#
# chr_arm_parser must respond to `call(contig_name)`; the value it returns is
# stripped in place.
#
# The orientation is :forward unless the INFO field contains `OR=reverse`.
def self.parseVCF(vcf_line, chr_arm_parser: Bio::PolyploidTools::ChromosomeArm.getArmSelection("first_two") )
  fields = vcf_line.split("\t")
  contig_name = fields[0]

  snp = SNP.new
  snp.gene = fields[2]
  snp.chromosome = chr_arm_parser.call(contig_name)
  snp.contig = contig_name
  snp.position = fields[1].strip.to_i
  snp.original = fields[3].upcase.strip
  snp.snp = fields[4].upcase.strip
  # In-place strip on purpose: the parser's return value is modified directly.
  snp.chromosome.strip!
  snp.orientation = :forward

  info = fields[7]
  if info
    reversed = info.scan(/(\w+)=([\w|.]+)/).any? do |id, value|
      id == "OR" && value == "reverse"
    end
    snp.orientation = :reverse if reversed
  end
  snp
end
74
+
75
# Fetches the sequence around the SNP from fastaFile and stores it as the
# template, with the SNP base replaced by the ambiguity code for
# original + snp (as returned by Bio::NucleicAcid.to_IUAPC).
#
# The window is [position - flanking_size, position + flanking_size + 1],
# clamped to 1 and to the length reported by the fasta index. The entry
# looked up is @contig when set, otherwise the gene name.
#
# Side effects: @position is rewritten to be relative to the fetched window
# (never below 1) and template_sequence is replaced.
def setTemplateFromFastaFile(fastaFile ,flanking_size: 100)
  window = Bio::DB::Fasta::Region.new
  window.entry = gene
  window.entry = @contig if @contig
  window.orientation = :forward
  window.start = position - flanking_size
  window.end = position + flanking_size + 1

  indexed_entry = fastaFile.index.region_for_entry(window.entry)
  window.start = [window.start, 1].max
  window.end = [window.end, indexed_entry.length].min

  ambiguity_code = Bio::NucleicAcid.to_IUAPC("#{original}#{snp}")
  # Re-base the SNP coordinate onto the window that is about to be fetched.
  @position = [@position - window.start + 1, 1].max

  self.template_sequence = fastaFile.fetch_sequence(window)
  template_sequence[position - 1] = ambiguity_code
end
93
+
94
# Sets the defaults for a new SNP: three genomes, a minimum primer3 template
# length of 50, no variation-free region, no contig, at most 8 hits, an empty
# error list, and an exon list whose missing keys start as empty arrays.
def initialize
  @contig = false
  @repetitive = false
  @hit_count = 0
  @max_hits = 8
  @genomes_count = 3
  @variation_free_region = 0
  @primer_3_min_seq_length = 50
  @errors = []
  # Block form so that every arm gets its own array on first access.
  @exon_list = Hash.new { |by_arm, arm| by_arm[arm] = [] }
end
105
+
106
# Returns [start, total, new_position] for a window of flanking_size bases on
# each side of the SNP.
#
# start is position - flanking_size + 1, floored at 0; total is the given
# total (or twice flanking_size when omitted) plus one; new_position is
# position - start + 2.
def to_polymarker_coordinates(flanking_size, total:nil)
  window_start = [position - flanking_size + 1, 0].max
  window_size = (total || flanking_size * 2) + 1
  [window_start, window_size, position - window_start + 2]
end
114
+
115
# Renders the template with the SNP written as "[original/snp]" and returns
# the upper-cased slice starting at position - flanking_size - 1 (floored
# at 0).
#
# The slice length is the given total (or twice flanking_size) plus 5.
# For :reverse orientation the template and both alleles are
# reverse-complemented and the marker is inserted at the mirrored index; the
# slice start is still computed from the forward position.
def to_polymarker_sequence(flanking_size, total:nil)
  rendered = template_sequence.clone
  marker = "[#{original}/#{snp}]"
  snp_index = position-1

  if orientation == :reverse
    snp_index = rendered.length - snp_index - 1
    template_na = Bio::Sequence::NA.new(rendered)
    original_na = Bio::Sequence::NA.new(original)
    snp_na = Bio::Sequence::NA.new(snp)
    rendered = template_na.reverse_complement
    marker = "[#{original_na.reverse_complement}/#{snp_na.reverse_complement}]"
  end

  rendered[snp_index] = marker
  slice_start = [position - flanking_size - 1, 0].max
  slice_length = (total || flanking_size * 2) + 5
  rendered[slice_start, slice_length].upcase
end
136
+
137
# Identifier made of the original base, the position and the SNP base, in
# that order, with no separators (for example "C519A").
def snp_id_in_seq
  format("%s%s%s", original, position, snp)
end
140
+
141
+ #We Only want the chromosome, we drop the arm.
142
+ #We don't use this any more.
143
+ #def chromosome= (chr)
144
+ # @chromosome = chr
145
+ #end
146
+
147
# First character of the chromosome name (nil for an empty name).
def chromosome_group
  chromosome.slice(0)
end
150
+
151
# Returns the character at index 3 of the chromosome name, or nil when the
# name has no such character.
#
# This method used to be defined twice back to back. Ruby keeps only the last
# definition, so the earlier body (`chromosome[1]`) never ran and merely
# produced a "method redefined" warning. The dead copy is dropped here and the
# behaviour callers actually get is unchanged.
# NOTE(review): the removed body read index 1, which is what the method name
# suggests; confirm whether index 3 is really intended before changing it.
def chromosome_genome
  chromosome[3] || nil
end
159
+
160
# FASTA record for the template: a ">gene" header line followed by the
# template sequence, each terminated by a newline.
def to_fasta
  header = ">#{self.gene}"
  body = self.template_sequence
  "#{header}\n#{body}\n"
end
163
+
164
# Registers an exon hit for a chromosome arm.
#
# With filter_best (the default) and at least one exon already stored for the
# arm, the list is replaced by [exon] only when exon.record.score is strictly
# greater than the score of the first stored exon; otherwise it is left
# alone. In every other case the exon is appended.
def add_exon(exon, arm, filter_best: true)
  exon_list[arm] ||= []
  stored = exon_list[arm]

  if filter_best && !stored.empty?
    incumbent = stored.first
    exon_list[arm] = [exon] if exon.record.score > incumbent.record.score
  else
    stored << exon
  end
end
173
+
174
# Region of the gene spanned by this marker, as a forward
# Bio::DB::Fasta::Region whose entry is the gene name.
#
# * use_reference set: position +/- flanking_size with the start floored at
#   1. This result is built on every call and is not memoised.
# * otherwise: the span starts as the SNP position itself (or position +/-
#   flanking_size, start floored at 1, when no exons are loaded) and is
#   widened to include the query region of every loaded exon. This result is
#   memoised in @covered_region.
def covered_region
  return @covered_region if @covered_region

  if self.use_reference
    reference = Bio::DB::Fasta::Region.new()
    reference.entry = gene
    reference.orientation = :forward
    reference.start = [self.position - self.flanking_size, 1].max
    reference.end = self.position + self.flanking_size
    return reference
  end

  lower = @position
  upper = @position
  if @exon_list.size == 0
    lower = [self.position - self.flanking_size, 1].max
    upper = self.position + self.flanking_size
  end

  @exon_list.each_value do |exons|
    exons.each do |exon|
      hit = exon.query_region
      lower = hit.start if hit.start < lower
      upper = hit.end if hit.end > upper
    end
  end

  span = Bio::DB::Fasta::Region.new()
  span.entry = gene
  span.orientation = :forward
  span.start = lower
  span.end = upper
  @covered_region = span
end
213
+
214
# flanking_size minus the SNP's local position, plus one.
def left_padding
  snp_offset = self.local_position
  flanking_size - snp_offset + 1
end
219
+
220
# Twice flanking_size minus (left_padding + covered_region.size), floored
# at 0.
def right_padding
  occupied = left_padding + self.covered_region.size
  [(2*flanking_size) - occupied, 0].max
end
225
+
226
# Offset of the SNP from the start of the covered region
# (position - covered_region.start).
def local_position
  region_start = self.covered_region.start
  self.position - region_start
end
230
+
231
# Shifts pos by the left padding.
def padded_position(pos)
  shift = left_padding
  pos + shift
end
234
+
235
# Multi-FASTA text with, in order: one record per parental sequence (header
# ">entry-position_name"), one record per surrounding exon sequence (header
# ">chromosome"), a ">Mask" record for this marker's chromosome, and the
# FASTA text of the primer region for chromosome / snp_in.
def primer_fasta_string
  region = self.covered_region
  # Result unused; the call is kept so the method touches the same state as
  # before.
  _snp_offset = self.local_position
  fasta = ""

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{region.entry}-#{self.position}_#{name}\n"
    fasta << "#{seq}\n"
  end

  self.surrounding_exon_sequences.each do |exon_chromosome, exon_seq|
    fasta << ">#{exon_chromosome}\n#{exon_seq}\n"
  end

  mask = surrounding_masked_chromosomal_snps(chromosome)
  fasta << ">Mask\n#{mask}\n"

  fasta << primer_region(chromosome, snp_in ).to_fasta
  fasta
end
256
+
257
# Builds the PrimerRegion for a parental sequence against the best alignment
# row for target_chromosome (chosen with get_target_sequence).
#
# The aligned parental row is walked column by column together with the
# chromosome row and the mask from mask_aligned_chromosomal_snp. Columns that
# are gaps in both rows are skipped and do not advance the region position.
# For the remaining columns the mask character decides what happens:
#   '&'        SNP column, marked non-homoeologous
#   ':'        SNP column, marked homoeologous
#   '-'        parental base replaced by the chromosome base unless
#              Bio::NucleicAcid::is_unambiguous accepts it
#   upper case candidate recorded as chromosome specific (intron list when
#              the parental row has a gap there)
#   lower case candidate recorded as almost chromosome specific (intron list
#              when the parental row has a gap there)
# The region's sequence is the edited parental row with all gaps removed.
def primer_region(target_chromosome, parental )
  parental = aligned_sequences[parental].downcase
  target_chromosome = get_target_sequence(aligned_sequences.keys, target_chromosome)

  chromosome_seq = (aligned_sequences[target_chromosome] || "-" * parental.size).downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  cursor = 0
  parental.size.times do |col|
    next if chromosome_seq[col] == '-' && parental[col] == '-'

    flag = mask[col]
    case flag
    when '&'
      region.snp_pos = cursor
      region.homoeologous = false
    when ':'
      region.snp_pos = cursor
      region.homoeologous = true
    when '-'
      # No variation flagged here: fall back to the chromosome base when the
      # parental base is not accepted as unambiguous.
      parental[col] = chromosome_seq[col] unless Bio::NucleicAcid::is_unambiguous(parental[col])
    when /[[:upper:]]/
      if parental[col] == '-'
        parental[col] = flag
        region.crhomosome_specific_intron << cursor
      elsif Bio::NucleicAcid.is_valid(parental[col], flag)
        parental[col] = flag
        region.chromosome_specific << cursor
      end
    when /[[:lower:]]/
      if parental[col] == '-'
        parental[col] = flag
        region.almost_crhomosome_specific_intron << cursor
      elsif Bio::NucleicAcid.is_valid(parental[col], flag)
        parental[col] = flag.upcase
        region.almost_chromosome_specific << cursor
      end
    end
    cursor += 1
  end

  region.sequence=parental.gsub('-','')
  region
end
314
+
315
# Reverse complement of a nucleotide string, including the ambiguity codes
# R/Y, M/K, D/H and V/B. S, W and N map to themselves, case is preserved and
# any other character is left unchanged.
def reverse_complement_string(sequenc_str)
  sequenc_str.reverse.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN')
end
319
+
320
# Formats one or two primer3 input records.
#
# opts keys read: :name, :sequence, :left_pos, :right_pos, :extra, :extra_f,
# :extra_r.
#
# * With :right_pos a single record is written. When left_pos > right_pos
#   both positions are mirrored, the template is reverse-complemented and the
#   record is labelled "reverse". If @variation_free_region is positive, the
#   bases right after the right position must all be lower case, otherwise
#   nil is returned.
# * Without :right_pos a "forward" record is written, followed by a "reverse"
#   record on the reverse-complemented template with the left position
#   mirrored.
# :extra_f / :extra_r replace the SEQUENCE_FORCE_LEFT_END line of the forward
# / reverse record respectively.
def return_primer_3_string(opts={})
  left = opts[:left_pos]
  right = opts[:right_pos]
  sequence = opts[:sequence]
  extra = opts[:extra]
  forward_extra = opts[:extra_f]
  orientation = "forward"

  if opts[:right_pos]
    if left > right
      template_size = sequence.size
      left = template_size - left - 1
      right = template_size - right - 1
      sequence = reverse_complement_string(sequence)
      orientation = "reverse"
    end
    if @variation_free_region > 0
      check_str = sequence[right+1, @variation_free_region]
      return nil if check_str != check_str.downcase
    end
  end

  str = "SEQUENCE_ID=#{opts[:name]} #{orientation} \n"
  str << "SEQUENCE_FORCE_LEFT_END=#{left}\n" unless forward_extra
  str << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
  str << extra if extra
  str << forward_extra if forward_extra
  str << "SEQUENCE_TEMPLATE=#{sequence}\n"
  str << "=\n"
  return str if opts[:right_pos]

  # No right primer requested: also emit the record for the opposite strand.
  reverse_extra = opts[:extra_r]
  forward_template = opts[:sequence]
  left = forward_template.size - left - 1
  orientation = "reverse"
  sequence = reverse_complement_string(forward_template)
  str << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
  str << "SEQUENCE_FORCE_LEFT_END=#{left}\n" unless reverse_extra
  str << reverse_extra if reverse_extra
  str << "SEQUENCE_TEMPLATE=#{sequence}\n"
  str << extra if extra
  str << "=\n"
  str
end
372
+
373
+
374
# Builds every primer3 input record for this marker and returns them as an
# array (one entry per call to return_primer_3_string, so entries may be nil).
#
# Returns an empty array, after appending a message to `errors`, when
#   * the primer region sequence is shorter than primer_3_min_seq_length, or
#   * hit_count exceeds max_hits; in that case the marker is also flagged as
#     repetitive.
#
# Otherwise records are produced in pairs (template carrying the original
# base, then template carrying the SNP base): one pair per candidate position
# in each of the four specificity lists of the primer region, followed by one
# "chromosome_nonspecific" pair with no right position. The specific pairs
# are skipped altogether when the number of candidates exceeds
# max_specific_primers.
#
# Side effects: sets @snp_type; may append to errors and set repetitive.
def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20 )
  pr = primer_region(target_chromosome, parental )
  primer_3_propertes = []

  seq_original = String.new(pr.sequence)

  if seq_original.size < primer_3_min_seq_length
    errors << "The sequence (#{seq_original.size}) is shorter than #{primer_3_min_seq_length}"
    return primer_3_propertes
  end

  if self.hit_count > self.max_hits
    errors << "The marker maps to #{self.hit_count} positions (max_hits: #{self.max_hits}). "
    # This used to read `repetitive = true`, which only created a local
    # variable and left the attribute untouched.
    self.repetitive = true
    return primer_3_propertes
  end

  seq_original[pr.snp_pos] = self.original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = self.snp

  @snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  marker = "#{gene}:#{original}#{position}#{snp}"
  # Appends the original-allele record and then the SNP-allele record for one
  # candidate; right_pos is nil for the pair without a forced right end.
  add_pair = lambda do |original_label, snp_label, right_pos|
    args = { :left_pos => pr.snp_pos, :sequence => seq_original }
    args[:right_pos] = right_pos unless right_pos.nil?
    args[:name] = "#{marker} #{original_name} #{original_label} #{@snp_type} #{chromosome}"
    primer_3_propertes << return_primer_3_string(args)
    args[:name] = "#{marker} #{snp_in} #{snp_label} #{@snp_type} #{chromosome}"
    args[:sequence] = seq_snp
    primer_3_propertes << return_primer_3_string(args)
  end

  total_candidates = pr.chromosome_specific.size +
                     pr.crhomosome_specific_intron.size +
                     pr.almost_chromosome_specific.size +
                     pr.almost_crhomosome_specific_intron.size

  unless total_candidates > max_specific_primers
    pr.chromosome_specific.each do |pos|
      add_pair.call("chromosome_specific exon", "chromosome_specific exon", pos)
    end
    # NOTE(review): for both intron lists the SNP-allele record is labelled
    # "exon" while its partner says "intron". This looks like a copy-paste
    # slip, but the labels are emitted unchanged because whatever parses them
    # is not visible from here.
    pr.crhomosome_specific_intron.each do |pos|
      add_pair.call("chromosome_specific intron", "chromosome_specific exon", pos)
    end
    pr.almost_chromosome_specific.each do |pos|
      add_pair.call("chromosome_semispecific exon", "chromosome_semispecific exon", pos)
    end
    pr.almost_crhomosome_specific_intron.each do |pos|
      add_pair.call("chromosome_semispecific intron", "chromosome_semispecific exon", pos)
    end
  end

  add_pair.call("chromosome_nonspecific all", "chromosome_nonspecific all", nil)
  primer_3_propertes
end
456
+
457
# "gene:" followed by the original base, position, SNP base and chromosome,
# concatenated without separators.
def to_s
  "#{gene}:" + [original, position, snp, chromosome].join
end
460
+
461
# Upper-cased concatenation of original base, position and SNP base.
def short_s
  [original, position, snp].join.upcase
end
464
+
465
# All primer3 records from primer_3_all_strings joined into a single string.
def primer_3_string(target_chromosome, parental, max_specific_primers: 20)
  primer_3_all_strings(target_chromosome, parental, max_specific_primers: max_specific_primers).join
end
469
+
470
# Looks up the exon list entry stored for a chromosome and returns it. When
# the lookup yields nothing, a notice is printed to standard output.
def exon_for_chromosome (chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
475
+
476
# Sequence of the covered region for every parent in container.parents,
# memoised in @parental_sequences.
#
# A parent with a BAM uses that BAM's consensus with ambiguities; a parent
# without one uses the gene model sequence, with the original base written at
# the SNP index unless the parent is snp_in. In every case the base at the
# SNP index is upper-cased, and for the snp_in parent it is then replaced by
# the SNP base.
def parental_sequences
  return @parental_sequences if @parental_sequences

  region = self.covered_region
  snp_index = self.local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    seq =
      if bam
        bam.consensus_with_ambiguities({:region=>region}).to_s
      else
        model = container.gene_model_sequence(region)
        model[snp_index] = self.original unless name == self.snp_in
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = self.snp if name == self.snp_in
    @parental_sequences [name] = seq
  end
  @parental_sequences
end
499
+
500
+
501
+
502
+
503
# Like parental_sequences, but every sequence is additionally passed through
# cut_and_pad_sequence_to_primer_region. Memoised in
# @surrounding_parental_sequences.
#
# A parent with a BAM uses that BAM's consensus with ambiguities; a parent
# without one uses the gene model sequence with the SNP base (for snp_in) or
# the original base (for everyone else) written at the SNP index. The base at
# the SNP index is then upper-cased and, for snp_in, set to the SNP base.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  region = self.covered_region
  snp_index = self.local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    seq =
      if bam
        bam.consensus_with_ambiguities({:region=>region}).to_s
      else
        model = container.gene_model_sequence(region)
        if name == self.snp_in
          model[snp_index] = self.snp
        else
          model[snp_index] = self.original
        end
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = self.snp if name == self.snp_in
    @surrounding_parental_sequences [name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
531
+
532
# Slice of sequence from local_position - flanking_size (floored at 0) to
# local_position + flanking_size; an upper bound beyond the sequence size is
# pulled back to the last index.
def cut_sequence_to_primer_region(sequence)
  snp_index = self.local_position
  first = [snp_index - flanking_size, 0].max
  last = snp_index + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
540
+
541
# Cuts sequence to the window local_position +/- flanking_size and fills the
# part of the window that falls outside the sequence with '-'.
#
# Left padding equals the amount by which the window start is below 0. Right
# padding is applied only when the window end is greater than sequence.size
# and equals the difference between the two.
# NOTE(review): with that rule a window clipped on the right comes out one
# character shorter than an unclipped one; confirm whether that is intended.
def cut_and_pad_sequence_to_primer_region(sequence)
  snp_index = self.local_position
  first = snp_index - flanking_size
  last = snp_index + flanking_size

  gaps_before = 0
  if first < 0
    gaps_before = -first
    first = 0
  end

  gaps_after = 0
  if last > sequence.size
    gaps_after = last - sequence.size
    last = sequence.size - 1
  end

  padded = "-" * gaps_before
  padded << sequence[first..last]
  padded << "-" * gaps_after
  padded
end
557
+
558
# Parental sequences merged with the surrounding exon sequences (exon entries
# win on duplicate keys), memoised in @sequences_to_align.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
562
+
563
# Alignment of sequences_to_align produced by running "mafft" through
# Bio::MAFFT with --maxiterate 1000 --localpair --quiet; memoised in
# @aligned_sequences.
#
# When there is nothing to align a new empty Hash is returned and nothing is
# memoised.
def aligned_sequences
  return @aligned_sequences if @aligned_sequences
  return Hash.new if sequences_to_align.size == 0

  mafft_options = %w[--maxiterate 1000 --localpair --quiet]
  aligner = Bio::MAFFT.new( "mafft" , mafft_options)
  @aligned_sequences = aligner.query_align(sequences_to_align).alignment
end
577
+
578
# Multi-FASTA text with one record per aligned sequence (header
# ">marker-name"), a ">MASK chromosome" record holding the mask for this
# marker's chromosome, and finally the FASTA text of the primer region for
# chromosome / snp_in.
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self.to_s}-#{name}\n#{seq}\n"
  end

  mask = mask_aligned_chromosomal_snp(chromosome)
  fasta << ">MASK #{chromosome}\n#{mask}\n"
  fasta << primer_region(chromosome, snp_in ).to_fasta
  fasta
end
590
+
591
+
592
# Index of the SNP inside the window that starts at
# local_position - flanking_size, with that start floored at 0.
def get_snp_position_after_trim
  snp_index = self.local_position
  window_start = [snp_index - flanking_size, 0].max
  snp_index - window_start
end
603
+
604
# Column of the SNP in the alignment, memoised in @aligned_snp_position; -1
# when no column is found.
#
# The aligned row of the first parent is scanned and its non-gap characters
# are counted until get_snp_position_after_trim of them have been passed; the
# next non-gap column is the SNP. Warnings go to standard error when the
# number of parents is not 2, and when both parents carry the same character
# in the SNP column.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |parent| aligned_sequences[parent] }
  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  first_parent = parental_strings[0]
  remaining = get_snp_position_after_trim
  found = -1

  first_parent.size.times do |col|
    next if first_parent[col] == "-"

    if remaining == 0
      found = col
      if first_parent[col] == parental_strings[1][col]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{col})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end
    remaining -= 1
  end

  @aligned_snp_position = found
  return found
end
630
+
631
# Picks the alignment name to use as target for a chromosome.
#
# Names made of exactly three "_"-separated fields are treated as scored: the
# third field is read as a float, and the first name with the highest score
# above 0 wins. When no name qualifies, `chromosome` is returned unchanged.
#
# `chromosome` is optional and defaults to nil: the class also calls this
# method with the names alone, which raised ArgumentError while the second
# argument was mandatory.
def get_target_sequence(names, chromosome = nil)
  best = chromosome
  best_score = 0
  names.each do |name|
    fields = name.split("_")
    next unless fields.length == 3

    score = fields[2].to_f
    next unless score > best_score

    best_score = score
    best = name
  end
  best
end
647
+
648
+
649
+
650
# Builds the mask string for the alignment, one character per column, starting
# from the lower-cased target row (or all '-' when the target has no aligned
# row). The target is chosen by get_target_sequence among the non-parental
# alignment names.
#
# Per column, over the non-parental rows with a non-gap character there:
#   '-'        no row differs from the target, or only one row covers the
#              column, or any covering row has an N
#   '*'        no row covers the column
#   upper case every other row differs from the target and the covering rows
#              whose name starts with chromosome_group show genomes_count
#              distinct second characters
#   lower case anything else (the target base is kept)
# The SNP column (aligned_snp_position) is then overwritten with '&', or with
# ':' when the ambiguity code built from the covering bases is valid for both
# the original and the SNP base.
def mask_aligned_chromosomal_snp(chromosome)
  alignment_names = aligned_sequences.keys
  parental_names = parental_sequences.keys

  # Result unused; the call is kept so the method touches the same state as
  # before.
  get_snp_position_after_trim

  candidates = alignment_names - parental_names
  snp_column = aligned_snp_position

  best_target = get_target_sequence(candidates, chromosome)
  target_row = aligned_sequences[best_target]
  masked_snps = target_row ? target_row.downcase : "-" * aligned_sequences.values[0].size

  masked_snps.size.times do |col|
    different = 0
    coverage = 0
    n_count = 0
    genomes_seen = []

    candidates.each do |chr|
      row = aligned_sequences[chr]
      next unless row && row[col] != "-"

      base = row[col]
      coverage += 1
      n_count += 1 if base == 'N' || base == 'n'
      if chr[0] == chromosome_group && !genomes_seen.include?(chr[1])
        genomes_seen << chr[1]
      end

      next if chr == best_target
      $stderr.puts "WARN: No base for #{masked_snps} : ##{col}" unless masked_snps[col].upcase
      $stderr.puts "WARN: No base for #{row} : ##{col}" unless masked_snps[col].upcase
      different += 1 if masked_snps[col].upcase != base.upcase
    end

    masked_snps[col] = "-" if different == 0
    masked_snps[col] = "-" if coverage == 1
    masked_snps[col] = "-" if n_count > 0
    masked_snps[col] = "*" if coverage == 0

    expected_snps = candidates.size - 1
    if different == expected_snps && genomes_seen.size == genomes_count
      masked_snps[col] = masked_snps[col].upcase
    end

    next unless col == snp_column

    masked_snps[col] = "&"
    bases = ""
    candidates.each do |chr|
      row = aligned_sequences[chr]
      bases << row[col] if row && row[col] != "-"
    end

    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[col] = ":"
    end
  end
  masked_snps
end
723
+
724
+
725
# Builds a per-column mask over the sequences returned by
# surrounding_exon_sequences, relative to the entry keyed by +chromosome+.
#
# The mask starts as that entry with "-" replaced by "+", or as a run of
# "-" of length flanking_size * 2 when there is no such entry. For each
# column, sequences whose character is not "-", "N" or "n" count as
# coverage, and every covering sequence other than +chromosome+ that
# differs from the mask counts as a difference (columns marked "+" are
# never compared). The column is then rewritten, in this order:
#   "-"  no differences (unless the column is "+"), or coverage below 2
#   upper-case  more than one difference
#   "&"  the column index equals flanking_size
#
# The former one-argument call to get_target_sequence was removed: that
# method takes two arguments, so the call raised ArgumentError, and its
# result was never used.
#
# @param chromosome [String] key of the reference entry in surrounding_exon_sequences
# @return [String] the mask, one character per column
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  reference = chromosomes[chromosome]
  masked_snps = reference ? reference.tr("-", "+") : "-" * (flanking_size * 2)
  local_pos_in_gene = flanking_size

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][i]
      next if base == "-" || base == "N" || base == "n"

      cov += 1
      next if chr == chromosome || masked_snps[i] == "+"

      different += 1 if masked_snps[i] != base
    end

    # Order matters: each later assignment overrides the earlier ones.
    masked_snps[i] = "-" if different == 0 && masked_snps[i] != "+"
    masked_snps[i] = "-" if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = "&" if i == local_pos_in_gene
  end
  masked_snps
end
756
+
757
# Returns the flanking sequence around the SNP for every exon, memoized in
# @surrounding_exon_sequences.
#
# For each exon in exon_list the flanking region is obtained from
# exon.target_flanking_region_from_position(position, flanking_size).
# Exons flagged by snp_in_gap are skipped; for the rest the sequence is
# fetched with container.chromosome_sequence and stored under the key
# "<chromosome>_<flanking region start>_<exon record score>".
#
# @return [Bio::Alignment::SequenceHash] key => flanking sequence
def surrounding_exon_sequences
  cached = @surrounding_exon_sequences
  return cached if cached

  region = covered_region
  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      # NOTE(review): this offset is computed but never used; it is kept so
      # the same calls are made in the same order as before.
      _unused_offset = exon.query_region.start - region.start
      flank = exon.target_flanking_region_from_position(position, flanking_size)
      # TODO: Pad when the exon goes over the regions...
      # Ignoring when the exon is in a gap
      next if exon.snp_in_gap

      flank_seq = container.chromosome_sequence(flank)
      key = "#{chromosome}_#{flank.start}_#{exon.record.score}"
      @surrounding_exon_sequences[key] = flank_seq
    end
  end
  @surrounding_exon_sequences
end
776
+
777
+
778
# Returns the exon sequences laid out against the covered region,
# memoized in @exon_sequences.
#
# For each exon in exon_list a string is built from leading "-" padding
# (exon query start minus covered region start) followed by
# container.chromosome_sequence(exon.target_region). Exons flagged by
# snp_in_gap are skipped. For the rest, the character at local_position
# is upper-cased, the string is right-padded with "-" by
# (covered region size - current length + 1), and it is stored under
# "<chromosome>_<exon query start>_<exon record score>".
# If nothing is stored under @chromosome, a run of "-" of the covered
# region's size is stored there.
#
# @return [Bio::Alignment::SequenceHash] key => padded exon sequence
def exon_sequences
  return @exon_sequences if @exon_sequences

  region = covered_region
  snp_index = local_position
  @exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      padded = "-" * (exon.query_region.start - region.start)
      padded << container.chromosome_sequence(exon.target_region).to_s
      next if exon.snp_in_gap

      padded[snp_index] = padded[snp_index].upcase
      padded << "-" * (region.size - padded.size + 1)
      key = "#{chromosome}_#{exon.query_region.start}_#{exon.record.score}"
      @exon_sequences[key] = padded
    end
  end
  @exon_sequences[@chromosome] ||= "-" * region.size
  @exon_sequences
end
803
+ end
804
+ end