bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,804 @@
1
+ require 'bio'
2
+ module Bio::PolyploidTools
3
# Error type for SNP-related failures. It adds nothing to RuntimeError
# beyond a distinct class that callers can rescue.
class SNPException < RuntimeError; end
5
+
6
+ class SNP
7
+ #GENE,ORIGINAL,POS,SNP
8
+ attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name
9
+ attr_accessor :contig
10
+ attr_accessor :exon_list
11
+ attr_accessor :container
12
+ attr_accessor :flanking_size, :ideal_min, :ideal_max
13
+ attr_accessor :template_sequence
14
+ attr_accessor :use_reference
15
+ attr_accessor :genomes_count
16
+ attr_accessor :primer_3_min_seq_length
17
+ attr_accessor :chromosome
18
+ attr_accessor :variation_free_region
19
+ attr_accessor :max_hits
20
+ attr_accessor :errors
21
+ attr_accessor :repetitive
22
+ attr_accessor :hit_count
23
+ attr_accessor :snp_type
24
+ attr_accessor :orientation
25
+
26
# Builds a SNP from one comma-separated line.
#
# Format (columns in the order they are read):
#   gene,original_base,position,snp_base,chromosome
#   A_comp0_c0_seq1,C,519,A,2A
#
# The input string is not modified, so frozen strings are accepted.
# The two bases are upper-cased; position, bases and chromosome are stripped
# of surrounding whitespace; the position is converted with to_i.
# use_reference is set to false on the returned object.
def self.parse(reg_str)
  snp = SNP.new
  snp.gene, snp.original, snp.position, snp.snp, snp.chromosome = reg_str.chomp.split(",")
  snp.position = snp.position.strip.to_i
  snp.original = snp.original.upcase.strip
  snp.snp = snp.snp.upcase.strip
  snp.chromosome = snp.chromosome.strip
  snp.use_reference = false
  snp
end
44
+
45
# Builds a SNP from one tab-separated VCF record.
#
# Example record:
#   IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
#
# Columns read: contig (0), position (1), id (2), reference base (3),
# alternative base (4) and, when present, INFO (7).
# chr_arm_parser is called with the contig name; its result, stripped in
# place, becomes the chromosome. The orientation is :forward unless the INFO
# column contains the pair OR=reverse.
def self.parseVCF(vcf_line, chr_arm_parser: Bio::PolyploidTools::ChromosomeArm.getArmSelection("first_two") )
  contig, pos, marker_id, ref, alt, _qual, _filter, info = vcf_line.split("\t")

  snp = SNP.new
  snp.gene = marker_id
  snp.contig = contig
  snp.chromosome = chr_arm_parser.call(contig)
  snp.position = pos.strip.to_i
  snp.original = ref.upcase.strip
  snp.snp = alt.upcase.strip
  # In-place strip: the object returned by the parser is the one that is kept.
  snp.chromosome.strip!

  reversed = false
  if info
    reversed = info.scan(/(\w+)=([\w|.]+)/).any? { |id, value| id == "OR" && value == "reverse" }
  end
  snp.orientation = reversed ? :reverse : :forward
  snp
end
74
+
75
# Fetches the sequence around the SNP from an indexed FASTA file and stores
# it as template_sequence, with the SNP base replaced by the value that
# Bio::NucleicAcid.to_IUAPC returns for the two alleles.
#
# The entry looked up is @contig when set, otherwise gene. The window spans
# flanking_size bases before the position and flanking_size + 1 after it,
# clamped to 1 and to the length of the indexed entry.
# Side effect: @position is rewritten relative to the start of the window
# (never lower than 1).
def setTemplateFromFastaFile(fastaFile ,flanking_size: 100)
  window = Bio::DB::Fasta::Region.new
  window.entry = @contig || gene
  window.orientation = :forward
  window.start = position - flanking_size
  window.end = position + flanking_size + 1

  # Clamp the window to the boundaries of the indexed entry.
  indexed = fastaFile.index.region_for_entry(window.entry)
  window.start = 1 if window.start < 1
  window.end = indexed.length if window.end > indexed.length

  ambiguity = Bio::NucleicAcid.to_IUAPC("#{original}#{snp}")

  @position = [@position - window.start + 1, 1].max
  self.template_sequence = fastaFile.fetch_sequence(window)
  template_sequence[position - 1] = ambiguity
end
93
+
94
# Sets the defaults of a new SNP: three genomes, a minimum primer3 sequence
# length of 50, no variation-free region, no contig, at most 8 hits, an exon
# list that creates an empty array per key on first access, an empty error
# list, and a hit count of 0 with the repetitive flag off.
def initialize
  @contig = false
  @repetitive = false
  @errors = []
  @exon_list = Hash.new { |list, arm| list[arm] = [] }
  @genomes_count = 3
  @hit_count = 0
  @max_hits = 8
  @primer_3_min_seq_length = 50
  @variation_free_region = 0
end
105
+
106
# Returns [start, total, new_position] for a window around the SNP.
#
# start is position - flanking_size + 1, floored at 0.
# total is the given total (or twice flanking_size when omitted) plus one.
# new_position is position - start + 2.
def to_polymarker_coordinates(flanking_size, total:nil)
  first = [position - flanking_size + 1, 0].max
  length = (total || flanking_size * 2) + 1
  [first, length, position - first + 2]
end
114
+
115
# Returns the upper-cased template around the SNP with the SNP base written
# as "[original/snp]".
#
# For :reverse orientation the template and both alleles are
# reverse-complemented and the SNP index is mirrored before the marker is
# inserted. The slice starts at position - flanking_size - 1 (floored at 0)
# and is total + 5 characters long, where total defaults to twice
# flanking_size.
def to_polymarker_sequence(flanking_size, total:nil)
  sequence = template_sequence.clone
  marker = "[#{original}/#{snp}]"
  index = position - 1

  if orientation == :reverse
    index = sequence.length - index - 1
    rc_original = Bio::Sequence::NA.new(original).reverse_complement
    rc_snp = Bio::Sequence::NA.new(snp).reverse_complement
    sequence = Bio::Sequence::NA.new(sequence).reverse_complement
    marker = "[#{rc_original}/#{rc_snp}]"
  end

  sequence[index] = marker
  first = [position - flanking_size - 1, 0].max
  length = (total || flanking_size * 2) + 5
  sequence[first, length].upcase
end
136
+
137
# Returns the original base, the position and the SNP base concatenated,
# e.g. "C519A".
def snp_id_in_seq
  format("%s%s%s", original, position, snp)
end
140
+
141
+ #We Only want the chromosome, we drop the arm.
142
+ #We don't use this any more.
143
+ #def chromosome= (chr)
144
+ # @chromosome = chr
145
+ #end
146
+
147
# Returns the first character of chromosome (index 0).
def chromosome_group
  chromosome.slice(0)
end
150
+
151
# Returns the character at index 3 of chromosome, or nil when the name has
# fewer than four characters.
#
# Only one definition is kept: a second one returning chromosome[1] sat
# directly above this one and was dead code, because Ruby replaces a method
# when it is defined again in the same class.
# NOTE(review): index 3 is surprising next to chromosome_group (index 0) —
# confirm which character is meant to carry the genome.
def chromosome_genome
  chromosome[3]
end
159
+
160
# Returns a FASTA record: the gene as header, the template sequence as body,
# each followed by a newline.
def to_fasta
  format(">%s\n%s\n", gene, template_sequence)
end
163
+
164
# Registers an exon under the given arm.
#
# With filter_best (the default) and an exon already stored for the arm, the
# stored list is replaced by the new exon only when its record score is
# strictly higher than the score of the first stored exon. Otherwise the
# exon is appended.
def add_exon(exon, arm, filter_best: true)
  exon_list[arm] ||= []
  stored = exon_list[arm]
  return stored << exon unless filter_best && !stored.empty?

  exon_list[arm] = [exon] if exon.record.score > stored.first.record.score
end
173
+
174
# Returns the forward region of the gene that surrounds the SNP.
#
# With use_reference the region is position +/- flanking_size (start floored
# at 1) and is rebuilt on every call. Otherwise the region is memoized in
# @covered_region: it starts as the SNP position itself, is widened to
# position +/- flanking_size when no exons are loaded, and is then extended
# to cover the query region of every loaded exon.
def covered_region
  return @covered_region if @covered_region

  region = Bio::DB::Fasta::Region.new
  region.entry = gene
  region.orientation = :forward

  if use_reference
    region.start = [position - flanking_size, 1].max
    region.end = position + flanking_size
    return region
  end

  lower = @position
  upper = @position
  if @exon_list.size == 0
    lower = [position - flanking_size, 1].max
    upper = position + flanking_size
  end

  @exon_list.each do |_arm, exons|
    exons.each do |exon|
      query = exon.query_region
      lower = query.start if query.start < lower
      upper = query.end if query.end > upper
    end
  end

  region.start = lower
  region.end = upper
  @covered_region = region
end
213
+
214
# Returns flanking_size minus the local SNP position, plus one.
def left_padding
  1 + flanking_size - local_position
end
219
+
220
# Returns what is left of twice flanking_size after subtracting the left
# padding and the size of the covered region, never less than 0.
def right_padding
  used = left_padding + covered_region.size
  [(2 * flanking_size) - used, 0].max
end
225
+
226
# Returns the SNP position relative to the start of the covered region.
def local_position
  region_start = covered_region.start
  position - region_start
end
230
+
231
# Shifts pos by the left padding.
def padded_position(pos)
  offset = left_padding
  pos + offset
end
234
+
235
# Returns a multi-FASTA string with, in order:
#   * one record per surrounding parental sequence, named
#     "<covered region entry>-<position>_<parent name>";
#   * one record per surrounding exon sequence, named after its key;
#   * a ">Mask" record with the masked chromosomal SNPs for chromosome;
#   * the FASTA of the primer region for chromosome and snp_in.
def primer_fasta_string
  gene_region = covered_region
  fasta = ""

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{gene_region.entry}-#{position}_#{name}\n#{seq}\n"
  end

  # The block parameter is deliberately not called "chromosome", so it does
  # not hide the attribute that is used further down.
  surrounding_exon_sequences.each do |exon_name, exon_seq|
    fasta << ">#{exon_name}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
256
+
257
# Builds the PrimerRegion for a target chromosome and a parental sequence.
#
# The aligned parental sequence is walked column by column together with the
# aligned target sequence (all gaps when the target is missing) and the mask
# from mask_aligned_chromosomal_snp. Columns where both sequences are gaps
# are skipped; every other column advances the position in the region.
# Per mask character:
#   '&'       marks the SNP (homoeologous = false);
#   ':'       marks the SNP (homoeologous = true);
#   '-'       copies the target base when the parental base is not
#             unambiguous;
#   uppercase records a chromosome specific position (intron list when the
#             parental has a gap) and writes the mask base into the parental;
#   lowercase records an almost chromosome specific position in the same way,
#             writing the mask base upper-cased when the parental has a base.
# The resulting parental sequence, without gaps, becomes the region sequence.
def primer_region(target_chromosome, parental )
  parental = aligned_sequences[parental].downcase
  target_chromosome = get_target_sequence(aligned_sequences.keys, target_chromosome)

  chromosome_seq = (aligned_sequences[target_chromosome] || "-" * parental.size).downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  region_index = 0

  parental.size.times do |column|
    next if chromosome_seq[column] == '-' && parental[column] == '-'

    mark = mask[column]
    case mark
    when '&'
      region.snp_pos = region_index
      region.homoeologous = false
    when ':'
      region.snp_pos = region_index
      region.homoeologous = true
    when '-'
      # No SNP detected by the mask: keep the parental base unless it is ambiguous.
      unless Bio::NucleicAcid::is_unambiguous(parental[column])
        parental[column] = chromosome_seq[column]
      end
    when /[[:upper:]]/
      # Good candidate; the parental base has to accept the chromosomal variation.
      if parental[column] == '-'
        parental[column] = mark
        region.crhomosome_specific_intron << region_index
      elsif Bio::NucleicAcid.is_valid(parental[column], mark)
        parental[column] = mark
        region.chromosome_specific << region_index
      end
    when /[[:lower:]]/
      # Weaker candidate that still gives some specificity.
      if parental[column] == '-'
        parental[column] = mark
        region.almost_crhomosome_specific_intron << region_index
      elsif Bio::NucleicAcid.is_valid(parental[column], mark)
        parental[column] = mark.upcase
        region.almost_chromosome_specific << region_index
      end
    end

    region_index += 1
  end

  region.sequence = parental.gsub('-', '')
  region
end
314
+
315
# Returns the reverse complement of a nucleotide string as a new string.
# IUPAC ambiguity codes are complemented too and letter case is preserved;
# characters outside the translation table are left as they are.
def reverse_complement_string(sequenc_str)
  sequenc_str.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN').reverse
end
319
+
320
# Builds the primer3 input record(s) for one candidate.
#
# Options read from opts: :name, :sequence, :left_pos, :right_pos, :extra,
# :extra_f and :extra_r.
#
# With :right_pos a single record is produced. When the left position lies
# after the right one, both positions are mirrored and the template is
# reverse-complemented. When @variation_free_region is positive, nil is
# returned if the @variation_free_region characters after the right
# position are not all lower case.
#
# Without :right_pos two records are produced: the forward template and its
# reverse complement with the left position mirrored. :extra_f / :extra_r
# replace the forced left end of the forward / reverse record.
def return_primer_3_string(opts={})
  template = opts[:sequence]
  left_end = opts[:left_pos]
  right_end = opts[:right_pos]
  extra = opts[:extra]
  paired = opts[:right_pos] ? true : false
  strand = "forward"

  if paired
    if left_end > right_end
      left_end = template.size - left_end - 1
      right_end = template.size - right_end - 1
      template = reverse_complement_string(template)
      strand = "reverse"
    end
    if @variation_free_region > 0
      downstream = template[right_end + 1, @variation_free_region]
      return nil unless downstream == downstream.downcase
    end
  end

  record = "SEQUENCE_ID=#{opts[:name]} #{strand} \n"
  record << "SEQUENCE_FORCE_LEFT_END=#{left_end}\n" unless opts[:extra_f]
  record << "SEQUENCE_FORCE_RIGHT_END=#{right_end}\n" if paired
  record << extra if extra
  record << opts[:extra_f] if opts[:extra_f]
  record << "SEQUENCE_TEMPLATE=#{template}\n"
  record << "=\n"
  return record if paired

  # No right primer was given, so the opposite orientation is emitted as well.
  template = opts[:sequence]
  left_end = template.size - left_end - 1
  strand = "reverse"
  template = reverse_complement_string(template)
  record << "SEQUENCE_ID=#{opts[:name]} #{strand}\n"
  record << "SEQUENCE_FORCE_LEFT_END=#{left_end}\n" unless opts[:extra_r]
  record << opts[:extra_r] if opts[:extra_r]
  record << "SEQUENCE_TEMPLATE=#{template}\n"
  record << extra if extra
  record << "=\n"
  record
end
372
+
373
+
374
# Returns the list of primer3 input records for this SNP.
#
# An empty list is returned, and a message appended to errors, when the
# primer region sequence is shorter than primer_3_min_seq_length or when
# hit_count exceeds max_hits. In the second case the repetitive flag is set
# as well (it used to be assigned to a local variable and was lost).
#
# Otherwise @snp_type is set from the primer region and, for every candidate
# position of the primer region, two records are produced: one with the
# original base and one with the SNP base at the SNP position. The candidate
# records are left out altogether when there are more than
# max_specific_primers candidates in total. A final non-specific pair
# without right position is always added. Entries can be nil when
# return_primer_3_string rejects a candidate.
def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20 )
  pr = primer_region(target_chromosome, parental )
  primer_3_propertes = []

  seq_original = String.new(pr.sequence)
  if seq_original.size < primer_3_min_seq_length
    errors << "The sequence (#{seq_original.size}) is shorter than #{primer_3_min_seq_length}"
    return primer_3_propertes
  end

  if hit_count > max_hits
    errors << "The marker maps to #{hit_count} positions (max_hits: #{max_hits}). "
    @repetitive = true
    return primer_3_propertes
  end

  seq_original[pr.snp_pos] = original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = snp

  @snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  marker = "#{gene}:#{original}#{position}#{snp}"

  # Appends the record for the original allele followed by the one for the SNP allele.
  add_pair = lambda do |original_label, snp_label, right_pos|
    args = { :name => "#{marker} #{original_name} #{original_label} #{@snp_type} #{chromosome}",
             :left_pos => pr.snp_pos, :sequence => seq_original }
    args[:right_pos] = right_pos unless right_pos.nil?
    primer_3_propertes << return_primer_3_string(args)
    args[:name] = "#{marker} #{snp_in} #{snp_label} #{@snp_type} #{chromosome}"
    args[:sequence] = seq_snp
    primer_3_propertes << return_primer_3_string(args)
  end

  # [label for the original allele, label for the SNP allele, candidate positions]
  # NOTE(review): the SNP-allele records of both intron groups are labelled
  # "exon". This is kept as it always was; confirm whether it is intended.
  candidate_groups = [
    ["chromosome_specific exon",       "chromosome_specific exon",     pr.chromosome_specific],
    ["chromosome_specific intron",     "chromosome_specific exon",     pr.crhomosome_specific_intron],
    ["chromosome_semispecific exon",   "chromosome_semispecific exon", pr.almost_chromosome_specific],
    ["chromosome_semispecific intron", "chromosome_semispecific exon", pr.almost_crhomosome_specific_intron]
  ]

  total_candidates = candidate_groups.inject(0) { |count, group| count + group.last.size }
  unless total_candidates > max_specific_primers
    candidate_groups.each do |original_label, snp_label, positions|
      positions.each { |pos| add_pair.call(original_label, snp_label, pos) }
    end
  end

  add_pair.call("chromosome_nonspecific all", "chromosome_nonspecific all", nil)
  primer_3_propertes
end
456
+
457
# Returns "<gene>:<original><position><snp><chromosome>".
def to_s
  format("%s:%s%s%s%s", gene, original, position, snp, chromosome)
end
460
+
461
# Returns the original base, the position and the SNP base concatenated and
# upper-cased.
def short_s
  format("%s%s%s", original, position, snp).upcase
end
464
+
465
# Returns every record from primer_3_all_strings joined into one string.
def primer_3_string(target_chromosome, parental, max_specific_primers: 20)
  primer_3_all_strings(target_chromosome, parental, max_specific_primers: max_specific_primers).join
end
469
+
470
# Returns the exon list entry stored for a chromosome. When the entry is
# missing a notice is printed to standard output and the missing value
# itself is returned.
def exon_for_chromosome(chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
475
+
476
# Returns (and memoizes) one sequence per parent over the covered region.
#
# A parent with a BAM uses the consensus with ambiguities of that BAM; a
# parent without one uses the gene model sequence, with the original base
# written at the SNP position unless the parent is snp_in. In every case the
# base at the SNP position is upper-cased, and replaced by the SNP base for
# the parent named snp_in.
def parental_sequences
  return @parental_sequences if @parental_sequences

  region = covered_region
  snp_index = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    if bam
      seq = bam.consensus_with_ambiguities({:region=>region}).to_s
    else
      seq = container.gene_model_sequence(region)
      seq[snp_index] = original unless carries_snp
    end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
499
+
500
+
501
+
502
+
503
# Returns (and memoizes) one sequence per parent, cut and padded to the
# primer region.
#
# A parent with a BAM uses the consensus with ambiguities of that BAM; a
# parent without one uses the gene model sequence with the SNP base
# (parent snp_in) or the original base (any other parent) at the SNP
# position. In every case the base at the SNP position is upper-cased, and
# replaced by the SNP base for the parent named snp_in, before the sequence
# goes through cut_and_pad_sequence_to_primer_region.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  region = covered_region
  snp_index = local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    if bam
      seq = bam.consensus_with_ambiguities({:region=>region}).to_s
    else
      seq = container.gene_model_sequence(region)
      seq[snp_index] = carries_snp ? snp : original
    end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @surrounding_parental_sequences[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
531
+
532
# Returns the slice of sequence from local_position - flanking_size to
# local_position + flanking_size (inclusive). The lower bound is floored at
# 0; the upper bound becomes the last index when it is greater than the
# sequence size.
def cut_sequence_to_primer_region(sequence)
  centre = local_position
  first = [centre - flanking_size, 0].max
  last = centre + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
540
+
541
# Like cut_sequence_to_primer_region, but the part of the window that falls
# outside the sequence is filled with '-': one per position below index 0 on
# the left, and (upper bound - sequence size) on the right.
# NOTE(review): on the right side the slice ends at the last index while the
# fill is counted from the sequence size, so a right-padded result is one
# character shorter than a full window — confirm whether that is intended.
def cut_and_pad_sequence_to_primer_region(sequence)
  centre = local_position
  first = centre - flanking_size
  last = centre + flanking_size

  left_fill = first < 0 ? -first : 0
  first = 0 if first < 0

  right_fill = 0
  if last > sequence.size
    right_fill = last - sequence.size
    last = sequence.size - 1
  end

  ("-" * left_fill) << sequence[first..last] << ("-" * right_fill)
end
557
+
558
# Returns (and memoizes) the surrounding parental sequences merged with the
# surrounding exon sequences.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
562
+
563
# Returns the alignment of sequences_to_align as produced by Bio::MAFFT
# (program "mafft", options --maxiterate 1000 --localpair --quiet).
# The alignment is memoized; when there is nothing to align a new empty
# Hash is returned and nothing is stored.
def aligned_sequences
  return @aligned_sequences if @aligned_sequences

  to_align = sequences_to_align
  return Hash.new if to_align.size == 0

  aligner = Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
  @aligned_sequences = aligner.query_align(to_align).alignment
end
577
+
578
# Returns a multi-FASTA string with one record per aligned sequence (named
# "<to_s>-<name>"), a "MASK <chromosome>" record with the aligned mask, and
# the FASTA of the primer region for chromosome and snp_in.
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self.to_s}-#{name}\n#{seq}\n"
  end

  mask = mask_aligned_chromosomal_snp(chromosome)
  fasta << ">MASK #{chromosome}\n#{mask}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
end
590
+
591
+
592
# Returns the SNP position inside the trimmed window: local_position minus
# the window start, where the window start is local_position - flanking_size
# floored at 0.
def get_snp_position_after_trim
  centre = local_position
  window_start = [centre - flanking_size, 0].max
  centre - window_start
end
603
+
604
# Returns (and memoizes) the column of the SNP in the alignment, or -1 when
# it is not found.
#
# The aligned strings of the parental sequences are collected (a warning is
# written to standard error when there are not exactly two). The column is
# found by walking the first parental string and counting bases that are not
# gaps until the trimmed SNP position is reached. A second warning is
# written when both parental strings carry the same character there.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |parent| aligned_sequences[parent] }
  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  remaining = get_snp_position_after_trim
  column = -1
  parental_strings[0].size.times do |i|
    next if parental_strings[0][i] == "-"

    if remaining == 0
      column = i
      if parental_strings[0][i] == parental_strings[1][i]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{i})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end
    remaining -= 1
  end

  @aligned_snp_position = column
end
630
+
631
# Picks the name with the highest score among names.
#
# Only names made of exactly three "_"-separated fields take part; the third
# field, read with to_f, is the score. The first name with the highest score
# above 0 wins. When no name qualifies, chromosome is returned unchanged.
def get_target_sequence(names, chromosome)
  winner, = names.inject([chromosome, 0]) do |(best, top_score), name|
    fields = name.split("_")
    score = fields.length == 3 ? fields[2].to_f : nil
    score && score > top_score ? [name, score] : [best, top_score]
  end
  winner
end
647
+
648
+
649
+
650
# Returns the mask of the aligned target sequence for a chromosome.
#
# The target is chosen with get_target_sequence among the aligned sequences
# that are not parental; when it is missing from the alignment the mask
# starts as all '-'. For every column the sequences that are not parental
# and do not have a gap there are examined, and the column becomes:
#   '-'  when no other sequence differs from the target, when only one
#        sequence covers the column, or when any covering base is N/n;
#   '*'  when no sequence covers the column;
#   the target base in upper case when every other sequence differs and
#        genomes_count distinct genomes of the chromosome group cover it;
#   the target base in lower case otherwise.
# The SNP column (aligned_snp_position) is marked '&', or ':' when the
# IUAPC code of the bases found there accepts both the original and the
# SNP base.
#
# Dropped from this method: two warnings whose guard could never be false,
# an unused local, and a per-column recomputation of a loop invariant.
def mask_aligned_chromosomal_snp(chromosome)
  alignment = aligned_sequences
  names = alignment.keys - parental_sequences.keys
  snp_column = aligned_snp_position
  best_target = get_target_sequence(names, chromosome)

  target_seq = alignment[best_target]
  masked_snps = target_seq ? target_seq.downcase : "-" * alignment.values[0].size
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    # Still the unmodified target base; the mask is only written further down.
    target_base = masked_snps[i]
    different = 0
    cov = 0
    from_group = 0
    n_count = 0
    seen = []

    names.each do |chr|
      seq = alignment[chr]
      next unless seq && seq[i] != "-"

      base = seq[i]
      cov += 1
      n_count += 1 if base == 'N' || base == 'n'

      # Each genome (second character of the name) of the group counts once.
      if chr[0] == chromosome_group && !seen.include?(chr[1])
        seen << chr[1]
        from_group += 1
      end

      different += 1 if chr != best_target && target_base.upcase != base.upcase
    end

    masked_snps[i] = "-" if different == 0
    masked_snps[i] = "-" if cov == 1
    masked_snps[i] = "-" if n_count > 0
    masked_snps[i] = "*" if cov == 0
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && from_group == genomes_count

    next unless i == snp_column

    masked_snps[i] = "&"
    bases = names.each_with_object(+"") do |chr, found|
      seq = alignment[chr]
      found << seq[i] if seq && seq[i] != "-"
    end

    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ":"
    end
  end

  masked_snps
end
723
+
724
+
725
# Masks the flanking sequence stored under +chromosome+ in
# surrounding_exon_sequences against all the other flanking sequences.
#
# Gaps ("-") in the reference are first turned into "+". Then, per column:
#   "-"  no other sequence differs (and the column is not "+"), or fewer than
#        two sequences have a base other than "-", "N" or "n" there
#   uppercase base  more than one other sequence differs
#   "&"  the column at index +flanking_size+
# Any other column keeps the reference character.
#
# When there is no entry for +chromosome+ the reference is a run of
# flanking_size * 2 dashes.
#
# The previous version called get_target_sequence(names) with a single
# argument and discarded the result; that method takes two arguments, so the
# call raised ArgumentError on every invocation. It has been removed.
#
# NOTE(review): surrounding_exon_sequences builds its keys as
# "<chromosome>_<start>_<score>", so +chromosome+ has to be such a key to
# find a reference sequence - confirm against the callers.
#
# @param chromosome [String] key of the reference sequence
# @return [String] the masked sequence
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  reference = chromosomes[chromosome]
  masked_snps = reference ? reference.tr("-", "+") : "-" * (flanking_size * 2)
  local_pos_in_gene = flanking_size

  masked_snps.size.times do |i|
    reference_base = masked_snps[i]
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][i]
      next if base == "-" || base == "N" || base == "n"

      cov += 1
      different += 1 if chr != chromosome && reference_base != "+" && reference_base != base
    end

    masked_snps[i] = "-" if different.zero? && masked_snps[i] != "+"
    masked_snps[i] = "-" if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = "&" if i == local_pos_in_gene
  end
  masked_snps
end
756
+
757
# Returns the sequences flanking the SNP on every exon, memoized in
# @surrounding_exon_sequences.
#
# For each exon in exon_list the flanking target region around +position+
# (+flanking_size+ wide, as computed by the exon) is fetched from +container+
# and stored under the key "<chromosome>_<region start>_<record score>".
# Exons whose SNP falls in a gap are left out.
#
# The unused +gene_region+ and +exon_start_offset+ locals of the previous
# version have been dropped; they did not contribute to the result.
#
# @return [Bio::Alignment::SequenceHash] flanking sequences keyed by name
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      flanking_region = exon.target_flanking_region_from_position(position, flanking_size)
      # TODO: Pad when the exon goes over the regions...
      # Ignoring when the exon is in a gap
      next if exon.snp_in_gap

      key = "#{chromosome}_#{flanking_region.start}_#{exon.record.score}"
      @surrounding_exon_sequences[key] = container.chromosome_sequence(flanking_region)
    end
  end
  @surrounding_exon_sequences
end
776
+
777
+
778
# Returns the exon sequences laid out on the coordinates of the covered gene
# region, memoized in @exon_sequences.
#
# Every exon sequence is fetched from +container+, prefixed with one "-" per
# position between the start of the covered region and the start of the
# exon's query region, has the base at +local_position+ upcased, and is
# right-padded with "-" (gene region size - current length + 1 characters).
# It is stored under "<chromosome>_<query start>_<record score>". Exons whose
# SNP falls in a gap are not stored. If nothing was stored under @chromosome,
# an all-"-" string of the gene region size is added for it.
#
# @return [Bio::Alignment::SequenceHash] padded sequences keyed by name
def exon_sequences
  return @exon_sequences if @exon_sequences

  gene_region = covered_region
  snp_index = local_position
  @exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      leading_gap = exon.query_region.start - gene_region.start
      padded = "-" * leading_gap
      padded << container.chromosome_sequence(exon.target_region).to_s
      next if exon.snp_in_gap

      padded[snp_index] = padded[snp_index].upcase
      padded << "-" * (gene_region.size - padded.size + 1)
      @exon_sequences["#{chromosome}_#{exon.query_region.start}_#{exon.record.score}"] = padded
    end
  end
  @exon_sequences[@chromosome] ||= "-" * gene_region.size
  @exon_sequences
end
803
+ end
804
+ end