bio-polyploid-tools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +16 -0
  3. data/Gemfile.lock +67 -0
  4. data/README +21 -0
  5. data/Rakefile +61 -0
  6. data/VERSION +1 -0
  7. data/bin/bfr.rb +133 -0
  8. data/bin/count_variations.rb +36 -0
  9. data/bin/filter_blat_by_target_coverage.rb +15 -0
  10. data/bin/find_best_blat_hit.rb +32 -0
  11. data/bin/hexaploid_primers.rb +168 -0
  12. data/bin/homokaryot_primers.rb +155 -0
  13. data/bin/map_markers_to_contigs.rb +66 -0
  14. data/bin/markers_in_region.rb +42 -0
  15. data/bin/polymarker.rb +219 -0
  16. data/bin/snps_between_bams.rb +106 -0
  17. data/bio-polyploid-tools.gemspec +139 -0
  18. data/conf/defaults.rb +1 -0
  19. data/conf/primer3_config/dangle.dh +128 -0
  20. data/conf/primer3_config/dangle.ds +128 -0
  21. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  22. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  23. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  24. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  25. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  26. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  27. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  28. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  29. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  30. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  31. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  32. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  33. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  34. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  35. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  36. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  37. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  38. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  39. data/conf/primer3_config/loops.dh +30 -0
  40. data/conf/primer3_config/loops.ds +30 -0
  41. data/conf/primer3_config/stack.dh +256 -0
  42. data/conf/primer3_config/stack.ds +256 -0
  43. data/conf/primer3_config/stackmm.dh +256 -0
  44. data/conf/primer3_config/stackmm.ds +256 -0
  45. data/conf/primer3_config/tetraloop.dh +77 -0
  46. data/conf/primer3_config/tetraloop.ds +77 -0
  47. data/conf/primer3_config/triloop.dh +16 -0
  48. data/conf/primer3_config/triloop.ds +16 -0
  49. data/conf/primer3_config/tstack.dh +256 -0
  50. data/conf/primer3_config/tstack2.dh +256 -0
  51. data/conf/primer3_config/tstack2.ds +256 -0
  52. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  53. data/lib/bio/BFRTools.rb +698 -0
  54. data/lib/bio/BIOExtensions.rb +186 -0
  55. data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
  56. data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
  57. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  58. data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
  59. data/lib/bio/PolyploidTools/SNP.rb +681 -0
  60. data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
  61. data/lib/bio/SAMToolsExtensions.rb +284 -0
  62. data/lib/bio/db/exonerate.rb +272 -0
  63. data/lib/bio/db/fastadb.rb +164 -0
  64. data/lib/bio/db/primer3.rb +673 -0
  65. data/lib/bioruby-polyploid-tools.rb +25 -0
  66. data/test/data/BS00068396_51.fa +2 -0
  67. data/test/data/BS00068396_51_contigs.aln +1412 -0
  68. data/test/data/BS00068396_51_contigs.dnd +7 -0
  69. data/test/data/BS00068396_51_contigs.fa +8 -0
  70. data/test/data/BS00068396_51_exonerate.tab +6 -0
  71. data/test/data/BS00068396_51_genes.txt +14 -0
  72. data/test/data/LIB1716.bam +0 -0
  73. data/test/data/LIB1716.bam.bai +0 -0
  74. data/test/data/LIB1719.bam +0 -0
  75. data/test/data/LIB1719.bam.bai +0 -0
  76. data/test/data/LIB1721.bam +0 -0
  77. data/test/data/LIB1721.bam.bai +0 -0
  78. data/test/data/LIB1722.bam +0 -0
  79. data/test/data/LIB1722.bam.bai +0 -0
  80. data/test/data/S22380157.fa +16 -0
  81. data/test/data/S22380157.fa.fai +1 -0
  82. data/test/data/Test3Aspecific.csv +1 -0
  83. data/test/data/Test3Aspecific_contigs.fa +6 -0
  84. data/test/data/patological_cases5D.csv +1 -0
  85. data/test/data/short_primer_design_test.csv +10 -0
  86. data/test/data/test_primer3_error.csv +4 -0
  87. data/test/data/test_primer3_error_contigs.fa +10 -0
  88. data/test/test_bfr.rb +51 -0
  89. data/test/test_exon_container.rb +17 -0
  90. data/test/test_exonearate.rb +53 -0
  91. data/test/test_snp_parsing.rb +40 -0
  92. metadata +201 -0
@@ -0,0 +1,22 @@
1
+ module Bio::PolyploidTools
2
# Value object describing the primer template built around a SNP.
#
# The four position lists hold 0-based offsets into +sequence+ that can be
# used to anchor a primer. They are filled by the code that builds the
# region; this class only stores and reports them.
class PrimerRegion
  # NOTE: the "crhomosome" spelling is part of the public interface and is
  # kept for existing callers; correctly spelled aliases are defined below.
  attr_accessor :snp_pos, :sequence, :chromosome_specific, :almost_chromosome_specific,
                :crhomosome_specific_intron, :almost_crhomosome_specific_intron, :homeologous

  # Correctly spelled names for the two intron lists.
  alias_method :chromosome_specific_intron, :crhomosome_specific_intron
  alias_method :chromosome_specific_intron=, :crhomosome_specific_intron=
  alias_method :almost_chromosome_specific_intron, :almost_crhomosome_specific_intron
  alias_method :almost_chromosome_specific_intron=, :almost_crhomosome_specific_intron=

  # Starts with four empty position lists; +snp_pos+, +sequence+ and
  # +homeologous+ stay nil until assigned.
  def initialize
    @chromosome_specific = []
    @almost_chromosome_specific = []
    @crhomosome_specific_intron = []
    @almost_crhomosome_specific_intron = []
  end

  # Number of exon-level candidate positions (specific + almost specific).
  # The intron lists are not counted.
  def tail_candidates
    @chromosome_specific.size + @almost_chromosome_specific.size
  end

  # FASTA record whose header lists the SNP offset followed by the four
  # candidate lists (rendered with Array#to_s), then the sequence.
  def to_fasta
    header = [snp_pos, chromosome_specific, almost_chromosome_specific,
              crhomosome_specific_intron, almost_crhomosome_specific_intron].map(&:to_s).join("_")
    ">Primer_#{header}\n#{sequence}\n"
  end
end
22
+ end
@@ -0,0 +1,681 @@
1
+ require 'bio'
2
+ module Bio::PolyploidTools
3
# Error raised by SNP when a marker cannot be processed
# (for example when no exons have been loaded for it).
SNPException = Class.new(RuntimeError)
5
+ class SNP
6
+
7
# Marker fields, in input order: GENE,ORIGINAL,POS,SNP
attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name,
              :exon_list, :container,
              :flanking_size, :ideal_min, :ideal_max,
              :template_sequence, :use_reference, :genomes_count

# Read-only here; the writer is defined by hand further down.
attr_reader :chromosome
17
+
18
# Builds a SNP from one comma-separated marker line.
#
# Format:
#   Gene_name,Original,SNP_Pos,SNP,chromosome
#   A_comp0_c0_seq1,C,519,A,2AL
#
# The input string is not modified. Whitespace around the base, position and
# chromosome fields is ignored and the bases are upper-cased. The chromosome
# field is optional; when absent the SNP's chromosome is left unset.
#
# @param reg_str [String] one marker line, with or without trailing newline
# @return [SNP] with an empty exon list and +use_reference+ set to false
# @raise [SNPException] when the line has fewer than four fields
def self.parse(reg_str)
  line = reg_str.chomp
  gene, original, position, variant, chromosome = line.split(",")
  raise SNPException, "Expected at least 4 comma-separated fields in '#{line}'" if variant.nil?

  snp = SNP.new
  snp.gene = gene
  # Non-destructive calls: the bang variants return nil when nothing changes,
  # which used to turn a clean "519" into position 0.
  snp.original = original.strip.upcase
  snp.position = position.strip.to_i
  snp.snp = variant.strip.upcase
  # Strip before the setter truncates the name, so " 1AL" keeps "1A".
  snp.chromosome = chromosome.strip if chromosome
  snp.exon_list = {}
  snp.use_reference = false
  snp
end
33
+
34
# A new SNP assumes three genomes.
# TODO: to support other polyploids this must become a setting of the
# main script instead of a fixed value.
def initialize
  @genomes_count = 3
end
37
+
38
+
39
# Stores only the chromosome: the first two characters of +chr+, dropping
# the arm ("1AL" is kept as "1A"). Surrounding whitespace is ignored so it
# cannot eat into the two kept characters, and nil clears the value instead
# of raising.
#
# @param chr [String, nil] chromosome or chromosome-arm name
def chromosome=(chr)
  @chromosome = chr.nil? ? nil : chr.to_s.strip[0, 2]
end
43
+
44
# First character of the stored chromosome name.
def chromosome_group
  chromosome.slice(0)
end
47
+
48
# Second character of the stored chromosome name (the "A" of "1A").
#
# This method used to be defined twice; the later definition read index 3,
# which can never exist because the writer keeps only two characters, so
# the method always returned nil. A single definition is kept.
#
# @return [String, nil] nil when the stored name has fewer than two characters
def chromosome_genome
  chromosome[1]
end
56
+
57
# FASTA record with the gene as header and the template sequence as body.
def to_fasta
  ">#{gene}\n#{template_sequence}\n"
end
60
+
61
# Registers +exon+ as the hit for +arm+, keeping only the best-scoring exon
# per arm (an equal score does not replace the stored one).
#
# The exon list is created on demand, so this also works on a SNP that was
# built with SNP.new rather than SNP.parse.
#
# @param exon [#record] object whose +record.score+ is comparable
# @param arm  [Object] key under which the exon is stored
def add_exon(exon, arm)
  @exon_list ||= {}
  current = @exon_list[arm]
  @exon_list[arm] = exon if current.nil? || exon.record.score > current.record.score
end
65
+
66
# Region of the gene this SNP works on.
#
# With +use_reference+ set, the region is the SNP position plus/minus
# +flanking_size+, with the start clamped to 1; that region is rebuilt on
# every call. Otherwise it spans from the smallest start to the largest end
# among the query regions of the loaded exons (always including the SNP
# position itself) and is cached.
#
# Raises SNPException when no exons are loaded and +use_reference+ is off.
def covered_region
  return @covered_region if @covered_region

  if use_reference
    window = Bio::DB::Fasta::Region.new
    window.entry = gene
    window.orientation = :forward
    window.start = [position - flanking_size, 1].max
    window.end = position + flanking_size
    return window
  end

  raise SNPException, "Exons haven't been loaded for #{self.to_s}" if @exon_list.empty?

  starts = [@position]
  ends = [@position]
  @exon_list.each_value do |exon|
    query = exon.query_region
    starts << query.start
    ends << query.end
  end

  span = Bio::DB::Fasta::Region.new
  span.entry = gene
  span.orientation = :forward
  span.start = starts.min
  span.end = ends.max
  @covered_region = span
end
101
+
102
# Flanking size minus the SNP offset inside the covered region, plus one.
def left_padding
  1 + flanking_size - local_position
end
107
+
108
# What is left of a 2 * flanking_size window once the left padding and the
# covered region are accounted for; never negative.
def right_padding
  remaining = (2 * flanking_size) - (left_padding + covered_region.size)
  [remaining, 0].max
end
113
+
114
# Offset of the SNP from the start of the covered region.
def local_position
  region_start = covered_region.start
  position - region_start
end
118
+
119
# Shifts +pos+ right by the left padding.
def padded_position(pos)
  left_padding + pos
end
122
+
123
# Multi-FASTA text with, in order: one record per parent (header carries the
# covered region and the 1-based SNP offset), one record per exon sequence,
# and a final "Mask" record for this SNP's chromosome.
#
# The mask used to be requested as masked_chromosomal_snps("1BS",
# flanking_size): two arguments to a one-parameter method (ArgumentError)
# and a hard-coded arm. It is now computed for the SNP's own chromosome.
#
# @return [String]
def exon_fasta_string
  gene_region = covered_region
  snp_offset = local_position
  fasta = ""

  container.parents.each do |name, _bam|
    fasta << ">#{gene_region.entry}-#{position}_#{name} " \
             "Overlapping_exons:#{gene_region} localSNPpo:#{snp_offset + 1}\n"
    fasta << parental_sequences[name] << "\n"
  end

  exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{masked_chromosomal_snps(chromosome)}\n"
  fasta
end
139
+
140
+
141
# Multi-FASTA text for the primer window: the trimmed parental sequences,
# the trimmed exon sequences, a "Mask" record and finally the primer region
# computed for this SNP's chromosome and the parent carrying the SNP.
def primer_fasta_string
  region = covered_region
  local_position # result unused; still evaluated, as the original did

  fasta = ""
  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{region.entry}-#{position}_#{name}\n" << "#{seq}\n"
  end

  surrounding_exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
166
+
167
# Builds the PrimerRegion for +target_chromosome+ using the aligned sequence
# of the given parent as template.
#
# Columns where both the parental and the chromosome sequence are gaps are
# skipped; every other column advances the offset recorded in the region.
# What happens in a column depends on the mask character:
#   '&'        SNP position, flagged as not homeologous
#   ':'        SNP position, flagged as homeologous
#   '-'        no difference detected; an ambiguous parental base is
#              replaced by the chromosome base
#   uppercase  candidate recorded as chromosome specific
#   lowercase  candidate recorded as almost chromosome specific
# For the last two, a gap in the parent goes to the corresponding intron
# list; otherwise the candidate is kept only if Bio::NucleicAcid.is_valid
# accepts the mask base against the parental base.
def primer_region(target_chromosome, parental)
  parental = aligned_sequences[parental].downcase
  target_seq = aligned_sequences[target_chromosome]
  target_seq = "-" * parental.size unless target_seq
  target_seq = target_seq.downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  offset = 0
  parental.size.times do |col|
    next if target_seq[col] == '-' && parental[col] == '-'

    case mask[col]
    when '&'
      region.snp_pos = offset
      region.homeologous = false
    when ':'
      region.snp_pos = offset
      region.homeologous = true
    when '-'
      parental[col] = target_seq[col] unless Bio::NucleicAcid::is_unambiguous(parental[col])
    when /[[:upper:]]/
      if parental[col] == '-'
        parental[col] = mask[col]
        region.crhomosome_specific_intron << offset
      elsif Bio::NucleicAcid.is_valid(parental[col], mask[col])
        parental[col] = mask[col]
        region.chromosome_specific << offset
      end
    when /[[:lower:]]/
      if parental[col] == '-'
        parental[col] = mask[col]
        region.almost_crhomosome_specific_intron << offset
      elsif Bio::NucleicAcid.is_valid(parental[col], mask[col])
        parental[col] = mask[col].upcase
        region.almost_chromosome_specific << offset
      end
    end
    offset += 1
  end

  region.sequence = parental.gsub('-', '')
  region
end
221
+
222
# Reverse complement of a nucleotide string. Both cases and the IUPAC
# ambiguity codes in the translation table are complemented; any other
# character (such as '-') is kept. The argument is not modified.
def reverse_complement_string(sequenc_str)
  sequenc_str.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN').reverse
end
226
+
227
# Renders primer3 input records (each terminated by a "=" line).
#
# Options read: :name, :sequence, :left_pos and, optionally, :right_pos.
# With :right_pos a single record is written; when the left end lies after
# the right end, both ends are mirrored, the template is reverse
# complemented and the record is tagged "reverse". Without :right_pos two
# records are written: the forward one and its reverse-complement twin.
def return_primer_3_string(opts={})
  left = opts[:left_pos]
  right = opts[:right_pos]
  sequence = opts[:sequence]
  orientation = "forward"

  if opts[:right_pos] && left > right
    left, right = sequence.size - left - 1, sequence.size - right - 1
    sequence = reverse_complement_string(sequence)
    orientation = "reverse"
  end

  record = "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
  record << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
  record << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
  record << "SEQUENCE_TEMPLATE=#{sequence}\n"
  record << "=\n"
  return record if opts[:right_pos]

  # No right primer requested: also emit the opposite orientation.
  sequence = opts[:sequence]
  left = sequence.size - left - 1
  orientation = "reverse"
  sequence = reverse_complement_string(sequence)
  record << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
  record << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
  record << "SEQUENCE_TEMPLATE=#{sequence}\n"
  record << "=\n"
  record
end
266
+
267
+
268
# Builds every primer3 input record for this SNP on +target_chromosome+.
#
# For each candidate position of the primer region, two records are
# produced: one with the original base (named after +original_name+) and
# one with the SNP base (named after +snp_in+). The candidate lists are
# visited in this order: chromosome specific exon, semispecific exon,
# specific intron, semispecific intron. A final pair without a right end
# is labelled "chromosome_nonspecific all".
#
# The SNP-allele records of both intron lists used to be labelled "exon"
# (copy/paste slip); both records of a pair now carry the same label.
#
# @return [Array<String>] primer3 records in the order described above
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental)

  seq_original = pr.sequence.dup
  seq_original[pr.snp_pos] = original
  seq_snp = pr.sequence.dup
  seq_snp[pr.snp_pos] = snp

  snp_type = pr.homeologous ? "homeologous" : "non-homeologous"
  marker = "#{gene}:#{original}#{position}#{snp}"
  # [line name, template] for the two alleles, original first.
  alleles = [[original_name, seq_original], [snp_in, seq_snp]]

  candidate_sets = [
    ["chromosome_specific exon", pr.chromosome_specific],
    ["chromosome_semispecific exon", pr.almost_chromosome_specific],
    ["chromosome_specific intron", pr.crhomosome_specific_intron],
    ["chromosome_semispecific intron", pr.almost_crhomosome_specific_intron]
  ]

  primer_3_propertes = []
  candidate_sets.each do |label, positions|
    positions.each do |pos|
      alleles.each do |line, template|
        primer_3_propertes << return_primer_3_string(
          :name => "#{marker} #{line} #{label} #{snp_type} #{chromosome}",
          :left_pos => pr.snp_pos, :right_pos => pos, :sequence => template
        )
      end
    end
  end

  alleles.each do |line, template|
    primer_3_propertes << return_primer_3_string(
      :name => "#{marker} #{line} chromosome_nonspecific all #{snp_type} #{chromosome}",
      :left_pos => pr.snp_pos, :sequence => template
    )
  end

  primer_3_propertes
end
334
+
335
# Compact identifier: gene, a colon, then original base, position, SNP base
# and chromosome run together.
def to_s
  format("%s:%s%s%s%s", gene, original, position, snp, chromosome)
end
338
+
339
# Upper-cased original base, position and SNP base run together.
def short_s
  [original, position, snp].map(&:to_s).join.upcase
end
342
+
343
# All primer3 records for the SNP concatenated into a single string.
def primer_3_string(target_chromosome, parental)
  primer_3_all_strings(target_chromosome, parental).join
end
347
+
348
# Exon registered for +chromosome+, or nil. A missing exon is reported on
# standard output.
def exon_for_chromosome(chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
353
+
354
# Sequence of the covered region for every parent of the container, cached
# after the first call.
#
# A parent with a BAM uses its consensus with ambiguities; a parent without
# one uses the gene model, with the original base written at the SNP offset
# unless that parent is the one carrying the SNP. In every sequence the base
# at the SNP offset is upper-cased, and the SNP-carrying parent receives the
# SNP base there.
def parental_sequences
  return @parental_sequences if @parental_sequences

  region = covered_region
  snp_index = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    seq = if bam
            bam.consensus_with_ambiguities({:region=>region}).to_s
          else
            model = container.gene_model_sequence(region)
            model[snp_index] = original unless carries_snp
            model
          end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
379
+
380
# Parental sequences trimmed and gap-padded to the primer window around the
# SNP, cached after the first call.
#
# The per-parent sequence is exactly what parental_sequences builds, so it
# is taken from there instead of being rebuilt line for line; this also
# avoids computing each BAM consensus a second time.
#
# @return [Bio::Alignment::SequenceHash] parent name => windowed sequence
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  windowed = Bio::Alignment::SequenceHash.new
  parental_sequences.each do |name, seq|
    windowed[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences = windowed
end
409
+
410
# Slice of +sequence+ from flanking_size before to flanking_size after the
# SNP offset, clipped to the sequence; no padding is added.
def cut_sequence_to_primer_region(sequence)
  first = [local_position - flanking_size, 0].max
  last = local_position + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
418
+
419
# Window of +sequence+ from flanking_size before to flanking_size after the
# SNP offset, padded with '-' wherever the window falls outside the
# sequence. The result is always 2 * flanking_size + 1 characters long.
#
# The right edge used to be off by one: a window reaching or passing the
# end of the sequence came back one character short, so padded and unpadded
# results had different lengths.
#
# @param sequence [String] sequence of the covered region (not modified)
# @return [String]
def cut_and_pad_sequence_to_primer_region(sequence)
  first = local_position - flanking_size
  last = local_position + flanking_size
  width = last - first + 1

  left_pad = [[-first, 0].max, width].min
  # Guard last < 0: a negative range end would count from the end of the string.
  visible = last < 0 ? "" : sequence[[first, 0].max..[last, sequence.size - 1].min].to_s
  (("-" * left_pad) << visible).ljust(width, "-")
end
436
+
437
# Windowed parental sequences merged with the windowed exon sequences;
# computed once and cached.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
443
+
444
# Alignment of sequences_to_align produced by running the "mafft" program
# through Bio::MAFFT; computed once and cached.
def aligned_sequences
  @aligned_sequences ||= begin
    mafft = Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
    mafft.query_align(sequences_to_align).alignment
  end
end
455
+
456
# Multi-FASTA text with every aligned sequence, the mask for this SNP's
# chromosome and the resulting primer region.
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self.to_s}-#{name}\n#{seq}\n"
  end
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
468
+
469
# Column of the SNP inside the alignment, cached after the first call.
#
# The template is the last aligned sequence that does not start with a gap.
# Columns are walked left to right while counting template bases (gaps are
# only counted before the first template base); each column reached while
# that count equals flanking_size is recorded, so the last such column wins.
# Returns -1 when no column qualifies.
#
# Warnings go to standard error when there are not exactly two parents or
# when both parents show the same base at a recorded column.
# TODO: validate the case where every sequence has left padding in the alignment.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parent_rows = parental_sequences.keys.map { |parent| aligned_sequences[parent] }

  template = nil
  aligned_sequences.keys.each do |key|
    row = aligned_sequences[key]
    template = row if row[0] != "-"
  end

  if parent_rows.size != 2
    $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parent_rows.size})"
  end

  target = flanking_size
  counted = 0
  started = false
  found = -1
  parent_rows[0].size.times do |col|
    if target == counted
      found = col
      if parent_rows[0][col] == parent_rows[1][col]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{col})! \n#{parent_rows[0]}\n#{parent_rows[1]}"
      end
    end

    is_base = template[col] != "-"
    started = true if is_base
    counted += 1 if !started || is_base
  end

  @aligned_snp_position = found
end
508
+
509
# Mask over the alignment for +chromosome+, one character per column.
#
# It starts as the lower-cased aligned sequence of the chromosome (or all
# '-' when that chromosome is not in the alignment). Per column:
#   '-'  no other exon sequence differs, or only one sequence covers it
#   '*'  no exon sequence covers it
#   uppercase base  every other exon sequence differs and the number of
#                   covering sequences from this SNP's chromosome group
#                   equals genomes_count
#   lowercase base  otherwise
# The SNP column is '&', or ':' when the IUPAC code of the bases found there
# is valid for both the original and the SNP base.
# TODO: make this chromosome specific even when more than one alignment
# goes to the wanted region.
def mask_aligned_chromosomal_snp(chromosome)
  names = exon_sequences.keys

  snp_column = aligned_snp_position
  target_row = aligned_sequences[chromosome]
  mask = target_row ? target_row.downcase : "-" * aligned_sequences.values[0].size
  expected_snps = names.size - 1

  mask.size.times do |col|
    different = 0
    cov = 0
    from_group = 0

    names.each do |chr|
      row = aligned_sequences[chr]
      next unless row && row[col] != "-"

      cov += 1
      from_group += 1 if chr[0] == chromosome_group
      next if chr == chromosome

      $stderr.puts "WARN: No base for #{mask} : ##{col}" unless mask[col].upcase
      $stderr.puts "WARN: No base for #{aligned_sequences[chr]} : ##{col}" unless mask[col].upcase
      different += 1 if mask[col].upcase != row[col].upcase
    end

    mask[col] = "-" if different == 0
    mask[col] = "-" if cov == 1
    mask[col] = "*" if cov == 0
    mask[col] = mask[col].upcase if different == expected_snps && from_group == genomes_count

    next unless col == snp_column

    mask[col] = "&"
    bases = ""
    names.each do |chr|
      row = aligned_sequences[chr]
      bases << row[col] if row && row[col] != "-"
    end

    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      mask[col] = ":"
    end
  end

  mask
end
562
+
563
# Mask over the covered region for +chromosome+, built from the unaligned
# exon sequences.
#
# It starts as the chromosome's exon sequence with '-' turned into '+' (or
# all '-' when there is none). Columns outside the flanking window become
# '*'. Inside it a column becomes '-' when no other sequence differs (a '+'
# is kept) or fewer than two sequences cover it, and is upper-cased when
# more than one sequence differs. The SNP column is always '&'.
def masked_chromosomal_snps(chromosome)
  chromosomes = exon_sequences
  names = chromosomes.keys
  reference = chromosomes[chromosome]
  mask = reference ? reference.tr("-", "+") : "-" * covered_region.size

  snp_index = local_position
  window_start = snp_index - flanking_size
  window_end = snp_index + flanking_size

  mask.size.times do |col|
    if col > window_start && col <= window_end
      different = 0
      cov = 0
      names.each do |chr|
        base = chromosomes[chr][col]
        next if base == "-"

        cov += 1
        different += 1 if chr != chromosome && mask[col] != "+" && mask[col] != base
      end
      mask[col] = "-" if different == 0 && mask[col] != "+"
      mask[col] = "-" if cov < 2
      mask[col] = mask[col].upcase if different > 1
    else
      mask[col] = "*"
    end
    mask[col] = "&" if col == snp_index
  end

  mask
end
600
+
601
# Same masking as masked_chromosomal_snps but over the windowed exon
# sequences: every column is evaluated, bases 'N'/'n' do not count as
# coverage, and the SNP column is fixed at flanking_size. Without a sequence
# for +chromosome+ the mask starts as 2 * flanking_size gap characters.
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  reference = chromosomes[chromosome]
  mask = reference ? reference.tr("-", "+") : "-" * (flanking_size * 2)
  snp_index = flanking_size

  mask.size.times do |col|
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][col]
      next if ["-", "N", "n"].include?(base)

      cov += 1
      different += 1 if chr != chromosome && mask[col] != "+" && mask[col] != base
    end
    mask[col] = "-" if different == 0 && mask[col] != "+"
    mask[col] = "-" if cov < 2
    mask[col] = mask[col].upcase if different > 1
    mask[col] = "&" if col == snp_index
  end

  mask
end
637
+
638
# Chromosome sequence around the SNP for every loaded exon, cached after the
# first call. Exons reporting the SNP inside a gap are left out.
# TODO: pad when the exon goes over the region.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chr, exon|
    # Requested before the gap check, as in the original ordering
    # (NOTE(review): snp_in_gap may depend on this call - confirm).
    flank = exon.target_flanking_region_from_position(position, flanking_size)
    next if exon.snp_in_gap

    @surrounding_exon_sequences[chr] = container.chromosome_sequence(flank)
  end
  @surrounding_exon_sequences
end
656
+
657
+
658
# Exon sequences laid out on the coordinates of the covered region, cached
# after the first call.
#
# Each exon's chromosome sequence is prefixed with gaps up to the exon start
# inside the region, the base at the SNP offset is upper-cased and the tail
# is filled with gaps. Exons reporting the SNP inside a gap are left out.
# When this SNP's own chromosome ends up without a sequence, an all-gap
# entry is stored for it.
def exon_sequences
  return @exon_sequences if @exon_sequences

  region = covered_region
  snp_index = local_position

  @exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chr, exon|
    padded = "-" * (exon.query_region.start - region.start)
    padded << container.chromosome_sequence(exon.target_region).to_s
    next if exon.snp_in_gap

    padded[snp_index] = padded[snp_index].upcase
    padded << "-" * (region.size - padded.size + 1)
    @exon_sequences[chr] = padded
  end

  @exon_sequences[@chromosome] ||= "-" * region.size
  @exon_sequences
end
680
+ end
681
+ end