bio-polyploid-tools 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +16 -0
  3. data/Gemfile.lock +67 -0
  4. data/README +21 -0
  5. data/Rakefile +61 -0
  6. data/VERSION +1 -0
  7. data/bin/bfr.rb +133 -0
  8. data/bin/count_variations.rb +36 -0
  9. data/bin/filter_blat_by_target_coverage.rb +15 -0
  10. data/bin/find_best_blat_hit.rb +32 -0
  11. data/bin/hexaploid_primers.rb +168 -0
  12. data/bin/homokaryot_primers.rb +155 -0
  13. data/bin/map_markers_to_contigs.rb +66 -0
  14. data/bin/markers_in_region.rb +42 -0
  15. data/bin/polymarker.rb +219 -0
  16. data/bin/snps_between_bams.rb +106 -0
  17. data/bio-polyploid-tools.gemspec +139 -0
  18. data/conf/defaults.rb +1 -0
  19. data/conf/primer3_config/dangle.dh +128 -0
  20. data/conf/primer3_config/dangle.ds +128 -0
  21. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  22. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  23. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  24. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  25. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  26. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  27. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  28. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  29. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  30. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  31. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  32. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  33. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  34. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  35. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  36. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  37. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  38. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  39. data/conf/primer3_config/loops.dh +30 -0
  40. data/conf/primer3_config/loops.ds +30 -0
  41. data/conf/primer3_config/stack.dh +256 -0
  42. data/conf/primer3_config/stack.ds +256 -0
  43. data/conf/primer3_config/stackmm.dh +256 -0
  44. data/conf/primer3_config/stackmm.ds +256 -0
  45. data/conf/primer3_config/tetraloop.dh +77 -0
  46. data/conf/primer3_config/tetraloop.ds +77 -0
  47. data/conf/primer3_config/triloop.dh +16 -0
  48. data/conf/primer3_config/triloop.ds +16 -0
  49. data/conf/primer3_config/tstack.dh +256 -0
  50. data/conf/primer3_config/tstack2.dh +256 -0
  51. data/conf/primer3_config/tstack2.ds +256 -0
  52. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  53. data/lib/bio/BFRTools.rb +698 -0
  54. data/lib/bio/BIOExtensions.rb +186 -0
  55. data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
  56. data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
  57. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  58. data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
  59. data/lib/bio/PolyploidTools/SNP.rb +681 -0
  60. data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
  61. data/lib/bio/SAMToolsExtensions.rb +284 -0
  62. data/lib/bio/db/exonerate.rb +272 -0
  63. data/lib/bio/db/fastadb.rb +164 -0
  64. data/lib/bio/db/primer3.rb +673 -0
  65. data/lib/bioruby-polyploid-tools.rb +25 -0
  66. data/test/data/BS00068396_51.fa +2 -0
  67. data/test/data/BS00068396_51_contigs.aln +1412 -0
  68. data/test/data/BS00068396_51_contigs.dnd +7 -0
  69. data/test/data/BS00068396_51_contigs.fa +8 -0
  70. data/test/data/BS00068396_51_exonerate.tab +6 -0
  71. data/test/data/BS00068396_51_genes.txt +14 -0
  72. data/test/data/LIB1716.bam +0 -0
  73. data/test/data/LIB1716.bam.bai +0 -0
  74. data/test/data/LIB1719.bam +0 -0
  75. data/test/data/LIB1719.bam.bai +0 -0
  76. data/test/data/LIB1721.bam +0 -0
  77. data/test/data/LIB1721.bam.bai +0 -0
  78. data/test/data/LIB1722.bam +0 -0
  79. data/test/data/LIB1722.bam.bai +0 -0
  80. data/test/data/S22380157.fa +16 -0
  81. data/test/data/S22380157.fa.fai +1 -0
  82. data/test/data/Test3Aspecific.csv +1 -0
  83. data/test/data/Test3Aspecific_contigs.fa +6 -0
  84. data/test/data/patological_cases5D.csv +1 -0
  85. data/test/data/short_primer_design_test.csv +10 -0
  86. data/test/data/test_primer3_error.csv +4 -0
  87. data/test/data/test_primer3_error_contigs.fa +10 -0
  88. data/test/test_bfr.rb +51 -0
  89. data/test/test_exon_container.rb +17 -0
  90. data/test/test_exonearate.rb +53 -0
  91. data/test/test_snp_parsing.rb +40 -0
  92. metadata +201 -0
@@ -0,0 +1,22 @@
1
+ module Bio::PolyploidTools
2
# Describes the template around a SNP that is handed to primer design.
#
# Besides the template +sequence+ and the SNP offset (+snp_pos+), it keeps
# four lists of positions that can anchor a second primer. The lists are
# filled by SNP#primer_region:
#
# * +chromosome_specific+               - upper-case mask hit on an aligned base
# * +almost_chromosome_specific+        - lower-case mask hit on an aligned base
# * +crhomosome_specific_intron+        - upper-case mask hit where the parental has a gap
# * +almost_crhomosome_specific_intron+ - lower-case mask hit where the parental has a gap
#
# The two "crhomosome" accessors are misspelled in the public API. They are
# kept for existing callers, and correctly spelled aliases are provided.
class PrimerRegion
  attr_accessor :snp_pos, :sequence, :homeologous
  attr_accessor :chromosome_specific, :almost_chromosome_specific
  attr_accessor :crhomosome_specific_intron, :almost_crhomosome_specific_intron

  # Correctly spelled, backward-compatible names for the intron lists.
  alias_method :chromosome_specific_intron, :crhomosome_specific_intron
  alias_method :chromosome_specific_intron=, :crhomosome_specific_intron=
  alias_method :almost_chromosome_specific_intron, :almost_crhomosome_specific_intron
  alias_method :almost_chromosome_specific_intron=, :almost_crhomosome_specific_intron=

  # Starts with four empty candidate lists; +snp_pos+, +sequence+ and
  # +homeologous+ stay nil until they are assigned.
  def initialize
    @chromosome_specific = []
    @almost_chromosome_specific = []
    @crhomosome_specific_intron = []
    @almost_crhomosome_specific_intron = []
  end

  # Number of exon candidates (specific plus almost specific). The intron
  # lists are not counted.
  def tail_candidates
    @chromosome_specific.size + @almost_chromosome_specific.size
  end

  # FASTA record whose header encodes the SNP offset and the four candidate
  # lists (rendered with Array#to_s), followed by the template sequence.
  def to_fasta
    header = [
      "Primer",
      snp_pos,
      chromosome_specific.to_s,
      almost_chromosome_specific.to_s,
      crhomosome_specific_intron.to_s,
      almost_crhomosome_specific_intron.to_s
    ].join("_")
    ">#{header}\n#{sequence}\n"
  end
end
22
+ end
@@ -0,0 +1,681 @@
1
+ require 'bio'
2
+ module Bio::PolyploidTools
3
# Error type for SNP-level failures. In this file it is raised by
# SNP#covered_region when no exons have been loaded.
SNPException = Class.new(RuntimeError)
5
+ class SNP
6
+
7
+ #GENE,ORIGINAL,POS,SNP
8
# Values read from the input line (see SNP.parse) plus naming information.
attr_accessor :gene, :original, :position, :snp
attr_accessor :snp_in, :original_name
# Best exon hit per key, filled by #add_exon.
attr_accessor :exon_list
# Object that supplies the parents and the sequences used by this SNP.
attr_accessor :container
attr_accessor :flanking_size, :ideal_min, :ideal_max
attr_accessor :template_sequence
attr_accessor :use_reference
attr_accessor :genomes_count

# Read-only here because the writer is defined by hand further down.
attr_reader :chromosome
17
+
18
+ #Format:
19
+ #Gene_name,Original,SNP_Pos,pos,chromosome
20
+ #A_comp0_c0_seq1,C,519,A
21
# Builds a SNP from one comma-separated line.
#
# Format: GENE,ORIGINAL,POS,SNP[,CHROMOSOME]
# Example: A_comp0_c0_seq1,C,519,A,2AL
#
# Fields are stripped of surrounding whitespace and the two bases are
# upper-cased. The chromosome column is optional; when absent, #chromosome
# is left nil.
#
# The input string is not modified. Non-bang string methods are used on
# purpose: strip! and upcase! return nil when they change nothing, which
# previously turned a clean position into 0 and raised NoMethodError on
# already upper-case bases.
#
# Raises SNPException when any of the four mandatory fields is missing.
def self.parse(reg_str)
  line = reg_str.chomp
  gene, original, position, variant, chromosome = line.split(",").map { |field| field.strip }

  if [gene, original, position, variant].any? { |field| field.nil? || field.empty? }
    raise SNPException.new "Malformed SNP line, expected GENE,ORIGINAL,POS,SNP[,CHROMOSOME]: '#{line}'"
  end

  snp = SNP.new
  snp.gene = gene
  snp.original = original.upcase
  snp.position = position.to_i
  snp.snp = variant.upcase
  snp.chromosome = chromosome if chromosome
  snp.exon_list = Hash.new
  snp.use_reference = false
  snp
end
33
+
34
# genomes_count - value stored in @genomes_count, which
#                 #mask_aligned_chromosomal_snp compares against the number
#                 of covering sequences from the SNP's chromosome group.
#                 Defaults to 3, the value that used to be hard-coded, so
#                 SNP.new without arguments behaves as before. It can also
#                 be changed later through the genomes_count accessor.
def initialize(genomes_count = 3)
  @genomes_count = genomes_count
end
37
+
38
+
39
+ #We Only want the chromosome, we drop the arm.
40
# Stores only the first two characters of the given name, dropping the arm
# (for example "2AL" becomes "2A").
#
# Surrounding whitespace is removed before truncating, so " 2AL" no longer
# ends up as " 2". nil is accepted and clears the value instead of raising
# NoMethodError.
def chromosome=(chr)
  @chromosome = chr.nil? ? nil : chr.to_s.strip[0, 2]
end
43
+
44
# First character of #chromosome (the "2" in "2A").
def chromosome_group
  chromosome.slice(0)
end
47
+
48
# Second character of #chromosome (the "A" in "2A").
#
# This method used to be defined twice. The later definition replaced this
# one and read chromosome[3], which is always nil because #chromosome= keeps
# only two characters. A single definition is kept so the method returns the
# genome letter.
def chromosome_genome
  chromosome[1]
end
56
+
57
# FASTA record with the gene name as header and the template sequence as body.
def to_fasta
  header = ">#{gene}"
  "#{header}\n#{template_sequence}\n"
end
60
+
61
# Registers +exon+ under +arm+, keeping only the hit with the highest
# record score: the first exon for an arm is always stored, and later ones
# replace it only when their score is strictly greater.
def add_exon(exon, arm)
  @exon_list[arm] ||= exon
  best_so_far = @exon_list[arm]
  @exon_list[arm] = exon if exon.record.score > best_so_far.record.score
end
65
+
66
# Region of the gene covered by this SNP.
#
# With +use_reference+ set, the region is position +/- flanking_size with
# the start clamped to 1. This result is rebuilt on every call and is not
# cached.
#
# Otherwise the region runs from the smallest query start to the largest
# query end over all loaded exons, and always includes the SNP position.
# This result is cached in @covered_region.
#
# Raises SNPException when no exons have been loaded.
def covered_region
  return @covered_region if @covered_region

  if use_reference
    window = Bio::DB::Fasta::Region.new()
    window.entry = gene
    window.orientation = :forward
    window.start = [position - flanking_size, 1].max
    window.end = position + flanking_size
    return window
  end

  raise SNPException.new "Exons haven't been loaded for #{self.to_s}" if @exon_list.size == 0

  lowest = @position
  highest = @position
  @exon_list.each do |_key, exon|
    span = exon.query_region
    lowest = span.start if span.start < lowest
    highest = span.end if span.end > highest
  end

  covered = Bio::DB::Fasta::Region.new()
  covered.entry = gene
  covered.orientation = :forward
  covered.start = lowest
  covered.end = highest
  @covered_region = covered
end
101
+
102
# Left-hand padding, computed as flanking_size minus #local_position, plus one.
def left_padding
  (flanking_size + 1) - local_position
end
107
+
108
# Right-hand padding: whatever is left of 2 * flanking_size after the left
# padding and the covered region size are subtracted. Never negative.
def right_padding
  remaining = (2 * flanking_size) - (left_padding + covered_region.size)
  [remaining, 0].max
end
113
+
114
# Offset of the SNP from the start of #covered_region.
def local_position
  position - covered_region.start
end
118
+
119
# Shifts +pos+ to the right by #left_padding.
def padded_position(pos)
  pos + left_padding
end
122
+
123
# Multi-FASTA text with, in order:
#
# 1. one record per parent (from container.parents) holding that parent's
#    sequence over the covered region,
# 2. one record per entry of #exon_sequences,
# 3. a ">Mask" record from #masked_chromosomal_snps for this SNP's chromosome.
#
# The mask used to be requested as masked_chromosomal_snps("1BS",
# flanking_size). That passed two arguments to a one-argument method (an
# ArgumentError) and hard-coded a chromosome arm. It now uses #chromosome,
# matching #primer_fasta_string.
def exon_fasta_string
  gene_region = covered_region
  snp_offset = local_position
  fasta = ""

  container.parents.each do |name, _bam|
    # The header reports the SNP offset 1-based.
    fasta << ">#{gene_region.entry}-#{position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{snp_offset + 1}\n"
    fasta << parental_sequences[name] << "\n"
  end

  exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{masked_chromosomal_snps(chromosome)}\n"
  fasta
end
139
+
140
+
141
# Multi-FASTA text for the primer window: the surrounding parental
# sequences, the surrounding exon sequences, a ">Mask" record for this
# SNP's chromosome and, last, the PrimerRegion record for the +snp_in+ parent.
def primer_fasta_string
  region = covered_region
  _snp_offset = local_position # evaluated as before; the value itself is not used
  fasta = ""

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{region.entry}-#{position}_#{name}\n" << "#{seq}\n"
  end

  surrounding_exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
166
+
167
# Builds the PrimerRegion for +target_chromosome+ from the aligned sequence
# of the +parental+ entry.
#
# The alignment is walked column by column, skipping columns where both the
# parental and the target sequence are gaps. For every remaining column the
# mask from #mask_aligned_chromosomal_snp decides what happens:
#
#   '&'        the SNP itself, not homeologous
#   ':'        the SNP itself, homeologous
#   '-'        no variation; an ambiguous parental base is replaced by the target base
#   upper case strong candidate: recorded as intron when the parental has a
#              gap there, otherwise as chromosome specific if the base is valid
#   lower case weaker candidate: same rule, recorded in the "almost" lists
#
# Recorded positions are offsets in the returned sequence, which is the
# edited parental sequence with its gaps removed.
def primer_region(target_chromosome, parental)
  parental_seq = aligned_sequences[parental].downcase
  target_seq = aligned_sequences[target_chromosome]
  target_seq = "-" * parental_seq.size unless target_seq
  target_seq = target_seq.downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  ungapped_pos = 0
  parental_seq.size.times do |i|
    # Columns that are gaps in both sequences do not consume a position.
    next if target_seq[i] == '-' && parental_seq[i] == '-'

    if mask[i] == '&'
      region.snp_pos = ungapped_pos
      region.homeologous = false
    elsif mask[i] == ':'
      region.snp_pos = ungapped_pos
      region.homeologous = true
    elsif mask[i] == '-'
      parental_seq[i] = target_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental_seq[i])
    elsif /[[:upper:]]/.match(mask[i])
      if parental_seq[i] == '-'
        parental_seq[i] = mask[i]
        region.crhomosome_specific_intron << ungapped_pos
      elsif Bio::NucleicAcid.is_valid(parental_seq[i], mask[i])
        parental_seq[i] = mask[i]
        region.chromosome_specific << ungapped_pos
      end
    elsif /[[:lower:]]/.match(mask[i])
      if parental_seq[i] == '-'
        parental_seq[i] = mask[i]
        region.almost_crhomosome_specific_intron << ungapped_pos
      elsif Bio::NucleicAcid.is_valid(parental_seq[i], mask[i])
        parental_seq[i] = mask[i].upcase
        region.almost_chromosome_specific << ungapped_pos
      end
    end

    ungapped_pos += 1
  end

  region.sequence = parental_seq.gsub('-', '')
  region
end
221
+
222
# Reverse complement of a nucleotide string. IUPAC ambiguity codes are
# complemented as well and letter case is preserved; characters outside
# the table are left unchanged. A new string is returned.
def reverse_complement_string(sequenc_str)
  sequenc_str.reverse.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN')
end
226
+
227
# Formats Primer3 input records for one template.
#
# opts keys: :name, :sequence, :left_pos and, optionally, :right_pos.
#
# * With :right_pos, one record is returned. If the left position lies to
#   the right of the right position, both are mirrored onto the reverse
#   complement of the template and the record is labelled "reverse".
# * Without :right_pos, two records are returned: the forward one, and one
#   on the reverse complement with the left position mirrored.
def return_primer_3_string(opts={})
  name = opts[:name]
  template = opts[:sequence]
  left = opts[:left_pos]
  right = opts[:right_pos]

  if right
    orientation = "forward"
    if left > right
      left = template.size - left - 1
      right = template.size - right - 1
      template = reverse_complement_string(template)
      orientation = "reverse"
    end
    paired = [
      "SEQUENCE_ID=#{name} #{orientation}",
      "SEQUENCE_FORCE_LEFT_END=#{left}",
      "SEQUENCE_FORCE_RIGHT_END=#{right}",
      "SEQUENCE_TEMPLATE=#{template}",
      "="
    ]
    return paired.map { |line| "#{line}\n" }.join
  end

  mirrored_left = template.size - left - 1
  both_strands = [
    "SEQUENCE_ID=#{name} forward",
    "SEQUENCE_FORCE_LEFT_END=#{left}",
    "SEQUENCE_TEMPLATE=#{template}",
    "=",
    "SEQUENCE_ID=#{name} reverse",
    "SEQUENCE_FORCE_LEFT_END=#{mirrored_left}",
    "SEQUENCE_TEMPLATE=#{reverse_complement_string(template)}",
    "="
  ]
  both_strands.map { |line| "#{line}\n" }.join
end
266
+
267
+
268
# Returns an Array of Primer3 input records for this SNP.
#
# For every candidate position found by #primer_region, two records are
# produced: one with the original base at the SNP and named after
# +original_name+, and one with the alternative base and named after
# +snp_in+. Candidates are emitted in this order: chromosome specific
# (exon), semispecific (exon), chromosome specific (intron), semispecific
# (intron). A final pair without a forced right end ("chromosome_nonspecific
# all") is always appended.
#
# Changes from the earlier implementation:
# * The SNP-allele record of the two intron groups was labelled "exon"
#   because of a copy-paste slip; both alleles now carry "intron".
# * Three values that were computed and never used have been removed.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental)

  # Two copies of the template that differ only at the SNP.
  seq_original = String.new(pr.sequence)
  seq_original[pr.snp_pos] = original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = snp

  snp_type = pr.homeologous ? "homeologous" : "non-homeologous"
  marker = "#{gene}:#{original}#{position}#{snp}"
  alleles = [[original_name, seq_original], [snp_in, seq_snp]]

  primer_3_propertes = Array.new

  # Appends the record for each allele; right_pos is nil for the
  # non-specific pair, which lets Primer3 pick the second primer freely.
  add_allele_pair = lambda do |label, right_pos|
    alleles.each do |allele_name, template|
      args = {
        :name => "#{marker} #{allele_name} #{label} #{snp_type} #{chromosome}",
        :left_pos => pr.snp_pos,
        :sequence => template
      }
      args[:right_pos] = right_pos if right_pos
      primer_3_propertes << return_primer_3_string(args)
    end
  end

  candidate_groups = [
    [pr.chromosome_specific, "chromosome_specific exon"],
    [pr.almost_chromosome_specific, "chromosome_semispecific exon"],
    [pr.crhomosome_specific_intron, "chromosome_specific intron"],
    [pr.almost_crhomosome_specific_intron, "chromosome_semispecific intron"]
  ]
  candidate_groups.each do |positions, label|
    positions.each { |pos| add_allele_pair.call(label, pos) }
  end

  add_allele_pair.call("chromosome_nonspecific all", nil)
  primer_3_propertes
end
334
+
335
# Compact label: gene, a colon, then original base, position, SNP base and
# chromosome, with no separators between the last four.
def to_s
  format("%s:%s%s%s%s", gene, original, position, snp, chromosome)
end
338
+
339
# Upper-cased "<original><position><snp>" label, e.g. "C519A".
def short_s
  [original, position, snp].join.upcase
end
342
+
343
# All records from #primer_3_all_strings concatenated into one String.
def primer_3_string(target_chromosome, parental)
  primer_3_all_strings(target_chromosome, parental).join
end
347
+
348
# Looks up the exon stored for +chromosome+ in exon_list. When there is
# none, a notice is printed to standard output and nil is returned.
def exon_for_chromosome(chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
353
+
354
# Sequence of every parent over the covered region, keyed by parent name.
# The result is cached.
#
# A parent that comes with a BAM uses the BAM consensus (with ambiguity
# codes). A parent without one uses the gene model sequence, with the
# original base written at the SNP unless that parent is +snp_in+. In both
# cases the base at the SNP is upper-cased, and the +snp_in+ parent finally
# receives the SNP base.
def parental_sequences
  return @parental_sequences if @parental_sequences

  gene_region = covered_region
  snp_offset = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    if bam
      seq = bam.consensus_with_ambiguities({:region=>gene_region}).to_s
    else
      seq = container.gene_model_sequence(gene_region)
      seq[snp_offset] = original unless carries_snp
    end
    seq[snp_offset] = seq[snp_offset].upcase
    seq[snp_offset] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
379
+
380
# Like #parental_sequences, but every sequence is trimmed and gap-padded to
# the primer window with #cut_and_pad_sequence_to_primer_region. The result
# is cached separately from #parental_sequences.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  gene_region = covered_region
  snp_offset = local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    if bam
      seq = bam.consensus_with_ambiguities({:region=>gene_region}).to_s
    else
      seq = container.gene_model_sequence(gene_region)
      seq[snp_offset] = original unless carries_snp
    end
    seq[snp_offset] = seq[snp_offset].upcase
    seq[snp_offset] = snp if carries_snp
    @surrounding_parental_sequences[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
409
+
410
# Slice of +sequence+ spanning flanking_size positions on each side of the
# SNP (by #local_position). The window is clipped at the sequence ends and
# no padding is added.
def cut_sequence_to_primer_region(sequence)
  first = [local_position - flanking_size, 0].max
  last = local_position + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
418
+
419
# Slice of +sequence+ spanning flanking_size positions on each side of the
# SNP (by #local_position), padded with "-" wherever the window runs past
# either end of the sequence. The result is always 2 * flanking_size + 1
# characters long with the SNP at index flanking_size.
#
# The right-hand padding used to be measured against sequence.size instead
# of the last valid index, so a window that ran off the right end came back
# one character short (and was not padded at all when it overshot by
# exactly one position).
def cut_and_pad_sequence_to_primer_region(sequence)
  window_start = local_position - flanking_size
  window_end = local_position + flanking_size
  last_index = sequence.size - 1
  left_pad = 0
  right_pad = 0

  if window_start < 0
    left_pad = -window_start
    window_start = 0
  end
  if window_end > last_index
    right_pad = window_end - last_index
    window_end = last_index
  end

  # to_s: the slice is nil when the window starts beyond the sequence.
  ("-" * left_pad) << sequence[window_start..window_end].to_s << ("-" * right_pad)
end
436
+
437
# Surrounding parental sequences merged with the surrounding exon
# sequences; this is the input for the alignment. The result is cached.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
443
+
444
# Alignment of #sequences_to_align produced by the "mafft" program, run
# through Bio::MAFFT with --maxiterate 1000 --localpair --quiet. The result
# is cached.
def aligned_sequences
  @aligned_sequences ||= begin
    options = ['--maxiterate', '1000', '--localpair', '--quiet']
    mafft = Bio::MAFFT.new("mafft", options)
    mafft.query_align(sequences_to_align).alignment
  end
end
455
+
456
# Multi-FASTA text of the alignment: one record per aligned sequence, a
# ">MASK" record for this SNP's chromosome and the PrimerRegion record for
# the +snp_in+ parent.
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self.to_s}-#{name}\n#{seq}\n"
  end
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
468
+
469
# Column of the SNP inside the alignment. The result is cached.
#
# The template is the last aligned sequence that does not start with a gap.
# Columns are counted along it from the left: every column counts until the
# template's first base has been seen, and after that only non-gap columns
# count. The SNP column is the last column reached while the count equals
# flanking_size; -1 is returned when there is none.
#
# Warnings go to standard error when there are not exactly two parental
# sequences, or when the two parents carry the same base at that column.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |par| aligned_sequences[par] }

  template = nil
  aligned_sequences.keys.each do |key|
    template = aligned_sequences[key] if aligned_sequences[key][0] != "-"
  end

  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  wanted = flanking_size
  counted = 0
  started = false
  found = -1
  #TODO: Validate the cases when the alignment has padding on the left on all the chromosomes
  parental_strings[0].size.times do |i|
    if wanted == counted
      found = i
      if parental_strings[0][i] == parental_strings[1][i]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{i})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end

    on_base = template[i] != "-"
    started = true if on_base
    counted += 1 if !started || on_base
  end

  @aligned_snp_position = found
end
508
+
509
# Builds the per-column mask used by #primer_region for +chromosome+.
#
# The mask starts as the lower-cased aligned sequence of +chromosome+, or
# as all "-" when that chromosome is not in the alignment. Each column is
# then compared against the aligned sequences named by exon_sequences:
#
#   "-"        the target base matches every other covering sequence, or
#              only one sequence covers the column
#   "*"        no sequence covers the column
#   upper case every other listed sequence differs and the number of
#              covering sequences from this SNP's chromosome group equals
#              genomes_count
#   lower case some, but not all, of the other sequences differ
#   "&"        the SNP column
#   ":"        the SNP column when both the original and the SNP base are
#              valid for the IUPAC code of the bases seen there
#
# Changes from the earlier implementation:
# * The two WARN statements were guarded by masked_snps[i].upcase, which is
#   always truthy, so they could never fire and a missing base crashed on
#   nil.upcase. A missing base is now reported once and skipped.
# * An unused local was removed and loop-invariant lookups were hoisted.
def mask_aligned_chromosomal_snp(chromosome)
  names = exon_sequences.keys
  snp_column = aligned_snp_position
  alignment = aligned_sequences

  target = alignment[chromosome]
  masked_snps = target ? target.downcase : "-" * alignment.values[0].size
  #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    from_group = 0
    column_bases = ""

    names.each do |chr|
      seq = alignment[chr]
      next unless seq
      base = seq[i]
      if base.nil?
        $stderr.puts "WARN: No base for #{seq} : ##{i}"
        next
      end
      next if base == "-"

      cov += 1
      column_bases << base
      from_group += 1 if chr[0] == chromosome_group
      different += 1 if chr != chromosome && masked_snps[i].upcase != base.upcase
    end

    masked_snps[i] = "-" if different == 0
    masked_snps[i] = "-" if cov == 1
    masked_snps[i] = "*" if cov == 0
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && from_group == genomes_count

    next unless i == snp_column

    masked_snps[i] = "&"
    code_reference = column_bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(column_bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ":"
    end
  end

  masked_snps
end
562
+
563
# Mask of +chromosome+ against the other entries of #exon_sequences.
#
# The mask starts as the sequence of +chromosome+ with gaps shown as "+",
# or as all "-" when that chromosome has no sequence. Columns outside the
# window (local_position - flanking_size, local_position + flanking_size]
# become "*". Inside the window:
#
#   "-"        no other sequence differs (unless the target is a gap), or
#              fewer than two sequences cover the column
#   upper case more than one other sequence differs
#   unchanged  exactly one other sequence differs
#
# The SNP column is finally marked "&".
def masked_chromosomal_snps(chromosome)
  sequences = exon_sequences
  chromosome_names = sequences.keys
  target = sequences[chromosome]
  mask = target ? target.tr("-", "+") : "-" * covered_region.size

  snp_index = local_position
  window_start = snp_index - flanking_size
  window_end = snp_index + flanking_size

  mask.size.times do |i|
    if i > window_start && i <= window_end
      mismatches = 0
      coverage = 0
      chromosome_names.each do |chr|
        base = sequences[chr][i]
        next if base == "-"
        coverage += 1
        next if chr == chromosome || mask[i] == "+"
        mismatches += 1 if mask[i] != base
      end
      mask[i] = "-" if mismatches == 0 && mask[i] != "+"
      mask[i] = "-" if coverage < 2
      mask[i] = mask[i].upcase if mismatches > 1
    else
      mask[i] = "*"
    end
    mask[i] = "&" if i == snp_index
  end
  mask
end
600
+
601
# Same marking as #masked_chromosomal_snps, applied to the window-sized
# sequences from #surrounding_exon_sequences. Every column is evaluated,
# the SNP is taken to sit at index flanking_size, and bases "N"/"n" are
# treated like gaps when counting coverage and differences.
def surrounding_masked_chromosomal_snps(chromosome)
  sequences = surrounding_exon_sequences
  chromosome_names = sequences.keys
  target = sequences[chromosome]
  mask = target ? target.tr("-", "+") : "-" * (flanking_size * 2)
  snp_index = flanking_size

  mask.size.times do |i|
    mismatches = 0
    coverage = 0
    chromosome_names.each do |chr|
      base = sequences[chr][i]
      next if ["-", "N", "n"].include?(base)
      coverage += 1
      next if chr == chromosome || mask[i] == "+"
      mismatches += 1 if mask[i] != base
    end
    mask[i] = "-" if mismatches == 0 && mask[i] != "+"
    mask[i] = "-" if coverage < 2
    mask[i] = mask[i].upcase if mismatches > 1
    mask[i] = "&" if i == snp_index
  end
  mask
end
637
+
638
# Chromosome sequence around the SNP for every entry of exon_list, keyed
# like exon_list. Exons that report snp_in_gap are left out. The result is
# cached.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chr, exon|
    # The flanking region is requested before snp_in_gap is consulted, as before.
    flank = exon.target_flanking_region_from_position(position, flanking_size)
    #TODO: Pad when the exon goes over the regions...
    next if exon.snp_in_gap

    @surrounding_exon_sequences[chr] = container.chromosome_sequence(flank)
  end
  @surrounding_exon_sequences
end
656
+
657
+
658
# Chromosome sequence of every exon in exon_list, laid out on the
# coordinates of #covered_region. The result is cached.
#
# Each sequence gets leading "-" up to the exon's query start, the base at
# the SNP offset is upper-cased, and trailing "-" is appended. Exons that
# report snp_in_gap are left out. If the SNP's own chromosome ends up
# without a sequence, it gets an all-"-" placeholder.
def exon_sequences
  return @exon_sequences if @exon_sequences

  gene_region = covered_region
  snp_offset = local_position
  @exon_sequences = Bio::Alignment::SequenceHash.new

  exon_list.each do |chr, exon|
    leading_gap = exon.query_region.start - gene_region.start
    padded = "-" * leading_gap
    padded << container.chromosome_sequence(exon.target_region).to_s
    next if exon.snp_in_gap

    padded[snp_offset] = padded[snp_offset].upcase
    padded << "-" * (gene_region.size - padded.size + 1)
    @exon_sequences[chr] = padded
  end

  @exon_sequences[@chromosome] = "-" * gene_region.size unless @exon_sequences[@chromosome]
  @exon_sequences
end
680
+ end
681
+ end