bio-polyploid-tools 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +67 -0
- data/README +21 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +133 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +15 -0
- data/bin/find_best_blat_hit.rb +32 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +155 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/polymarker.rb +219 -0
- data/bin/snps_between_bams.rb +106 -0
- data/bio-polyploid-tools.gemspec +139 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +698 -0
- data/lib/bio/BIOExtensions.rb +186 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
- data/lib/bio/PolyploidTools/SNP.rb +681 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
- data/lib/bio/SAMToolsExtensions.rb +284 -0
- data/lib/bio/db/exonerate.rb +272 -0
- data/lib/bio/db/fastadb.rb +164 -0
- data/lib/bio/db/primer3.rb +673 -0
- data/lib/bioruby-polyploid-tools.rb +25 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/Test3Aspecific.csv +1 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +51 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +53 -0
- data/test/test_snp_parsing.rb +40 -0
- metadata +201 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
module Bio::PolyploidTools
|
2
|
+
# Value object describing the primer template built around one SNP.
#
# It stores the SNP offset inside +sequence+ (+snp_pos+), the template
# +sequence+ itself, a +homeologous+ flag, and four lists of positions that
# can be used to anchor a primer, grouped by how specific they are.
class PrimerRegion
  attr_accessor :snp_pos, :sequence, :homeologous
  attr_accessor :chromosome_specific, :almost_chromosome_specific
  # The "crhomosome" spelling is the published interface and is kept as is.
  attr_accessor :crhomosome_specific_intron, :almost_crhomosome_specific_intron

  # Correctly spelled aliases for the two intron lists (same underlying data).
  alias_method :chromosome_specific_intron, :crhomosome_specific_intron
  alias_method :chromosome_specific_intron=, :crhomosome_specific_intron=
  alias_method :almost_chromosome_specific_intron, :almost_crhomosome_specific_intron
  alias_method :almost_chromosome_specific_intron=, :almost_crhomosome_specific_intron=

  # Starts with four empty position lists; +snp_pos+, +sequence+ and
  # +homeologous+ stay nil until the caller assigns them.
  def initialize
    @chromosome_specific = []
    @almost_chromosome_specific = []
    @crhomosome_specific_intron = []
    @almost_crhomosome_specific_intron = []
  end

  # Number of positions in the two non-intron lists.
  def tail_candidates
    @chromosome_specific.size + @almost_chromosome_specific.size
  end

  # FASTA record whose header lists the SNP position and the four position
  # lists (rendered with Array#to_s), followed by the sequence.
  def to_fasta
    header = [
      snp_pos,
      chromosome_specific.to_s,
      almost_chromosome_specific.to_s,
      crhomosome_specific_intron.to_s,
      almost_crhomosome_specific_intron.to_s
    ].join("_")
    ">Primer_#{header}\n#{sequence}\n"
  end
end
|
22
|
+
end
|
@@ -0,0 +1,681 @@
|
|
1
|
+
require 'bio'
|
2
|
+
module Bio::PolyploidTools
|
3
|
+
# Error raised by SNP when it cannot produce a result (for example when the
# exons have not been loaded before asking for the covered region).
class SNPException < RuntimeError; end
|
5
|
+
class SNP
|
6
|
+
|
7
|
+
#GENE,ORIGINAL,POS,SNP
|
8
|
+
# Fields filled from the parsed record, plus the parent carrying the SNP
# (+snp_in+) and the name used for the other allele (+original_name+).
attr_accessor :gene, :original, :position, :snp
attr_accessor :snp_in, :original_name

# Collaborators and per-SNP settings assigned by the caller.
attr_accessor :exon_list, :container
attr_accessor :flanking_size, :ideal_min, :ideal_max
attr_accessor :template_sequence, :use_reference, :genomes_count

# Read-only here: the writer is defined by hand further down.
attr_reader :chromosome
|
17
|
+
|
18
|
+
#Format:
|
19
|
+
#Gene_name,Original,Position,SNP,Chromosome
|
20
|
+
#A_comp0_c0_seq1,C,519,A
|
21
|
+
# Builds a SNP from one comma-separated record.
#
# Fields, in order: gene, original base, position, SNP base, chromosome.
# The bases are upper-cased, the position is converted with to_i, and all
# fields except the gene name are stripped of surrounding whitespace.
#
# The input string is not modified. (The previous implementation used
# chained bang methods, which return nil when nothing changes, so a clean
# record either crashed or ended up with position 0.)
#
# Raises SNPException when fewer than five fields are present.
def self.parse(reg_str)
  record = reg_str.chomp
  gene, original, position, variant, chromosome = record.split(",")
  if [gene, original, position, variant, chromosome].any?(&:nil?)
    raise SNPException.new "Expected gene,original,position,snp,chromosome but got '#{record}'"
  end

  snp = SNP.new
  snp.gene = gene
  snp.original = original.strip.upcase
  snp.position = position.strip.to_i
  snp.snp = variant.strip.upcase
  snp.chromosome = chromosome.strip
  snp.exon_list = Hash.new
  snp.use_reference = false
  snp
end
|
33
|
+
|
34
|
+
# Creates an empty SNP.
#
# genomes_count:: number of genomes in the polyploid. Defaults to 3, the
#                 value that used to be hard-coded; it can also be changed
#                 afterwards through the +genomes_count+ accessor.
def initialize(genomes_count: 3)
  @genomes_count = genomes_count
end
|
37
|
+
|
38
|
+
|
39
|
+
#We Only want the chromosome, we drop the arm.
|
40
|
+
# Stores only the first two characters of +chr+; anything after them
# (the arm suffix) is discarded.
def chromosome=(chr)
  @chromosome = chr.slice(0, 2)
end
|
43
|
+
|
44
|
+
# First character of the stored chromosome name.
def chromosome_group
  chromosome.slice(0)
end
|
47
|
+
|
48
|
+
# Second character of the stored chromosome name (nil when the name has
# fewer than two characters).
#
# This method used to be defined twice. The later definition won and read
# index 3, which never exists because the writer keeps only two characters,
# so the method always returned nil. Only the first definition is kept.
def chromosome_genome
  chromosome[1]
end
|
56
|
+
|
57
|
+
# FASTA record made of the gene name and the template sequence.
def to_fasta
  header = ">#{gene}"
  "#{header}\n#{template_sequence}\n"
end
|
60
|
+
|
61
|
+
# Registers +exon+ under +arm+, keeping for each arm the exon whose
# record has the highest score (ties keep the one already stored).
def add_exon(exon, arm)
  @exon_list[arm] ||= exon
  current_best = @exon_list[arm]
  @exon_list[arm] = exon if exon.record.score > current_best.record.score
end
|
65
|
+
|
66
|
+
# Region of the gene considered for this SNP, always in forward orientation.
#
# With +use_reference+ set, the region is position +/- flanking_size, with
# the start clamped to 1; this result is rebuilt on every call. Otherwise
# the region spans from the smallest start to the largest end of the query
# regions of all loaded exons (never narrower than the SNP position itself)
# and is cached in @covered_region.
#
# Raises SNPException when no exons are loaded and +use_reference+ is off.
def covered_region
  return @covered_region if @covered_region

  if use_reference
    window = Bio::DB::Fasta::Region.new
    window.entry = gene
    window.orientation = :forward
    window_start = position - flanking_size
    window.start = window_start < 1 ? 1 : window_start
    window.end = position + flanking_size
    return window
  end

  raise SNPException.new "Exons haven't been loaded for #{self.to_s}" if @exon_list.size == 0

  lowest = @position
  highest = @position
  @exon_list.each do |_chromosome, exon|
    span = exon.query_region
    lowest = span.start if span.start < lowest
    highest = span.end if span.end > highest
  end

  merged = Bio::DB::Fasta::Region.new
  merged.entry = gene
  merged.orientation = :forward
  merged.start = lowest
  merged.end = highest
  @covered_region = merged
end
|
101
|
+
|
102
|
+
# Computed as flanking_size - local_position + 1.
def left_padding
  (flanking_size + 1) - local_position
end
|
107
|
+
|
108
|
+
# What is left of twice the flanking size after subtracting the left
# padding and the size of the covered region; never negative.
def right_padding
  used = left_padding + covered_region.size
  remaining = (2 * flanking_size) - used
  remaining < 0 ? 0 : remaining
end
|
113
|
+
|
114
|
+
# Offset of the SNP position from the start of the covered region.
def local_position
  region_start = covered_region.start
  position - region_start
end
|
118
|
+
|
119
|
+
# Shifts +pos+ by the left padding.
def padded_position(pos)
  shift = left_padding
  pos + shift
end
|
122
|
+
|
123
|
+
# Multi-FASTA text with one record per parent (header includes the covered
# region and the 1-based local SNP position), one record per exon
# sequence, and a final "Mask" record.
#
# The mask call used to pass two arguments to masked_chromosomal_snps,
# which accepts exactly one, so this method always raised ArgumentError.
# NOTE(review): the first argument was the literal "1BS"; the SNP's own
# chromosome is used instead, matching primer_fasta_string and
# aligned_sequences_fasta. Confirm that is the intended target.
def exon_fasta_string
  gene_region = covered_region
  snp_offset = local_position
  fasta = ""

  container.parents.each do |name, _bam|
    fasta << ">#{gene_region.entry}-#{position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{snp_offset + 1}\n"
    fasta << parental_sequences[name] << "\n"
  end

  exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{masked_chromosomal_snps(chromosome)}\n"
  fasta
end
|
139
|
+
|
140
|
+
|
141
|
+
# Multi-FASTA text with the parental sequences around the SNP, the
# surrounding exon sequences, the mask for this SNP's chromosome and the
# primer region computed for the parent carrying the SNP.
def primer_fasta_string
  entry = covered_region.entry
  fasta = ""

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{entry}-#{position}_#{name}\n" << "#{seq}\n"
  end

  surrounding_exon_sequences.each do |chr, exon_seq|
    fasta << ">#{chr}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
|
166
|
+
|
167
|
+
# Builds the PrimerRegion for +target_chromosome+ using the aligned
# sequence of the +parental+ sample as template.
#
# Alignment columns where both the template and the target sequence are
# gaps are skipped; every other column advances the position counter used
# for the values recorded in the PrimerRegion. The mask character of each
# column decides what happens:
#   '&'  the SNP itself (homeologous = false)
#   ':'  the SNP itself (homeologous = true)
#   '-'  no difference flagged; an ambiguous template base is replaced by
#        the target base
#   upper-case letter  recorded in the intron list when the template has a
#        gap there, otherwise in chromosome_specific when
#        Bio::NucleicAcid.is_valid accepts it; the template takes the mask base
#   lower-case letter  same, using the "almost" lists
# The resulting sequence is the edited template with gaps removed.
def primer_region(target_chromosome, parental)
  template = aligned_sequences[parental].downcase
  target_seq = (aligned_sequences[target_chromosome] || "-" * template.size).downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  ungapped_index = 0

  template.size.times do |i|
    next if target_seq[i] == '-' && template[i] == '-'

    flag = mask[i]
    if flag == '&'
      region.snp_pos = ungapped_index
      region.homeologous = false
    elsif flag == ':'
      region.snp_pos = ungapped_index
      region.homeologous = true
    elsif flag == '-'
      template[i] = target_seq[i] unless Bio::NucleicAcid::is_unambiguous(template[i])
    elsif /[[:upper:]]/.match(flag)
      if template[i] == '-'
        template[i] = flag
        region.crhomosome_specific_intron << ungapped_index
      elsif Bio::NucleicAcid.is_valid(template[i], flag)
        template[i] = flag
        region.chromosome_specific << ungapped_index
      end
    elsif /[[:lower:]]/.match(flag)
      if template[i] == '-'
        template[i] = flag
        region.almost_crhomosome_specific_intron << ungapped_index
      elsif Bio::NucleicAcid.is_valid(template[i], flag)
        template[i] = flag.upcase
        region.almost_chromosome_specific << ungapped_index
      end
    end

    ungapped_index += 1
  end

  region.sequence = template.gsub('-', '')
  region
end
|
221
|
+
|
222
|
+
# Reverse complement of a nucleotide string, including IUPAC ambiguity
# codes, preserving case. Characters outside the table are left unchanged.
# Returns a new string.
def reverse_complement_string(sequenc_str)
  sequenc_str.reverse.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN')
end
|
226
|
+
|
227
|
+
# Formats Primer3 input records from +opts+.
#
# Keys read: :name, :sequence, :left_pos and, optionally, :right_pos.
# With :right_pos, a single record is produced; when left_pos is greater
# than right_pos both coordinates are mirrored and the sequence is
# reverse-complemented, and the record is tagged "reverse".
# Without :right_pos, two records are produced: the forward one and a
# reverse one with the mirrored left position.
def return_primer_3_string(opts={})
  paired = opts[:right_pos] ? true : false

  # Renders one Primer3 record; the right end is only emitted for pairs.
  record = lambda do |strand, left_end, right_end, template|
    text = "SEQUENCE_ID=#{opts[:name]} #{strand}\n"
    text << "SEQUENCE_FORCE_LEFT_END=#{left_end}\n"
    text << "SEQUENCE_FORCE_RIGHT_END=#{right_end}\n" if paired
    text << "SEQUENCE_TEMPLATE=#{template}\n"
    text << "=\n"
  end

  left = opts[:left_pos]
  right = opts[:right_pos]
  template = opts[:sequence]
  strand = "forward"

  if paired && left > right
    left = template.size - left - 1
    right = template.size - right - 1
    template = reverse_complement_string(template)
    strand = "reverse"
  end

  str = record.call(strand, left, right, template)

  unless paired
    forward = opts[:sequence]
    mirrored_left = forward.size - left - 1
    str << record.call("reverse", mirrored_left, nil, reverse_complement_string(forward))
  end

  str
end
|
266
|
+
|
267
|
+
|
268
|
+
# Returns an Array with every Primer3 input record for this SNP.
#
# For each candidate position in the four lists of the PrimerRegion, two
# records are emitted, both forcing the left end at the SNP and the right
# end at the candidate: one on the template carrying the original base
# (named with +original_name+) and one on the template carrying the SNP
# base (named with +snp_in+). A final pair without a forced right end is
# labelled "chromosome_nonspecific all".
#
# The SNP-allele records of the two intron lists used to be labelled
# "exon" (copy-paste from the exon loops); they are now labelled "intron",
# like their original-allele counterparts.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental)

  seq_original = String.new(pr.sequence)
  seq_original[pr.snp_pos] = original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = snp

  snp_type = pr.homeologous ? "homeologous" : "non-homeologous"
  prefix = "#{gene}:#{original}#{position}#{snp}"
  alleles = [[original_name, seq_original], [snp_in, seq_snp]]

  records = []
  # Emits the original/SNP pair of records for one label; +right_pos+ is
  # nil for the pair that lets Primer3 pick the second primer freely.
  add_pair = lambda do |label, right_pos|
    alleles.each do |allele_name, template|
      args = {
        :name => "#{prefix} #{allele_name} #{label} #{snp_type} #{chromosome}",
        :left_pos => pr.snp_pos,
        :sequence => template
      }
      args[:right_pos] = right_pos if right_pos
      records << return_primer_3_string(args)
    end
  end

  [
    [pr.chromosome_specific, "chromosome_specific exon"],
    [pr.almost_chromosome_specific, "chromosome_semispecific exon"],
    [pr.crhomosome_specific_intron, "chromosome_specific intron"],
    [pr.almost_crhomosome_specific_intron, "chromosome_semispecific intron"]
  ].each do |candidates, label|
    candidates.each { |pos| add_pair.call(label, pos) }
  end

  add_pair.call("chromosome_nonspecific all", nil)
  records
end
|
334
|
+
|
335
|
+
# Compact label: gene, colon, then original base, position, SNP base and
# chromosome concatenated without separators.
def to_s
  change = "#{original}#{position}#{snp}"
  "#{gene}:#{change}#{chromosome}"
end
|
338
|
+
|
339
|
+
# Upper-cased "original base + position + SNP base" label.
def short_s
  [original, position, snp].join.upcase
end
|
342
|
+
|
343
|
+
# All Primer3 records from primer_3_all_strings concatenated into one String.
def primer_3_string(target_chromosome, parental)
  primer_3_all_strings(target_chromosome, parental).join
end
|
347
|
+
|
348
|
+
# Looks up the exon stored under +chromosome+. Prints a notice to standard
# output and returns nil (or false) when there is none.
def exon_for_chromosome(chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
|
353
|
+
|
354
|
+
# Sequence of the covered region for every parent in the container, keyed
# by parent name (cached after the first call).
#
# A parent with a BAM uses the BAM consensus with ambiguities; a parent
# without one uses the gene model sequence, with the original base written
# at the SNP position unless that parent is the one carrying the SNP. In
# every case the base at the SNP position is upper-cased, and the parent
# named by +snp_in+ gets the SNP base there.
def parental_sequences
  return @parental_sequences if @parental_sequences

  region = covered_region
  snp_index = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    seq =
      if bam
        bam.consensus_with_ambiguities({:region => region}).to_s
      else
        model = container.gene_model_sequence(region)
        model[snp_index] = original unless carries_snp
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
|
379
|
+
|
380
|
+
# Same per-parent sequences as parental_sequences, but each one is cut and
# padded to the primer window with cut_and_pad_sequence_to_primer_region.
# Cached after the first call.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  region = covered_region
  snp_index = local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    seq =
      if bam
        bam.consensus_with_ambiguities({:region => region}).to_s
      else
        model = container.gene_model_sequence(region)
        model[snp_index] = original unless carries_snp
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @surrounding_parental_sequences[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
|
409
|
+
|
410
|
+
# Slice of +sequence+ from local_position - flanking_size to
# local_position + flanking_size, with the lower bound clamped at 0.
# No padding is added when the window runs past either end.
def cut_sequence_to_primer_region(sequence)
  centre = local_position
  first = centre - flanking_size
  first = 0 if first < 0
  last = centre + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
|
418
|
+
|
419
|
+
# Cuts +sequence+ to the window local_position +/- flanking_size and pads
# with "-" whatever part of that window falls outside the sequence, so the
# result is always 2 * flanking_size + 1 characters long with the SNP at
# index flanking_size.
#
# The right padding used to be measured against sequence.size instead of
# the last valid index, so a window truncated on the right came back one
# character shorter than an untruncated one.
def cut_and_pad_sequence_to_primer_region(sequence)
  ideal_min = self.local_position - flanking_size
  ideal_max = self.local_position + flanking_size
  left_pad = 0
  right_pad = 0

  if ideal_min < 0
    left_pad = -ideal_min
    ideal_min = 0
  end

  last_index = sequence.size - 1
  if ideal_max > last_index
    right_pad = ideal_max - last_index
    ideal_max = last_index
  end

  ("-" * left_pad) << sequence[ideal_min..ideal_max] << ("-" * right_pad)
end
|
436
|
+
|
437
|
+
# Parental sequences merged with the exon sequences around the SNP
# (exon entries win on duplicate keys). Cached after the first call.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
|
443
|
+
|
444
|
+
# Alignment of sequences_to_align produced by running the "mafft" program
# through Bio::MAFFT with --maxiterate 1000 --localpair --quiet.
# The alignment is cached after the first call.
def aligned_sequences
  @aligned_sequences ||= begin
    aligner = Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
    aligner.query_align(sequences_to_align).alignment
  end
end
|
455
|
+
|
456
|
+
# Multi-FASTA text with every aligned sequence (header "<snp>-<name>"),
# the alignment mask for this SNP's chromosome and the primer region of the
# parent carrying the SNP.
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self.to_s}-#{name}\n#{seq}\n"
  end
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
|
468
|
+
|
469
|
+
# Column of the SNP inside the alignment (cached after the first call);
# -1 when it is never reached.
#
# The walk uses as template the last aligned sequence that does not begin
# with a gap. A counter advances on every column except gaps that appear
# after the template has started; the SNP column is the one where the
# counter equals flanking_size. Warnings go to standard error when there
# are not exactly two parental sequences, or when both parents show the
# same character at the SNP column.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |parent| aligned_sequences[parent] }

  template = nil
  aligned_sequences.keys.each do |key|
    candidate = aligned_sequences[key]
    template = candidate unless candidate[0] == "-"
  end

  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  #TODO: Validate the cases when the alignment has padding on the left on all the chromosomes
  target = flanking_size
  counted = 0
  started = false
  found = -1

  parental_strings[0].size.times do |column|
    if counted == target
      found = column
      if parental_strings[0][column] == parental_strings[1][column]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{column})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end

    gap = (template[column] == "-")
    started = true unless gap
    counted += 1 unless started && gap
  end

  @aligned_snp_position = found
  found
end
|
508
|
+
|
509
|
+
# Builds a mask string, one character per alignment column, describing how
# the aligned sequence of +chromosome+ differs from the other exon sequences.
#
# Per column:
#   "-"  no other covering sequence differs, or only one sequence covers it
#   "*"  no sequence covers the column
#   lower-case base  some, but not all, of the other sequences differ
#   upper-case base  every other sequence differs and the number of covering
#                    sequences from the same chromosome group equals
#                    genomes_count
#   "&"  the SNP column; replaced by ":" when both the original and the SNP
#        base are accepted by the IUPAC code of the bases seen there
#
# Two warning lines whose guard could never be true (String#upcase always
# returns a truthy value) and an unused local were removed; the mask
# produced is unchanged.
def mask_aligned_chromosomal_snp(chromosome)
  names = exon_sequences.keys
  snp_column = aligned_snp_position

  target = aligned_sequences[chromosome]
  masked_snps = target ? target.downcase : "-" * aligned_sequences.values[0].size
  expected_snps = names.size - 1

  #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
  masked_snps.size.times do |i|
    different = 0
    cov = 0
    from_group = 0

    names.each do |chr|
      aligned = aligned_sequences[chr]
      next unless aligned && aligned[i] != "-"

      cov += 1
      from_group += 1 if chr[0] == chromosome_group
      next if chr == chromosome

      different += 1 if masked_snps[i].upcase != aligned[i].upcase
    end

    # Order matters: later assignments override earlier ones.
    masked_snps[i] = "-" if different == 0
    masked_snps[i] = "-" if cov == 1
    masked_snps[i] = "*" if cov == 0
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && from_group == genomes_count

    next unless i == snp_column

    masked_snps[i] = "&"
    bases = ""
    names.each do |chr|
      aligned = aligned_sequences[chr]
      bases << aligned[i] if aligned && aligned[i] != "-"
    end

    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ":"
    end
  end

  masked_snps
end
|
562
|
+
|
563
|
+
# Mask over the exon sequences (not the alignment) for +chromosome+.
#
# The starting mask is the exon sequence of +chromosome+ with "-" turned
# into "+", or a run of "-" the size of the covered region when there is
# none. Columns outside the window (local_position - flanking_size,
# local_position + flanking_size] become "*". Inside the window a column
# becomes "-" when no other sequence differs (unless it is "+") or when
# fewer than two sequences cover it, and is upper-cased when more than one
# other sequence differs. The SNP column is always "&".
def masked_chromosomal_snps(chromosome)
  chromosomes = exon_sequences
  names = chromosomes.keys
  own = chromosomes[chromosome]
  masked_snps = own ? own.tr("-", "+") : "-" * covered_region.size

  snp_index = local_position
  window_start = snp_index - flanking_size
  window_end = snp_index + flanking_size

  masked_snps.size.times do |i|
    if i > window_start && i <= window_end
      different = 0
      cov = 0
      names.each do |chr|
        next if chromosomes[chr][i] == "-"

        cov += 1
        next if chr == chromosome || masked_snps[i] == "+"

        different += 1 if masked_snps[i] != chromosomes[chr][i]
      end
      masked_snps[i] = "-" if different == 0 && masked_snps[i] != "+"
      masked_snps[i] = "-" if cov < 2
      masked_snps[i] = masked_snps[i].upcase if different > 1
    else
      masked_snps[i] = "*"
    end

    masked_snps[i] = "&" if i == snp_index
  end

  masked_snps
end
|
600
|
+
|
601
|
+
# Mask over the exon sequences around the SNP for +chromosome+.
#
# The starting mask is the surrounding exon sequence of +chromosome+ with
# "-" turned into "+", or 2 * flanking_size dashes when there is none.
# Sequences showing "-", "N" or "n" at a column do not count as covering
# it. A column becomes "-" when no other sequence differs (unless it is
# "+") or when fewer than two sequences cover it, and is upper-cased when
# more than one other sequence differs. Index flanking_size is always "&".
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  own = chromosomes[chromosome]
  masked_snps = own ? own.tr("-", "+") : "-" * (flanking_size * 2)
  snp_index = flanking_size

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][i]
      next if base == "-" || base == 'N' || base == 'n'

      cov += 1
      next if chr == chromosome || masked_snps[i] == "+"

      different += 1 if masked_snps[i] != chromosomes[chr][i]
    end

    masked_snps[i] = "-" if different == 0 && masked_snps[i] != "+"
    masked_snps[i] = "-" if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = "&" if i == snp_index
  end

  masked_snps
end
|
637
|
+
|
638
|
+
# Chromosome sequence around the SNP for every loaded exon, keyed like
# exon_list. Exons that report the SNP inside a gap are left out.
# Cached after the first call.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chr, exon|
    # The flanking region is requested before looking at snp_in_gap,
    # as the original code did.
    flank = exon.target_flanking_region_from_position(position, flanking_size)
    #TODO: Padd when the exon goes over the regions...
    next if exon.snp_in_gap

    @surrounding_exon_sequences[chr] = container.chromosome_sequence(flank)
  end
  @surrounding_exon_sequences
end
|
656
|
+
|
657
|
+
|
658
|
+
# Exon sequences laid out on the coordinates of the covered region, keyed
# like exon_list (cached after the first call).
#
# Each sequence is left-padded with "-" by the distance between the exon's
# query start and the region start, has the base at the local SNP position
# upper-cased, and is right-padded with "-". Exons reporting the SNP in a
# gap are skipped (their chromosome sequence is still fetched). An all-gap
# entry is added for this SNP's own chromosome when none was stored.
def exon_sequences
  return @exon_sequences if @exon_sequences

  region = covered_region
  snp_index = local_position

  @exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chr, exon|
    lead = exon.query_region.start - region.start
    padded = "-" * lead
    padded << container.chromosome_sequence(exon.target_region).to_s
    next if exon.snp_in_gap

    padded[snp_index] = padded[snp_index].upcase
    padded << "-" * (region.size - padded.size + 1)
    @exon_sequences[chr] = padded
  end

  @exon_sequences[@chromosome] ||= "-" * region.size
  @exon_sequences
end
|
680
|
+
end
|
681
|
+
end
|