bio-polyploid-tools 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +67 -0
- data/README +21 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +133 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +15 -0
- data/bin/find_best_blat_hit.rb +32 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +155 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/polymarker.rb +219 -0
- data/bin/snps_between_bams.rb +106 -0
- data/bio-polyploid-tools.gemspec +139 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +698 -0
- data/lib/bio/BIOExtensions.rb +186 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
- data/lib/bio/PolyploidTools/SNP.rb +681 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
- data/lib/bio/SAMToolsExtensions.rb +284 -0
- data/lib/bio/db/exonerate.rb +272 -0
- data/lib/bio/db/fastadb.rb +164 -0
- data/lib/bio/db/primer3.rb +673 -0
- data/lib/bioruby-polyploid-tools.rb +25 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/Test3Aspecific.csv +1 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +51 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +53 -0
- data/test/test_snp_parsing.rb +40 -0
- metadata +201 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Bio::PolyploidTools
|
|
2
|
+
# Holds a primer template sequence, the position of the SNP inside it, and
# four lists of positions that callers fill in as candidate primer anchors.
class PrimerRegion
  attr_accessor :snp_pos, :sequence, :homeologous,
                :chromosome_specific, :almost_chromosome_specific,
                :crhomosome_specific_intron, :almost_crhomosome_specific_intron

  # Correctly spelled aliases. The misspelled "crhomosome" accessors are kept
  # because existing callers use them.
  alias_method :chromosome_specific_intron, :crhomosome_specific_intron
  alias_method :chromosome_specific_intron=, :crhomosome_specific_intron=
  alias_method :almost_chromosome_specific_intron, :almost_crhomosome_specific_intron
  alias_method :almost_chromosome_specific_intron=, :almost_crhomosome_specific_intron=

  # Starts with four empty position lists; snp_pos, sequence and homeologous
  # stay nil until assigned.
  def initialize
    @chromosome_specific = []
    @almost_chromosome_specific = []
    @crhomosome_specific_intron = []
    @almost_crhomosome_specific_intron = []
  end

  # Number of positions in the two non-intron lists.
  def tail_candidates
    @chromosome_specific.size + @almost_chromosome_specific.size
  end

  # FASTA record whose header lists the SNP position and the four position
  # lists (arrays are rendered in their inspect form, e.g. "[1, 2]").
  def to_fasta
    header = "Primer_#{snp_pos}_#{chromosome_specific}_#{almost_chromosome_specific}" \
             "_#{crhomosome_specific_intron}_#{almost_crhomosome_specific_intron}"
    ">#{header}\n#{sequence}\n"
  end
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
require 'bio'
|
|
2
|
+
module Bio::PolyploidTools
|
|
3
|
+
# Error type raised by SNP, e.g. when a covered region is requested before
# any exon has been loaded.
class SNPException < RuntimeError; end
|
|
5
|
+
class SNP
|
|
6
|
+
|
|
7
|
+
# Input columns: GENE,ORIGINAL,POS,SNP
attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name
attr_accessor :exon_list, :container
attr_accessor :flanking_size, :ideal_min, :ideal_max
attr_accessor :template_sequence, :use_reference, :genomes_count

# Read-only here; the value is assigned through the custom chromosome= writer.
attr_reader :chromosome
|
|
17
|
+
|
|
18
|
+
# Builds a SNP from one comma-separated line.
#
# Format:  Gene_name,Original,SNP_Pos,SNP,Chromosome
# Example: A_comp0_c0_seq1,C,519,A,2AL
#
# The alleles are stripped and upcased, the position is converted to an
# Integer and the chromosome is stripped before being handed to chromosome=.
# The gene name is stored exactly as read. The input string is not modified.
#
# Raises ArgumentError when the line has fewer than five fields.
def self.parse(reg_str)
  gene, original, position, snp_base, chromosome = reg_str.chomp.split(',')
  if [original, position, snp_base, chromosome].any?(&:nil?)
    raise ArgumentError, "Malformed SNP line (expected 5 comma-separated fields): #{reg_str.inspect}"
  end

  snp = SNP.new
  snp.gene = gene
  # Non-destructive strip/upcase: the bang variants return nil when the string
  # is already clean, which used to yield position 0 or a NoMethodError.
  snp.original = original.strip.upcase
  snp.position = position.strip.to_i
  snp.snp = snp_base.strip.upcase
  # Strip before assigning so leading blanks are not counted by the writer.
  snp.chromosome = chromosome.strip
  snp.exon_list = {}
  snp.use_reference = false
  snp
end
|
|
33
|
+
|
|
34
|
+
# genomes_count: number of genomes expected per chromosome group. It defaults
# to 3, the value that used to be hard-coded; other polyploids can pass their
# own count here or set it later through the genomes_count accessor.
def initialize(genomes_count = 3)
  @genomes_count = genomes_count
end
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Keeps only the first two characters of the given name (the chromosome);
# whatever follows, such as the arm, is dropped.
def chromosome=(chr)
  @chromosome = chr.slice(0, 2)
end
|
|
43
|
+
|
|
44
|
+
# First character of the stored chromosome name.
def chromosome_group
  chromosome.slice(0)
end
|
|
47
|
+
|
|
48
|
+
# Character at index 3 of the chromosome name, or nil when the name is shorter.
#
# NOTE(review): this method used to be defined twice in a row. Ruby keeps only
# the last definition, so the earlier variant (which returned chromosome[1])
# never ran. The effective behaviour is kept here. Because chromosome= stores
# at most two characters, this looks like it always returns nil for values set
# through the writer - confirm whether chromosome[1] was the intended result.
def chromosome_genome
  chromosome[3]
end
|
|
56
|
+
|
|
57
|
+
# FASTA record with the gene as header and the template sequence as body.
def to_fasta
  header = ">#{gene}"
  "#{header}\n#{template_sequence}\n"
end
|
|
60
|
+
|
|
61
|
+
# Registers +exon+ under +arm+. When an exon is already stored for that arm,
# it is replaced only if the new one has a strictly higher record score.
def add_exon(exon, arm)
  best_so_far = (@exon_list[arm] ||= exon)
  @exon_list[arm] = exon if exon.record.score > best_so_far.record.score
end
|
|
65
|
+
|
|
66
|
+
# Region of the gene used as the working window for this SNP.
#
# * With use_reference set: position +/- flanking_size, with the start clamped
#   to 1. This region is rebuilt on every call and is not memoized.
# * Otherwise: the span from the smallest start to the largest end among the
#   SNP position and the query regions of all loaded exons. This result is
#   memoized in @covered_region.
#
# Raises SNPException when no exons are loaded and use_reference is not set.
def covered_region
  return @covered_region if @covered_region

  if use_reference
    window = Bio::DB::Fasta::Region.new
    window.entry = gene
    window.orientation = :forward
    window.start = [position - flanking_size, 1].max
    window.end = position + flanking_size
    return window
  end

  raise SNPException, "Exons haven't been loaded for #{self}" if @exon_list.empty?

  exon_regions = @exon_list.each_value.map(&:query_region)
  region = Bio::DB::Fasta::Region.new
  region.entry = gene
  region.orientation = :forward
  region.start = [@position, *exon_regions.map(&:start)].min
  region.end = [@position, *exon_regions.map(&:end)].max
  @covered_region = region
end
|
|
101
|
+
|
|
102
|
+
# flanking_size minus the SNP's local position, plus one.
def left_padding
  flanking_size + 1 - local_position
end
|
|
107
|
+
|
|
108
|
+
# Twice the flanking size minus (left padding + covered region size),
# never less than zero.
def right_padding
  [(2 * flanking_size) - (left_padding + covered_region.size), 0].max
end
|
|
113
|
+
|
|
114
|
+
# Offset of the SNP from the start of the covered region.
def local_position
  snp_site = position
  snp_site - covered_region.start
end
|
|
118
|
+
|
|
119
|
+
# Shifts +pos+ by the left padding.
def padded_position(pos)
  shift = left_padding
  pos + shift
end
|
|
122
|
+
|
|
123
|
+
# Multi-FASTA text with one record per parent (parental sequence over the
# covered region), one record per exon sequence, and a final ">Mask" record.
#
# The mask is computed for this SNP's own chromosome. The previous call passed
# a hard-coded "1BS" plus a second argument that masked_chromosomal_snps does
# not accept, so it raised ArgumentError on every call.
def exon_fasta_string
  gene_region = covered_region
  snp_number = local_position + 1 # shown 1-based in the header
  fasta = ''.dup
  container.parents.each do |name, _bam|
    fasta << ">#{gene_region.entry}-#{position}_#{name} Overlapping_exons:#{gene_region} localSNPpo:#{snp_number}\n"
    fasta << parental_sequences[name] << "\n"
  end
  exon_sequences.each do |arm, exon_seq|
    fasta << ">#{arm}\n#{exon_seq}\n"
  end
  fasta << ">Mask\n#{masked_chromosomal_snps(chromosome)}\n"
end
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Multi-FASTA text for the primer window: the surrounding parental sequences,
# the surrounding exon sequences, a ">Mask" record for this SNP's chromosome,
# and finally the primer region record for the parent carrying the SNP.
def primer_fasta_string
  gene_region = covered_region
  fasta = ''.dup

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{gene_region.entry}-#{position}_#{name}\n"
    fasta << "#{seq}\n"
  end

  surrounding_exon_sequences.each do |arm, exon_seq|
    fasta << ">#{arm}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
end
|
|
166
|
+
|
|
167
|
+
def primer_region(target_chromosome, parental )
|
|
168
|
+
parental = aligned_sequences[parental].downcase
|
|
169
|
+
chromosome_seq = aligned_sequences[target_chromosome]
|
|
170
|
+
chromosome_seq = "-" * parental.size unless chromosome_seq
|
|
171
|
+
chromosome_seq = chromosome_seq.downcase
|
|
172
|
+
mask = mask_aligned_chromosomal_snp(target_chromosome)
|
|
173
|
+
#puts "'#{mask}'"
|
|
174
|
+
|
|
175
|
+
pr = PrimerRegion.new
|
|
176
|
+
position_in_region = 0
|
|
177
|
+
(0..parental.size-1).each do |i|
|
|
178
|
+
|
|
179
|
+
if chromosome_seq[i] != '-' or parental[i] != '-'
|
|
180
|
+
case
|
|
181
|
+
when mask[i] == '&'
|
|
182
|
+
#This is the SNP we take the parental
|
|
183
|
+
pr.snp_pos = position_in_region
|
|
184
|
+
pr.homeologous = false
|
|
185
|
+
when mask[i] == ':'
|
|
186
|
+
#This is the SNP we take the parental
|
|
187
|
+
pr.snp_pos = position_in_region
|
|
188
|
+
pr.homeologous = true
|
|
189
|
+
when mask[i] == '-'
|
|
190
|
+
#When the mask doesnt detect a SNP, so we take the parental
|
|
191
|
+
parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
|
|
192
|
+
|
|
193
|
+
when /[[:upper:]]/.match(mask[i])
|
|
194
|
+
#This is a good candidate for marking a SNP
|
|
195
|
+
#We validate that the consensus from the sam file accepts the variation from the chromosomal sequence
|
|
196
|
+
if parental[i] == '-'
|
|
197
|
+
parental[i] = mask[i]
|
|
198
|
+
pr.crhomosome_specific_intron << position_in_region
|
|
199
|
+
elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
|
|
200
|
+
parental[i] = mask[i]
|
|
201
|
+
pr.chromosome_specific << position_in_region
|
|
202
|
+
end
|
|
203
|
+
when /[[:lower:]]/.match(mask[i])
|
|
204
|
+
#this is not that good candidate, but sitll gives specificity
|
|
205
|
+
|
|
206
|
+
if parental[i] == '-'
|
|
207
|
+
parental[i] = mask[i]
|
|
208
|
+
pr.almost_crhomosome_specific_intron << position_in_region
|
|
209
|
+
elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
|
|
210
|
+
parental[i] = mask[i].upcase
|
|
211
|
+
pr.almost_chromosome_specific << position_in_region
|
|
212
|
+
end
|
|
213
|
+
end #Case closes
|
|
214
|
+
position_in_region += 1
|
|
215
|
+
end #Closes region with bases
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
pr.sequence=parental.gsub('-','')
|
|
219
|
+
pr
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
# Reverse complement of a nucleotide string. IUPAC ambiguity codes are
# complemented too and letter case is preserved. Returns a new string.
def reverse_complement_string(sequenc_str)
  sequenc_str.reverse.tr('atgcrymkdhvbswnATGCRYMKDHVBSWN', 'tacgyrkmhdbvswnTACGYRKMHDBVSWN')
end
|
|
226
|
+
|
|
227
|
+
# Formats Primer3 input records for one template.
#
# opts keys: :name, :sequence, :left_pos and, optionally, :right_pos.
#
# * With :right_pos, one record is produced. When left_pos > right_pos both
#   positions are mirrored, the template is reverse-complemented and the
#   record is tagged "reverse"; otherwise it is tagged "forward".
# * Without :right_pos, two records are produced: the forward one followed by
#   the reverse one (mirrored left position, reverse-complemented template).
#
# Each record ends with a line containing only "=".
def return_primer_3_string(opts = {})
  left = opts[:left_pos]
  right = opts[:right_pos]
  sequence = opts[:sequence]
  name = opts[:name]

  record = lambda do |orientation, left_end, right_end, template|
    lines = ["SEQUENCE_ID=#{name} #{orientation}", "SEQUENCE_FORCE_LEFT_END=#{left_end}"]
    lines << "SEQUENCE_FORCE_RIGHT_END=#{right_end}" if right
    lines << "SEQUENCE_TEMPLATE=#{template}" << '='
    lines.join("\n") << "\n"
  end

  if right
    if left > right
      last_index = sequence.size - 1
      record.call('reverse', last_index - left, last_index - right, reverse_complement_string(sequence))
    else
      record.call('forward', left, right, sequence)
    end
  else
    # No right primer position given: emit both orientations.
    forward = record.call('forward', left, nil, sequence)
    forward + record.call('reverse', sequence.size - left - 1, nil, reverse_complement_string(sequence))
  end
end
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# Builds every Primer3 input record for this SNP on +target_chromosome+.
#
# For each candidate anchor position of the primer region, two records are
# produced: one with the original allele at the SNP position (labelled with
# original_name) and one with the SNP allele (labelled with snp_in). Anchors
# are visited in this order: chromosome-specific exon, semi-specific exon,
# chromosome-specific intron, semi-specific intron. A final pair without a
# right position ("chromosome_nonspecific all") is always appended.
#
# Returns an Array of record strings.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental)

  seq_original = String.new(pr.sequence)
  seq_original[pr.snp_pos] = original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = snp

  snp_type = pr.homeologous ? 'homeologous' : 'non-homeologous'
  marker = "#{gene}:#{original}#{position}#{snp}"
  alleles = [[original_name, seq_original], [snp_in, seq_snp]]

  # Intron anchors are labelled "intron" for both alleles; the SNP-allele
  # records used to be labelled "exon" by mistake.
  anchor_groups = [
    [pr.chromosome_specific, 'chromosome_specific exon'],
    [pr.almost_chromosome_specific, 'chromosome_semispecific exon'],
    [pr.crhomosome_specific_intron, 'chromosome_specific intron'],
    [pr.almost_crhomosome_specific_intron, 'chromosome_semispecific intron']
  ]

  records = []
  anchor_groups.each do |positions, label|
    positions.each do |pos|
      alleles.each do |parent, template|
        records << return_primer_3_string({ :name => "#{marker} #{parent} #{label} #{snp_type} #{chromosome}",
                                            :left_pos => pr.snp_pos, :right_pos => pos, :sequence => template })
      end
    end
  end

  alleles.each do |parent, template|
    records << return_primer_3_string({ :name => "#{marker} #{parent} chromosome_nonspecific all #{snp_type} #{chromosome}",
                                        :left_pos => pr.snp_pos, :sequence => template })
  end
  records
end
|
|
334
|
+
|
|
335
|
+
def to_s
|
|
336
|
+
"#{gene}:#{original}#{position}#{snp}#{chromosome}"
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
# Upcased short label: original allele, position, SNP allele.
def short_s
  [original, position, snp].map(&:to_s).join.upcase
end
|
|
342
|
+
|
|
343
|
+
# All Primer3 records for the given chromosome/parent joined into one string.
def primer_3_string(target_chromosome, parental)
  primer_3_all_strings(target_chromosome, parental).join
end
|
|
347
|
+
|
|
348
|
+
# Returns the exon stored under +chromosome+ in exon_list, or nil when there
# is none. A missing exon is reported on standard error, like the other
# warnings in this class, so that it cannot mix with data written to stdout.
def exon_for_chromosome(chromosome)
  selected_exon = exon_list[chromosome]
  $stderr.puts "No exon with chromosome #{chromosome} for #{gene}" unless selected_exon
  selected_exon
end
|
|
353
|
+
|
|
354
|
+
# Sequence of every parent over the covered region, keyed by parent name and
# memoized after the first call.
#
# A parent with an alignment (bam) contributes its consensus with ambiguity
# codes. A parent without one gets the gene model sequence, with the original
# allele written at the SNP position unless it is the parent carrying the SNP.
# The base at the SNP position is then upcased and, for the parent named by
# snp_in, replaced with the SNP allele.
def parental_sequences
  return @parental_sequences if @parental_sequences

  gene_region = covered_region
  snp_index = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = name == snp_in
    seq = if bam
            bam.consensus_with_ambiguities({ :region => gene_region }).to_s
          else
            model = container.gene_model_sequence(gene_region)
            model[snp_index] = original unless carries_snp
            model
          end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
|
|
379
|
+
|
|
380
|
+
# Same per-parent sequences as parental_sequences, but each one is cut and
# padded to the primer window before being stored. Memoized after the first
# call.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  gene_region = covered_region
  snp_index = local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = name == snp_in
    seq = if bam
            bam.consensus_with_ambiguities({ :region => gene_region }).to_s
          else
            model = container.gene_model_sequence(gene_region)
            # Parents without the SNP carry the original allele.
            model[snp_index] = original unless carries_snp
            model
          end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @surrounding_parental_sequences[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
|
|
409
|
+
|
|
410
|
+
# Slice of +sequence+ spanning flanking_size positions on each side of the
# SNP's local position, clipped to the sequence bounds (no padding is added).
def cut_sequence_to_primer_region(sequence)
  first = [local_position - flanking_size, 0].max
  last = local_position + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
|
|
418
|
+
|
|
419
|
+
# Like cut_sequence_to_primer_region, but fills the parts of the window that
# fall outside +sequence+ with '-' so the SNP keeps a fixed offset from the
# left edge.
#
# NOTE(review): when the window runs past the right end, the result is one
# character shorter than an unclipped window (2 * flanking_size instead of
# 2 * flanking_size + 1). Behaviour is kept as is - confirm whether the right
# padding should be one longer.
def cut_and_pad_sequence_to_primer_region(sequence)
  window_start = local_position - flanking_size
  window_end = local_position + flanking_size

  left_pad = window_start < 0 ? -window_start : 0
  right_pad = 0
  if window_end > sequence.size
    right_pad = window_end - sequence.size
    window_end = sequence.size - 1
  end

  ('-' * left_pad) + sequence[[window_start, 0].max..window_end] + ('-' * right_pad)
end
|
|
436
|
+
|
|
437
|
+
# Surrounding parental sequences merged with the surrounding exon sequences
# (exon entries win on key clashes). Memoized.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
|
|
443
|
+
|
|
444
|
+
# Alignment of sequences_to_align produced by the "mafft" program, run with
# --maxiterate 1000 --localpair --quiet. Memoized after the first call.
def aligned_sequences
  @aligned_sequences ||= begin
    mafft = Bio::MAFFT.new('mafft', %w[--maxiterate 1000 --localpair --quiet])
    mafft.query_align(sequences_to_align).alignment
  end
end
|
|
455
|
+
|
|
456
|
+
# Multi-FASTA text with every aligned sequence (headers prefixed with this
# SNP's label), a ">MASK" record for this SNP's chromosome, and the primer
# region record for the parent carrying the SNP.
def aligned_sequences_fasta
  label = to_s
  fasta = ''.dup
  # each_pair is called with a block on purpose: the alignment object is not
  # assumed to return an enumerator when no block is given.
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{label}-#{name}\n#{seq}\n"
  end
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
end
|
|
468
|
+
|
|
469
|
+
# Column of the SNP inside the multiple alignment, memoized after the first
# call. Returns -1 when no column matches.
#
# The last aligned sequence that does not start with a gap is used as the
# coordinate template. Columns are walked left to right while counting the
# template's non-gap characters; the SNP column is the last column reached
# with exactly flanking_size such characters to its left.
#
# Warnings are written to $stderr when there are not exactly two parental
# sequences, or when both parentals show the same base in a column matched
# as the SNP.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |parent| aligned_sequences[parent] }

  template = nil
  aligned_sequences.keys.each do |key|
    template = aligned_sequences[key] if aligned_sequences[key][0] != '-'
  end
  $stderr.puts "WARN: #{self} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  # TODO: Validate the cases when the alignment has padding on the left on all the chromosomes
  pos = -1
  bases_seen = 0
  started = false
  parental_strings[0].size.times do |i|
    if bases_seen == flanking_size
      pos = i
      if parental_strings[0][i] == parental_strings[1][i]
        $stderr.puts "WARN: #{self} doesn't have a SNP in the marked place (#{i})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end

    in_gap = template[i] == '-'
    started = true unless in_gap
    # Template gaps do not advance the counter once the template has started.
    bases_seen += 1 unless started && in_gap
  end
  @aligned_snp_position = pos
end
|
|
508
|
+
|
|
509
|
+
# Builds a per-column mask of the alignment for +chromosome+.
#
# The mask starts as the downcased aligned sequence of +chromosome+ (or all
# '-' when that sequence is missing). For every column, only the sequences
# named in exon_sequences that have a base there are considered:
#   '-'        no other sequence differs from the target, or only one
#              sequence covers the column
#   '*'        no sequence covers the column
#   uppercase  every other sequence differs and the number of covering
#              sequences from this SNP's chromosome group equals genomes_count
#   lowercase  some, but not all, of the other sequences differ
# The SNP column is marked '&', or ':' when the IUPAC code of the bases in
# that column is compatible with both the original and the SNP allele.
def mask_aligned_chromosomal_snp(chromosome)
  names = exon_sequences.keys
  snp_column = aligned_snp_position
  target = aligned_sequences[chromosome]
  masked_snps = target ? target.downcase : '-' * aligned_sequences.values[0].size
  expected_snps = names.size - 1

  # TODO: Make this chromosome specific, even when we have more than one
  # alignment going to the region we want.
  masked_snps.size.times do |i|
    different = 0
    cov = 0
    from_group = 0
    names.each do |chr|
      aligned = aligned_sequences[chr]
      next unless aligned && aligned[i] != '-'

      cov += 1
      from_group += 1 if chr[0] == chromosome_group
      different += 1 if chr != chromosome && masked_snps[i].upcase != aligned[i].upcase
    end

    masked_snps[i] = '-' if different.zero?
    masked_snps[i] = '-' if cov == 1
    masked_snps[i] = '*' if cov.zero?
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && from_group == genomes_count
    next unless i == snp_column

    masked_snps[i] = '&'
    bases = names.map { |chr| aligned_sequences[chr] }
                 .select { |aligned| aligned && aligned[i] != '-' }
                 .map { |aligned| aligned[i] }
                 .join
    code_reference = bases.empty? ? 'n' : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ':'
    end
  end
  masked_snps
end
|
|
562
|
+
|
|
563
|
+
# Per-position mask over the exon sequences for +chromosome+.
#
# The mask starts as that chromosome's exon sequence with '-' turned into '+'
# (or all '-' when the chromosome has no exon sequence). Positions outside
# the window (local_position - flanking_size, local_position + flanking_size]
# become '*'. Inside the window:
#   '-'        no other sequence differs, or fewer than two sequences cover
#              the position
#   uppercase  more than one other sequence differs
#   unchanged  exactly one other sequence differs
# The SNP position itself is always marked '&'.
def masked_chromosomal_snps(chromosome)
  chromosomes = exon_sequences
  names = chromosomes.keys
  target = chromosomes[chromosome]
  masked_snps = target ? target.tr('-', '+') : '-' * covered_region.size
  snp_index = local_position
  ideal_min = snp_index - flanking_size
  ideal_max = snp_index + flanking_size

  masked_snps.size.times do |i|
    if i > ideal_min && i <= ideal_max
      different = 0
      cov = 0
      names.each do |chr|
        base = chromosomes[chr][i]
        next if base == '-'

        cov += 1
        different += 1 if chr != chromosome && masked_snps[i] != '+' && masked_snps[i] != base
      end
      masked_snps[i] = '-' if different.zero? && masked_snps[i] != '+'
      masked_snps[i] = '-' if cov < 2
      masked_snps[i] = masked_snps[i].upcase if different > 1
    else
      masked_snps[i] = '*'
    end
    masked_snps[i] = '&' if i == snp_index
  end
  masked_snps
end
|
|
600
|
+
|
|
601
|
+
# Per-position mask over the surrounding exon sequences for +chromosome+.
#
# The mask starts as that chromosome's surrounding sequence with '-' turned
# into '+' (or 2 * flanking_size dashes when it is missing). Bases '-', 'N'
# and 'n' in any sequence do not count as coverage. For each position:
#   '-'        no other sequence differs, or fewer than two sequences cover
#              the position
#   uppercase  more than one other sequence differs
#   unchanged  exactly one other sequence differs
# The position at index flanking_size (the SNP) is always marked '&'.
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  target = chromosomes[chromosome]
  masked_snps = target ? target.tr('-', '+') : '-' * (flanking_size * 2)
  snp_index = flanking_size

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][i]
      next if ['-', 'N', 'n'].include?(base)

      cov += 1
      different += 1 if chr != chromosome && masked_snps[i] != '+' && masked_snps[i] != base
    end
    masked_snps[i] = '-' if different.zero? && masked_snps[i] != '+'
    masked_snps[i] = '-' if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = '&' if i == snp_index
  end
  masked_snps
end
|
|
637
|
+
|
|
638
|
+
# Chromosome sequence flanking the SNP for every loaded exon, keyed like
# exon_list and memoized after the first call. Exons that report the SNP as
# falling in a gap are left out.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |arm, exon|
    # The flanking region is always requested before snp_in_gap is consulted;
    # that order is kept from the original code.
    # TODO: Pad when the exon goes over the regions...
    flanking_region = exon.target_flanking_region_from_position(position, flanking_size)
    next if exon.snp_in_gap

    @surrounding_exon_sequences[arm] = container.chromosome_sequence(flanking_region)
  end
  @surrounding_exon_sequences
end
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
# Exon sequences laid out on the covered region, keyed like exon_list and
# memoized after the first call.
#
# Each sequence is left-padded with '-' up to the exon's start within the
# covered region, has the base at the SNP position upcased, and is
# right-padded with '-'. Exons that report the SNP in a gap are skipped.
# When nothing was stored for this SNP's own chromosome, an all-gap entry of
# the covered region's size is added for it.
def exon_sequences
  return @exon_sequences if @exon_sequences

  gene_region = covered_region
  snp_index = local_position
  @exon_sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |arm, exon|
    exon_seq = '-' * (exon.query_region.start - gene_region.start)
    exon_seq << container.chromosome_sequence(exon.target_region).to_s
    next if exon.snp_in_gap

    exon_seq[snp_index] = exon_seq[snp_index].upcase
    exon_seq << '-' * (gene_region.size - exon_seq.size + 1)
    @exon_sequences[arm] = exon_seq
  end
  @exon_sequences[@chromosome] ||= '-' * gene_region.size
  @exon_sequences
end
|
|
680
|
+
end
|
|
681
|
+
end
|