bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,804 @@
|
|
1
|
+
require 'bio'
|
2
|
+
module Bio::PolyploidTools
|
3
|
+
# Error raised for SNP-specific failures; behaves exactly like RuntimeError.
class SNPException < RuntimeError; end
|
5
|
+
|
6
|
+
class SNP
|
7
|
+
#GENE,ORIGINAL,POS,SNP
|
8
|
+
# Marker fields: gene/contig identifiers, alleles and coordinates.
attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name,
              :contig, :chromosome, :orientation
# Collaborators and alignment inputs attached to the marker.
attr_accessor :exon_list, :container, :template_sequence
# Primer-design parameters.
attr_accessor :flanking_size, :ideal_min, :ideal_max, :use_reference,
              :genomes_count, :primer_3_min_seq_length,
              :variation_free_region, :max_hits
# Processing results and diagnostics.
attr_accessor :errors, :repetitive, :hit_count, :snp_type
|
25
|
+
|
26
|
+
#Format:
|
27
|
+
#Gene_name,Original,SNP_Pos,SNP,chromosome
|
28
|
+
#A_comp0_c0_seq1,C,519,A,2A
|
29
|
+
# Builds a SNP from one comma-separated marker line.
#
# Expected columns: gene, original allele, position, SNP allele, chromosome
# (for example "A_comp0_c0_seq1,C,519,A,2A"). Alleles are upper-cased and,
# like position and chromosome, stripped of surrounding whitespace. The gene
# column is stored as given.
#
# reg_str - the line to parse. It is never modified, so frozen strings and
#           strings the caller still needs are safe to pass.
#
# Returns a new SNP with use_reference set to false.
# Raises SNPException when fewer than five columns are present.
def self.parse(reg_str)
  gene, original, position, snp_allele, chromosome = reg_str.chomp.split(",")
  if chromosome.nil?
    raise SNPException, "Expected 5 comma-separated fields (gene,original,position,snp,chromosome) in #{reg_str.inspect}"
  end

  snp = SNP.new
  snp.gene = gene
  snp.original = original.strip.upcase
  snp.position = position.strip.to_i
  snp.snp = snp_allele.strip.upcase
  snp.chromosome = chromosome.strip
  snp.use_reference = false
  snp
end
|
44
|
+
|
45
|
+
#Format:
|
46
|
+
#IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
|
47
|
+
# Builds a SNP from one tab-separated VCF record
# (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO).
#
# vcf_line       - the VCF record as a String.
# chr_arm_parser - callable that receives the CHROM column and returns the
#                  chromosome name to store on the SNP.
#
# The orientation is :forward unless the INFO column carries "OR=reverse".
# Returns the new SNP.
def self.parseVCF(vcf_line, chr_arm_parser: Bio::PolyploidTools::ChromosomeArm.getArmSelection("first_two") )
  contig, pos, id, ref, alt, _qual, _filter, info = vcf_line.split("\t")

  snp = SNP.new
  snp.gene = id
  snp.chromosome = chr_arm_parser.call(contig)
  snp.contig = contig
  snp.position = pos.strip.to_i
  snp.original = ref.upcase.strip
  snp.snp = alt.upcase.strip
  # Strip in place, exactly as the parser result is stored.
  snp.chromosome.strip!
  snp.orientation = :forward

  if info
    reversed = info.scan(/(\w+)=([\w|.]+)/).any? { |key, value| key == "OR" && value == "reverse" }
    snp.orientation = :reverse if reversed
  end
  snp
end
|
74
|
+
|
75
|
+
# Fetches the sequence around the SNP from an indexed FASTA file and stores it
# as template_sequence, with the SNP base replaced by the ambiguity code
# returned by Bio::NucleicAcid.to_IUAPC for the two alleles.
#
# fastaFile     - object providing index.region_for_entry and fetch_sequence.
# flanking_size - number of bases requested on each side of the SNP.
#
# The window is looked up under @contig when set, otherwise under gene, and is
# clipped to the entry. Side effect: @position is rewritten so that it is
# relative to the start of the fetched window (1-based).
def setTemplateFromFastaFile(fastaFile ,flanking_size: 100)
  window = Bio::DB::Fasta::Region.new
  window.entry = @contig ? @contig : gene
  window.start = position - flanking_size
  window.end = position + flanking_size + 1
  window.orientation = :forward

  indexed = fastaFile.index.region_for_entry(window.entry)
  window.start = 1 if window.start < 1
  window.end = indexed.length if window.end > indexed.length

  ambiguity = Bio::NucleicAcid.to_IUAPC("#{original}#{snp}")
  @position = [@position - window.start + 1, 1].max

  self.template_sequence = fastaFile.fetch_sequence(window)
  template_sequence[position - 1] = ambiguity
end
|
93
|
+
|
94
|
+
# Sets the defaults of a fresh marker: three genomes, a 50 bp minimum Primer3
# template, no variation-free region, at most 8 hits, and empty
# exon/error collections.
def initialize
  @contig = false
  @repetitive = false
  @hit_count = 0
  @max_hits = 8
  @genomes_count = 3
  @variation_free_region = 0
  @primer_3_min_seq_length = 50
  @errors = []
  # Each key gets its own array on first access.
  @exon_list = Hash.new { |store, arm| store[arm] = [] }
end
|
105
|
+
|
106
|
+
# Computes the window used to present this marker.
#
# flanking_size - bases wanted on each side of the SNP.
# total         - optional window length before the +1 adjustment; defaults
#                 to twice the flanking size.
#
# Returns [start, total, new_position], where start is clamped to 0.
def to_polymarker_coordinates(flanking_size, total:nil)
  first = [position - flanking_size + 1, 0].max
  span = (total || flanking_size * 2) + 1
  [first, span, position - first + 2]
end
|
114
|
+
|
115
|
+
# Renders the template with the SNP written as "[original/snp]" and returns
# the upper-cased slice around it.
#
# flanking_size - bases wanted on each side of the SNP.
# total         - optional slice length before the +5 adjustment (the bracket
#                 notation is 5 characters long); defaults to twice the
#                 flanking size.
#
# For :reverse markers the template and both alleles are reverse-complemented
# and the SNP index is mirrored before the notation is inserted.
def to_polymarker_sequence(flanking_size, total:nil)
  marked = template_sequence.clone
  alleles = "[#{original}/#{snp}]"
  index = position - 1

  if orientation == :reverse
    index = marked.length - index - 1
    marked = Bio::Sequence::NA.new(marked).reverse_complement
    original_rc = Bio::Sequence::NA.new(original).reverse_complement
    snp_rc = Bio::Sequence::NA.new(snp).reverse_complement
    alleles = "[#{original_rc}/#{snp_rc}]"
  end

  marked[index] = alleles
  first = [position - flanking_size - 1, 0].max
  span = (total || flanking_size * 2) + 5
  marked[first, span].upcase
end
|
136
|
+
|
137
|
+
# Compact identifier of the variation: original allele, position, SNP allele
# (for example "C519A").
def snp_id_in_seq
  format("%s%s%s", original, position, snp)
end
|
140
|
+
|
141
|
+
#We Only want the chromosome, we drop the arm.
|
142
|
+
#We don't use this any more.
|
143
|
+
#def chromosome= (chr)
|
144
|
+
# @chromosome = chr
|
145
|
+
#end
|
146
|
+
|
147
|
+
# First character of the chromosome name (nil when the name is empty).
def chromosome_group
  chromosome.slice(0)
end
|
150
|
+
|
151
|
+
# Returns the character at index 3 of the chromosome name, or nil when the
# name is shorter than four characters.
#
# This method used to be defined twice in a row; Ruby keeps only the last
# definition, so the earlier `chromosome[1]` variant never ran and only
# produced a "method redefined" warning. The effective (last) behaviour is
# kept here as the single definition.
# NOTE(review): index 3 looks unusual next to chromosome_group (index 0);
# confirm which character is expected to carry the genome letter.
def chromosome_genome
  chromosome[3] || nil
end
|
159
|
+
|
160
|
+
# FASTA record for the template: ">gene" header followed by the sequence.
def to_fasta
  format(">%s\n%s\n", gene, template_sequence)
end
|
163
|
+
|
164
|
+
# Registers an exon alignment for a chromosome arm.
#
# exon        - object exposing record.score.
# arm         - key under which the exon is stored in exon_list.
# filter_best - when true (default) only the best-scoring exon is kept per
#               arm: a new exon replaces the stored one only if its score is
#               strictly higher. When false, every exon is appended.
def add_exon(exon, arm, filter_best: true)
  exon_list[arm] ||= []
  stored = exon_list[arm]
  return stored << exon unless filter_best && !stored.empty?

  exon_list[arm] = [exon] if exon.record.score > stored.first.record.score
end
|
173
|
+
|
174
|
+
# Region of the gene covered by this marker, as a forward
# Bio::DB::Fasta::Region whose entry is the gene.
#
# * With use_reference: position +/- flanking_size, start clamped to 1. This
#   result is returned without being memoized.
# * Otherwise: the span from the lowest exon query start to the highest exon
#   query end, always including the SNP position. When no exons are loaded
#   the span defaults to position +/- flanking_size (start clamped to 1).
#   This result is cached in @covered_region.
def covered_region
  return @covered_region if @covered_region

  new_region = lambda do |first, last|
    region = Bio::DB::Fasta::Region.new()
    region.entry = gene
    region.orientation = :forward
    region.start = first
    region.end = last
    region
  end

  if use_reference
    return new_region.call([position - flanking_size, 1].max, position + flanking_size)
  end

  lower = upper = @position
  if @exon_list.size == 0
    lower = [position - flanking_size, 1].max
    upper = position + flanking_size
  end

  @exon_list.each_value do |exons|
    exons.each do |exon|
      hit = exon.query_region
      lower = hit.start if hit.start < lower
      upper = hit.end if hit.end > upper
    end
  end

  @covered_region = new_region.call(lower, upper)
end
|
213
|
+
|
214
|
+
# Offset added to local coordinates by padded_position:
# flanking_size + 1 minus the SNP's local position.
def left_padding
  (flanking_size + 1) - local_position
end
|
219
|
+
|
220
|
+
# Room left on the right of a 2 * flanking_size window after the left padding
# and the covered region are accounted for; never negative.
def right_padding
  [2 * flanking_size - left_padding - covered_region.size, 0].max
end
|
225
|
+
|
226
|
+
# SNP position relative to the start of covered_region
# (0 when the SNP sits on the region's first base).
def local_position
  region_start = covered_region.start
  position - region_start
end
|
230
|
+
|
231
|
+
# Shifts a local coordinate by left_padding.
def padded_position(pos)
  shift = left_padding
  pos + shift
end
|
234
|
+
|
235
|
+
# Builds a multi-FASTA dump of everything used to design primers for this
# marker, in this order:
#   1. one record per parental sequence (">entry-position_parent"),
#   2. one record per surrounding exon sequence,
#   3. a ">Mask" record for the marker's chromosome,
#   4. the FASTA rendering of the primer region for (chromosome, snp_in).
#
# Returns the concatenated String.
def primer_fasta_string
  gene_region = covered_region
  fasta = ""

  surrounding_parental_sequences.each do |name, seq|
    fasta << ">#{gene_region.entry}-#{position}_#{name}\n#{seq}\n"
  end

  # The block parameter is deliberately not called `chromosome`, so it cannot
  # be confused with the accessor used below.
  surrounding_exon_sequences.each do |exon_name, exon_seq|
    fasta << ">#{exon_name}\n#{exon_seq}\n"
  end

  fasta << ">Mask\n#{surrounding_masked_chromosomal_snps(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
end
|
256
|
+
|
257
|
+
# Walks the alignment column by column and builds the PrimerRegion used to
# place primers for one parental sequence against one target chromosome.
#
# target_chromosome - chromosome name; resolved to an aligned sequence name
#                     through get_target_sequence.
# parental          - key of the parental sequence in aligned_sequences.
#
# Columns where both the parental and the chromosome sequence are gaps are
# skipped; every other column advances the position inside the region. The
# mask character of the column decides what is recorded:
#   '&'        the SNP itself, non-homoeologous
#   ':'        the SNP itself, homoeologous
#   '-'        no variation; the chromosome base is used when the parental
#              base is not unambiguous
#   upper case candidate position (recorded as chromosome specific)
#   lower case weaker candidate (recorded as almost chromosome specific)
# For the two candidate kinds a parental gap is filed in the corresponding
# "intron" list, otherwise the position is kept only if
# Bio::NucleicAcid.is_valid accepts the parental/mask pair.
#
# Returns the PrimerRegion, whose sequence is the edited parental sequence
# with gaps removed.
def primer_region(target_chromosome, parental )
  parental = aligned_sequences[parental].downcase
  target_chromosome = get_target_sequence(aligned_sequences.keys, target_chromosome)

  chromosome_seq = (aligned_sequences[target_chromosome] || "-" * parental.size).downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  pr = PrimerRegion.new
  column = 0
  parental.size.times do |i|
    next if chromosome_seq[i] == '-' && parental[i] == '-'

    case mask[i]
    when '&'
      pr.snp_pos = column
      pr.homoeologous = false
    when ':'
      pr.snp_pos = column
      pr.homoeologous = true
    when '-'
      parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
    when /[[:upper:]]/
      if parental[i] == '-'
        parental[i] = mask[i]
        pr.crhomosome_specific_intron << column
      elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
        parental[i] = mask[i]
        pr.chromosome_specific << column
      end
    when /[[:lower:]]/
      if parental[i] == '-'
        parental[i] = mask[i]
        pr.almost_crhomosome_specific_intron << column
      elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
        parental[i] = mask[i].upcase
        pr.almost_chromosome_specific << column
      end
    end
    column += 1
  end

  pr.sequence = parental.gsub('-', '')
  pr
end
|
314
|
+
|
315
|
+
# Reverse complement of a nucleotide string, including IUPAC ambiguity codes
# and preserving letter case. Characters outside the table (such as '-') are
# kept as they are. The argument is not modified.
def reverse_complement_string(sequenc_str)
  bases = 'atgcrymkdhvbswnATGCRYMKDHVBSWN'
  complements = 'tacgyrkmhdbvswnTACGYRKMHDBVSWN'
  sequenc_str.tr(bases, complements).reverse
end
|
319
|
+
|
320
|
+
# Renders one or two Primer3 input records for a template.
#
# opts keys read by this method:
#   :name      - used in SEQUENCE_ID
#   :sequence  - template sequence
#   :left_pos  - forced left primer end
#   :right_pos - optional forced right primer end
#   :extra     - optional text appended to every record
#   :extra_f / :extra_r - optional text that replaces the
#                SEQUENCE_FORCE_LEFT_END line of the forward / reverse record
#
# With :right_pos a single record is produced; when left > right both ends are
# mirrored and the template is reverse-complemented first. If
# @variation_free_region is positive, that many bases after the right end must
# be lower case, otherwise nil is returned.
# Without :right_pos two records are produced: the forward template and its
# reverse complement with the left end mirrored.
def return_primer_3_string(opts={})
  name = opts[:name]
  left = opts[:left_pos]
  right = opts[:right_pos]
  template = opts[:sequence]
  extra = opts[:extra]
  paired = opts[:right_pos]
  strand = "forward"

  if paired
    if left > right
      last_index = template.size - 1
      left = last_index - left
      right = last_index - right
      template = reverse_complement_string(template)
      strand = "reverse"
    end
    if @variation_free_region > 0
      downstream = template[right + 1, @variation_free_region]
      return nil unless downstream == downstream.downcase
    end
  end

  record = "SEQUENCE_ID=#{name} #{strand} \n"
  record << "SEQUENCE_FORCE_LEFT_END=#{left}\n" unless opts[:extra_f]
  record << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if paired
  record << extra if extra
  record << opts[:extra_f] if opts[:extra_f]
  record << "SEQUENCE_TEMPLATE=#{template}\n"
  record << "=\n"
  return record if paired

  # No right primer was forced: also emit the opposite orientation.
  forward_template = opts[:sequence]
  mirrored_left = forward_template.size - left - 1
  reverse_template = reverse_complement_string(forward_template)
  record << "SEQUENCE_ID=#{name} reverse\n"
  record << "SEQUENCE_FORCE_LEFT_END=#{mirrored_left}\n" unless opts[:extra_r]
  record << opts[:extra_r] if opts[:extra_r]
  record << "SEQUENCE_TEMPLATE=#{reverse_template}\n"
  record << extra if extra
  record << "=\n"
end
|
372
|
+
|
373
|
+
|
374
|
+
# Builds every Primer3 input record for this marker.
#
# target_chromosome    - chromosome the primers should be specific to.
# parental             - key of the parental sequence used as template.
# max_specific_primers - when the number of candidate specific positions
#                        exceeds this value, only the non-specific records
#                        are produced.
#
# Returns an Array with the values returned by return_primer_3_string, two
# per candidate (original allele, then SNP allele), followed by the two
# non-specific entries. The Array is empty, and a message is appended to
# errors, when the template is shorter than primer_3_min_seq_length or when
# hit_count exceeds max_hits; in the latter case the marker is also flagged
# as repetitive.
#
# Side effects: sets @snp_type, and @repetitive for over-mapped markers.
def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20 )
  pr = primer_region(target_chromosome, parental )
  primer_3_propertes = []

  seq_original = String.new(pr.sequence)
  if seq_original.size < primer_3_min_seq_length
    errors << "The sequence (#{seq_original.size}) is shorter than #{primer_3_min_seq_length}"
    return primer_3_propertes
  end

  if hit_count > max_hits
    errors << "The marker maps to #{hit_count} positions (max_hits: #{max_hits}). "
    # Must write the attribute: a bare `repetitive = true` only creates a
    # local variable and leaves the marker unflagged.
    @repetitive = true
    return primer_3_propertes
  end

  seq_original[pr.snp_pos] = original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = snp

  @snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  marker = "#{gene}:#{original}#{position}#{snp}"
  # Appends the record pair (original allele, SNP allele) for one candidate.
  add_pair = lambda do |specificity, region, snp_region, right_pos = nil|
    args = { :name => "#{marker} #{original_name} #{specificity} #{region} #{@snp_type} #{chromosome}",
             :left_pos => pr.snp_pos, :sequence => seq_original }
    args[:right_pos] = right_pos unless right_pos.nil?
    primer_3_propertes << return_primer_3_string(args)
    args[:name] = "#{marker} #{snp_in} #{specificity} #{snp_region} #{@snp_type} #{chromosome}"
    args[:sequence] = seq_snp
    primer_3_propertes << return_primer_3_string(args)
  end

  candidate_sets = [
    [pr.chromosome_specific,               "chromosome_specific",     "exon"],
    [pr.crhomosome_specific_intron,        "chromosome_specific",     "intron"],
    [pr.almost_chromosome_specific,        "chromosome_semispecific", "exon"],
    [pr.almost_crhomosome_specific_intron, "chromosome_semispecific", "intron"]
  ]
  total_candidates = candidate_sets.inject(0) { |count, (positions, _, _)| count + positions.size }

  unless total_candidates > max_specific_primers
    candidate_sets.each do |positions, specificity, region|
      positions.each do |pos|
        # NOTE(review): the SNP-allele record has always been labelled "exon",
        # even for intron candidates. Kept byte-for-byte because the label is
        # part of the SEQUENCE_ID; confirm whether "intron" was intended.
        add_pair.call(specificity, region, "exon", pos)
      end
    end
  end

  add_pair.call("chromosome_nonspecific", "all", "all")
  primer_3_propertes
end
|
456
|
+
|
457
|
+
def to_s
|
458
|
+
"#{gene}:#{original}#{position}#{snp}#{chromosome}"
|
459
|
+
end
|
460
|
+
|
461
|
+
# Upper-cased "original-position-snp" identifier without separators.
def short_s
  format("%s%s%s", original, position, snp).upcase
end
|
464
|
+
|
465
|
+
# Concatenates every Primer3 record produced by primer_3_all_strings into a
# single String.
def primer_3_string(target_chromosome, parental, max_specific_primers: 20)
  primer_3_all_strings(
    target_chromosome,
    parental,
    max_specific_primers: max_specific_primers
  ).join
end
|
469
|
+
|
470
|
+
# Looks up the exons stored for a chromosome in exon_list and returns them.
# Prints a notice to standard output when the lookup yields nothing.
def exon_for_chromosome(chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
|
475
|
+
|
476
|
+
# Sequence of every parent over covered_region, keyed by parent name.
#
# A parent with a BAM gets its consensus (with ambiguities); a parent without
# one gets the gene model sequence, with the original allele written at the
# SNP position unless that parent carries the SNP. In every sequence the SNP
# position is upper-cased, and the parent named by snp_in receives the SNP
# allele there.
#
# The result is cached in @parental_sequences.
def parental_sequences
  return @parental_sequences if @parental_sequences

  gene_region = covered_region
  snp_index = local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    seq = if bam
            bam.consensus_with_ambiguities({:region=>gene_region}).to_s
          else
            model = container.gene_model_sequence(gene_region)
            model[snp_index] = original unless carries_snp
            model
          end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
|
499
|
+
|
500
|
+
|
501
|
+
|
502
|
+
|
503
|
+
# Like the parental sequences over covered_region, but each one is cut and
# padded to the primer window by cut_and_pad_sequence_to_primer_region.
#
# A parent with a BAM gets its consensus (with ambiguities); a parent without
# one gets the gene model sequence with the original allele at the SNP
# position, or the SNP allele for the parent named by snp_in. The SNP
# position is upper-cased in every sequence.
#
# The result is cached in @surrounding_parental_sequences.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences

  gene_region = covered_region
  snp_index = local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    carries_snp = (name == snp_in)
    seq = if bam
            bam.consensus_with_ambiguities({:region=>gene_region}).to_s
          else
            model = container.gene_model_sequence(gene_region)
            model[snp_index] = carries_snp ? snp : original
            model
          end
    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = snp if carries_snp
    @surrounding_parental_sequences[name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
|
531
|
+
|
532
|
+
# Returns the part of `sequence` that lies within flanking_size bases of the
# SNP's local position, clipped to the sequence (no padding is added).
def cut_sequence_to_primer_region(sequence)
  centre = local_position
  first = [centre - flanking_size, 0].max
  last = centre + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
|
540
|
+
|
541
|
+
# Cuts `sequence` to the window of flanking_size bases on each side of the
# SNP's local position and pads with '-' wherever the window runs past either
# end, so the result is always 2 * flanking_size + 1 characters long.
#
# The right-hand bound is compared against the last valid index
# (sequence.size - 1). Comparing against sequence.size, as this method used
# to, left the result one '-' short whenever the window overran the right
# end, so right-truncated windows were shorter than left-truncated ones.
def cut_and_pad_sequence_to_primer_region(sequence)
  first = local_position - flanking_size
  last = local_position + flanking_size
  left_pad = 0
  right_pad = 0

  if first < 0
    left_pad = -first
    first = 0
  end

  last_index = sequence.size - 1
  if last > last_index
    right_pad = last - last_index
    last = last_index
  end

  ("-" * left_pad) << sequence[first..last] << ("-" * right_pad)
end
|
557
|
+
|
558
|
+
# Parental and exon sequences around the SNP merged into one collection;
# computed once and cached in @sequences_to_align.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
|
562
|
+
|
563
|
+
# Aligns sequences_to_align with the "mafft" executable
# (--maxiterate 1000 --localpair --quiet) and returns the alignment.
#
# The alignment is cached in @aligned_sequences. When there is nothing to
# align a new empty Hash is returned and nothing is cached.
def aligned_sequences
  return @aligned_sequences if @aligned_sequences
  return Hash.new if sequences_to_align.size == 0

  mafft_options = ['--maxiterate', '1000', '--localpair', '--quiet']
  aligner = Bio::MAFFT.new("mafft", mafft_options)
  @aligned_sequences = aligner.query_align(sequences_to_align).alignment
end
|
577
|
+
|
578
|
+
# FASTA dump of the alignment: one record per aligned sequence (header is the
# marker's to_s, a dash and the sequence name), then a ">MASK chromosome"
# record, then the FASTA rendering of the primer region for
# (chromosome, snp_in).
def aligned_sequences_fasta
  fasta = ""
  aligned_sequences.each_pair do |name, seq|
    fasta << ">#{self}-#{name}\n#{seq}\n"
  end
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in).to_fasta
  fasta
end
|
590
|
+
|
591
|
+
|
592
|
+
# Index of the SNP inside the sequence once it has been cut to the primer
# window: the local position minus the window start, where the window start
# is local_position - flanking_size clamped to 0.
#
# The previously computed right bound and left padding never influenced the
# result and are no longer calculated.
def get_snp_position_after_trim
  snp_index = local_position
  window_start = [snp_index - flanking_size, 0].max
  snp_index - window_start
end
|
603
|
+
|
604
|
+
# Column of the SNP in the aligned parental sequences, or -1 when it cannot
# be located. The result is cached in @aligned_snp_position.
#
# The column is found by walking the first parental alignment and counting
# non-gap characters until the trimmed SNP index is reached. A warning is
# written to standard error when there are not exactly two parental
# sequences, and when both parents show the same base at the SNP column.
#
# The same-base check is skipped when no second parental sequence exists;
# it used to raise NoMethodError on nil right after the size warning.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  parental_strings = parental_sequences.keys.map { |par| aligned_sequences[par] }
  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  reference, other = parental_strings
  pos = -1
  remaining = get_snp_position_after_trim
  reference.each_char.with_index do |base, column|
    next if base == "-"

    if remaining == 0
      pos = column
      if other && base == other[column]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{column})! \n#{reference}\n#{other}"
      end
      # Later columns can never match again, so stop scanning.
      break
    end
    remaining -= 1
  end

  @aligned_snp_position = pos
end
|
630
|
+
|
631
|
+
# Picks the name to use as the target sequence.
#
# names      - sequence names; only those made of exactly three
#              "_"-separated fields take part, the third field being read
#              as a numeric score with +to_f+.
# chromosome - value returned when no name has a score greater than 0.
#
# Returns the first name holding the strictly highest positive score, or
# +chromosome+ when there is none.
def get_target_sequence(names, chromosome)
  winner = chromosome
  top_score = 0
  names.each do |candidate|
    fields = candidate.split("_")
    next unless fields.length == 3

    candidate_score = fields[2].to_f
    next unless candidate_score > top_score

    top_score = candidate_score
    winner = candidate
  end
  winner
end
|
647
|
+
|
648
|
+
|
649
|
+
|
650
|
+
# Builds the mask line for the multiple alignment.
#
# The reference row is the non-parental sequence chosen by
# +get_target_sequence+; when that name is not in the alignment an all-gap
# row of the alignment width is used instead.
#
# Per alignment column the mask holds:
#   "-"        no other sequence differs from the reference, only one
#              sequence covers the column, or a covering sequence has N/n
#   "*"        no non-parental sequence covers the column
#   lowercase  the reference base, when none of the rules above or below apply
#   uppercase  the reference differs from (names.size - 1) sequences and the
#              number of distinct second name characters, among covering
#              sequences whose first character equals +chromosome_group+,
#              equals +genomes_count+
#   "&"        the column returned by +aligned_snp_position+
#   ":"        that same column when Bio::NucleicAcid.is_valid accepts both
#              +original+ and +snp+ for the code built from the column's bases
#
# chromosome - fallback target name handed to +get_target_sequence+.
#
# Returns the mask String.
#
# TODO: Make this chromosome specific, even when we have more than one
# alignment going to the region we want.
def mask_aligned_chromosomal_snp(chromosome)
  alignment = aligned_sequences
  names = alignment.keys - parental_sequences.keys
  snp_column = aligned_snp_position
  best_target = get_target_sequence(names, chromosome)

  reference = alignment[best_target]
  masked_snps = reference ? reference.downcase : "-" * alignment.values[0].size
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    n_count = 0
    seen = [] # second name characters already counted for chromosome_group
    names.each do |chr|
      seq = alignment[chr]
      next unless seq

      base = seq[i]
      next if base == "-"

      cov += 1
      n_count += 1 if base == "N" || base == "n" # maybe use ambiguity codes instead
      seen << chr[1] if chr[0] == chromosome_group && !seen.include?(chr[1])
      different += 1 if chr != best_target && masked_snps[i].upcase != base.upcase
    end

    masked_snps[i] = "-" if different.zero? || cov == 1 || n_count > 0
    masked_snps[i] = "*" if cov.zero?
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && seen.size == genomes_count

    next unless i == snp_column

    masked_snps[i] = "&"
    bases = ""
    names.each do |chr|
      seq = alignment[chr]
      bases << seq[i] if seq && seq[i] != "-"
    end
    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ":"
    end
  end
  masked_snps
end
|
723
|
+
|
724
|
+
|
725
|
+
# Builds a mask over the sequences returned by +surrounding_exon_sequences+.
#
# The reference row is the sequence stored under +chromosome+ when such a key
# exists; otherwise it is the key chosen by +get_target_sequence+. (The
# original code called +get_target_sequence+ with a single argument, which
# raised ArgumentError on every call, and ignored its result.) When neither
# resolves to a sequence, a row of "-" of length flanking_size * 2 is used.
#
# Per column the mask holds:
#   "+"        the reference has a gap there
#   "-"        no covering sequence differs from the reference, or fewer
#              than two sequences cover the column ("-", "N" and "n" do not
#              count as coverage)
#   lowercase/unchanged base  exactly one other sequence differs
#   uppercase  more than one other sequence differs
#   "&"        the column at index +flanking_size+
#
# chromosome - name of the sequence to mask against.
#
# Returns the mask String.
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys
  target = chromosomes[chromosome] ? chromosome : get_target_sequence(names, chromosome)

  reference = chromosomes[target]
  masked_snps = reference ? reference.tr("-", "+") : "-" * (flanking_size * 2)
  snp_column = flanking_size

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    names.each do |chr|
      base = chromosomes[chr][i]
      next if base == "-" || base == "N" || base == "n"

      cov += 1
      next if chr == target || masked_snps[i] == "+"

      different += 1 if masked_snps[i] != base
    end

    masked_snps[i] = "-" if different.zero? && masked_snps[i] != "+"
    masked_snps[i] = "-" if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = "&" if i == snp_column
  end
  masked_snps
end
|
756
|
+
|
757
|
+
# Sequences flanking the SNP, one per exon whose SNP is not in a gap
# (memoized in @surrounding_exon_sequences).
#
# For every exon in +exon_list+ the flanking region is obtained from
# +exon.target_flanking_region_from_position(position, flanking_size)+ and
# its sequence from +container.chromosome_sequence+. Entries are keyed
# "<chromosome>_<flanking_region.start>_<exon.record.score>".
#
# The result is built in a local and cached only once complete, so an
# exception raised while fetching a sequence no longer leaves a partially
# filled cache behind for later calls.
#
# Returns a Bio::Alignment::SequenceHash.
#
# TODO: Pad when the exon goes over the regions.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      flanking_region = exon.target_flanking_region_from_position(position, flanking_size)
      # Ignoring when the exon is in a gap
      next if exon.snp_in_gap

      exon_seq = container.chromosome_sequence(flanking_region)
      sequences["#{chromosome}_#{flanking_region.start}_#{exon.record.score}"] = exon_seq
    end
  end
  @surrounding_exon_sequences = sequences
end
|
776
|
+
|
777
|
+
|
778
|
+
# Exon sequences laid out on the coordinates of +covered_region+ (memoized in
# @exon_sequences).
#
# For each exon the sequence of +exon.target_region+ is left-padded with "-"
# up to the exon's offset inside the covered region. Exons whose SNP is not
# in a gap then get the character at +local_position+ upcased, are
# right-padded with "-", and are stored under
# "<chromosome>_<exon.query_region.start>_<exon.record.score>".
# If nothing ends up stored under @chromosome, an all-gap row is added there.
#
# The result is built in a local and cached only once complete, so an
# exception raised midway no longer leaves a partially filled cache behind
# for later calls.
#
# NOTE(review): padded rows end up gene_region.size + 1 long while the
# fallback row is gene_region.size long; whether that is intended depends on
# how the region's +size+ is defined - confirm.
#
# Returns a Bio::Alignment::SequenceHash.
def exon_sequences
  return @exon_sequences if @exon_sequences

  gene_region = covered_region
  snp_index = local_position
  sequences = Bio::Alignment::SequenceHash.new
  exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      exon_start_offset = exon.query_region.start - gene_region.start
      exon_seq = "-" * exon_start_offset
      exon_seq << container.chromosome_sequence(exon.target_region).to_s
      next if exon.snp_in_gap

      exon_seq[snp_index] = exon_seq[snp_index].upcase
      exon_seq << "-" * (gene_region.size - exon_seq.size + 1)
      sequences["#{chromosome}_#{exon.query_region.start}_#{exon.record.score}"] = exon_seq
    end
  end
  sequences[@chromosome] ||= "-" * gene_region.size
  @exon_sequences = sequences
end
|
803
|
+
end
|
804
|
+
end
|