bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,804 @@
|
|
1
|
+
require 'bio'
|
2
|
+
module Bio::PolyploidTools
|
3
|
+
# Error type for failures that are specific to SNP handling.
class SNPException < RuntimeError; end
|
5
|
+
|
6
|
+
class SNP
|
7
|
+
#GENE,ORIGINAL,POS,SNP
|
8
|
+
attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name
|
9
|
+
attr_accessor :contig
|
10
|
+
attr_accessor :exon_list
|
11
|
+
attr_accessor :container
|
12
|
+
attr_accessor :flanking_size, :ideal_min, :ideal_max
|
13
|
+
attr_accessor :template_sequence
|
14
|
+
attr_accessor :use_reference
|
15
|
+
attr_accessor :genomes_count
|
16
|
+
attr_accessor :primer_3_min_seq_length
|
17
|
+
attr_accessor :chromosome
|
18
|
+
attr_accessor :variation_free_region
|
19
|
+
attr_accessor :max_hits
|
20
|
+
attr_accessor :errors
|
21
|
+
attr_accessor :repetitive
|
22
|
+
attr_accessor :hit_count
|
23
|
+
attr_accessor :snp_type
|
24
|
+
attr_accessor :orientation
|
25
|
+
|
26
|
+
#Format:
|
27
|
+
#Gene_name,Original,SNP_Pos,SNP,chromosome
|
28
|
+
#A_comp0_c0_seq1,C,519,A,2A
|
29
|
+
# Builds a SNP from one comma-separated marker line.
#
# The line holds five fields, in this order: gene name, original base,
# position, SNP base and chromosome (e.g. "A_comp0_c0_seq1,C,519,A,2A").
# The bases are upper-cased, and every field except the gene name is
# stripped of surrounding whitespace. The position is converted with to_i.
#
# The argument is no longer modified in place, so frozen strings are
# accepted. A line with fewer than five fields still raises NoMethodError.
#
# Returns the new SNP with use_reference set to false.
def self.parse(reg_str)
  gene, original, position, alternative, chromosome = reg_str.chomp.split(",")

  snp = SNP.new
  snp.gene          = gene
  snp.position      = position.strip.to_i
  snp.original      = original.upcase.strip
  snp.snp           = alternative.upcase.strip
  snp.chromosome    = chromosome.strip
  snp.use_reference = false
  snp
end
|
44
|
+
|
45
|
+
#Format:
|
46
|
+
#IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
|
47
|
+
# Builds a SNP from one tab-separated VCF record.
#
# Columns used: 0 = contig, 1 = position, 2 = id (stored as the gene),
# 3 = reference base, 4 = alternative base, 7 = INFO (optional).
# chr_arm_parser is called with the contig name and its result becomes the
# chromosome. The orientation is :reverse only when INFO contains
# "OR=reverse"; otherwise it is :forward.
def self.parseVCF(vcf_line, chr_arm_parser: Bio::PolyploidTools::ChromosomeArm.getArmSelection("first_two") )
  contig, pos, id, ref, alt, _qual, _filter, info = vcf_line.split("\t")

  marker = SNP.new
  marker.gene       = id
  marker.chromosome = chr_arm_parser.call(contig)
  marker.contig     = contig
  marker.position   = pos.strip.to_i
  marker.original   = ref.upcase.strip
  marker.snp        = alt.upcase.strip
  # Stripped in place, exactly as before, so the parser's return value is
  # the object that ends up trimmed.
  marker.chromosome.strip!

  reversed = false
  if info
    reversed = info.scan(/(\w+)=([\w|.]+)/).any? { |key, value| key == "OR" && value == "reverse" }
  end
  marker.orientation = reversed ? :reverse : :forward
  marker
end
|
74
|
+
|
75
|
+
# Loads the template sequence around the SNP from an indexed FASTA file.
#
# The region fetched is [position - flanking_size, position + flanking_size + 1],
# clamped to 1 and to the entry length reported by the FASTA index. The
# entry is @contig when it is set, otherwise the gene name.
#
# Side effects: @position is rewritten to be relative to the fetched region
# (never below 1), template_sequence is replaced by the fetched sequence,
# and the base at the SNP is overwritten with the ambiguity code for
# original + snp.
def setTemplateFromFastaFile(fastaFile ,flanking_size: 100)
  window = Bio::DB::Fasta::Region.new
  window.entry = @contig ? @contig : gene
  window.start = position - flanking_size
  window.end = position + flanking_size + 1
  window.orientation = :forward

  indexed = fastaFile.index.region_for_entry(window.entry)
  window.start = 1 if window.start < 1
  window.end = indexed.length if window.end > indexed.length

  ambiguity = Bio::NucleicAcid.to_IUAPC("#{original}#{snp}")
  @position = [@position - window.start + 1, 1].max

  self.template_sequence = fastaFile.fetch_sequence(window)
  template_sequence[position - 1] = ambiguity
end
|
93
|
+
|
94
|
+
# Sets the defaults for a freshly created SNP.
def initialize
  # State that is filled in while the marker is processed.
  @errors = Array.new
  @exon_list = Hash.new { |store, arm| store[arm] = [] }
  @hit_count = 0
  @repetitive = false
  @contig = false

  # Default limits and settings.
  @genomes_count = 3
  @max_hits = 8
  @primer_3_min_seq_length = 50
  @variation_free_region = 0
end
|
105
|
+
|
106
|
+
# Returns [start, total, new_position] for a window around the SNP.
#
# start is position - flanking_size + 1, floored at 0. total is the given
# total (or twice flanking_size) plus one. new_position is
# position - start + 2.
def to_polymarker_coordinates(flanking_size, total:nil)
  first = [position - flanking_size + 1, 0].max
  span = (total || flanking_size * 2) + 1
  [first, span, position - first + 2]
end
|
114
|
+
|
115
|
+
# Returns the template with the SNP written as "[original/snp]", cut to a
# window and upper-cased.
#
# For a :reverse orientation the template and both alleles are
# reverse-complemented and the SNP index is mirrored. The window starts at
# position - flanking_size - 1 (floored at 0) and spans the given total
# (or twice flanking_size) plus 5 characters.
def to_polymarker_sequence(flanking_size, total:nil)
  sequence = template_sequence.clone
  marker = "[#{original}/#{snp}]"
  index = position-1

  if orientation == :reverse
    index = sequence.length - index - 1
    template_na = Bio::Sequence::NA.new(sequence)
    original_na = Bio::Sequence::NA.new(original)
    snp_na = Bio::Sequence::NA.new(snp)
    sequence = template_na.reverse_complement
    marker = "[#{original_na.reverse_complement}/#{snp_na.reverse_complement}]"
  end

  sequence[index] = marker
  offset = [position - flanking_size - 1, 0].max
  length = (total || flanking_size * 2) + 5
  sequence[offset , length ].upcase
end
|
136
|
+
|
137
|
+
# Identifier built by concatenating original, position and snp, e.g. "C519A".
def snp_id_in_seq
  format("%s%s%s", original, position, snp)
end
|
140
|
+
|
141
|
+
#We Only want the chromosome, we drop the arm.
|
142
|
+
#We don't use this any more.
|
143
|
+
#def chromosome= (chr)
|
144
|
+
# @chromosome = chr
|
145
|
+
#end
|
146
|
+
|
147
|
+
# Returns the first character of the chromosome name.
def chromosome_group
  name = chromosome
  name[0]
end
|
150
|
+
|
151
|
+
# Returns the element at index 3 of the chromosome name, or nil when it
# has none.
#
# NOTE(review): this method used to be defined twice in a row. The first
# definition returned chromosome[1] but was silently replaced by the second,
# so only the index-3 behaviour was ever in effect. That behaviour is kept;
# confirm that index 3 is the intended one.
def chromosome_genome
  chromosome[3] || nil
end
|
159
|
+
|
160
|
+
# Returns a FASTA record with the gene as header and the template
# sequence as body.
def to_fasta
  format(">%s\n%s\n", gene, template_sequence)
end
|
163
|
+
|
164
|
+
# Registers an exon hit for a chromosome arm.
#
# With filter_best (the default), an arm that already has a hit keeps only
# one entry: the new exon replaces the stored one when its record score is
# strictly higher. Otherwise the exon is appended.
def add_exon(exon, arm, filter_best: true)
  exon_list[arm] ||= Array.new
  stored = exon_list[arm]

  return stored << exon unless filter_best && stored.size > 0

  best_so_far = stored.first
  exon_list[arm] = [exon] if exon.record.score > best_so_far.record.score
end
|
173
|
+
|
174
|
+
# Returns the region of the gene that the marker covers.
#
# With use_reference set, the region is position +/- flanking_size (start
# floored at 1) and is NOT memoized. Otherwise the region starts from the
# SNP position (or from position +/- flanking_size when no exons are
# loaded), is widened to include the query region of every loaded exon,
# and is cached in @covered_region.
def covered_region
  return @covered_region if @covered_region

  if self.use_reference
    reference_window = Bio::DB::Fasta::Region.new()
    reference_window.entry = gene
    reference_window.orientation = :forward
    reference_window.start = self.position - self.flanking_size
    reference_window.end = self.position + self.flanking_size
    reference_window.start = 1 if reference_window.start < 1
    return reference_window
  end

  lower = @position
  upper = @position
  if @exon_list.size == 0
    lower = [self.position - self.flanking_size, 1].max
    upper = self.position + self.flanking_size
  end

  @exon_list.each do | _arm, exons |
    exons.each do | exon |
      hit = exon.query_region
      lower = hit.start if hit.start < lower
      upper = hit.end if hit.end > upper
    end
  end

  covered = Bio::DB::Fasta::Region.new()
  covered.entry = gene
  covered.orientation = :forward
  covered.start = lower
  covered.end = upper

  @covered_region = covered
end
|
213
|
+
|
214
|
+
# Returns flanking_size - local_position + 1.
def left_padding
  (flanking_size + 1) - local_position
end
|
219
|
+
|
220
|
+
# Returns what is left of a 2 * flanking_size window after the left padding
# and the covered region size are subtracted; never negative.
def right_padding
  used = left_padding + self.covered_region.size
  [(2*flanking_size) - used, 0].max
end
|
225
|
+
|
226
|
+
# Returns the SNP position relative to the start of the covered region.
def local_position
  region_start = self.covered_region.start
  self.position - region_start
end
|
230
|
+
|
231
|
+
# Returns pos shifted right by the left padding.
def padded_position(pos)
  shift = left_padding
  pos + shift
end
|
234
|
+
|
235
|
+
# Builds a FASTA dump for this marker.
#
# In order, the output contains: one record per surrounding parental
# sequence (header "<entry>-<position>_<name>"), one record per surrounding
# exon sequence, a "Mask" record from surrounding_masked_chromosomal_snps,
# and the FASTA of the primer region for this chromosome and snp_in.
#
# Compared with the previous version, the unused local_position lookup is
# gone and the block parameter no longer shadows the chromosome reader.
def primer_fasta_string
  gene_region = self.covered_region
  ret_str = String.new

  surrounding_parental_sequences.each do |name, seq|
    ret_str << ">#{gene_region.entry}-#{self.position}_#{name}\n"
    ret_str << "#{seq}\n"
  end

  self.surrounding_exon_sequences.each do |exon_name, exon_seq|
    ret_str << ">#{exon_name}\n#{exon_seq}\n"
  end

  mask = surrounding_masked_chromosomal_snps(chromosome)
  ret_str << ">Mask\n#{mask}\n"

  ret_str << primer_region(chromosome, snp_in ).to_fasta
  ret_str
end
|
256
|
+
|
257
|
+
# Builds the PrimerRegion for a target chromosome against a parental
# sequence, walking the alignment column by column.
#
# Columns where both the target and the parental sequence are gaps are
# skipped. For every other column the mask character decides what happens:
#   '&'       - records the SNP position, homoeologous = false
#   ':'       - records the SNP position, homoeologous = true
#   '-'       - copies the target base into the parental sequence unless
#               the parental base is already unambiguous
#   uppercase - candidate position: goes to crhomosome_specific_intron when
#               the parental has a gap there, or to chromosome_specific
#               when is_valid accepts the mask base
#   lowercase - weaker candidate: same rule, stored in the "almost" lists
# Positions are counted over the non-skipped columns. The resulting
# sequence is the edited parental sequence with gaps removed.
def primer_region(target_chromosome, parental )
  parental = aligned_sequences[parental].downcase
  target_chromosome = get_target_sequence(aligned_sequences.keys, target_chromosome)

  chromosome_seq = (aligned_sequences[target_chromosome] || "-" * parental.size).downcase
  mask = mask_aligned_chromosomal_snp(target_chromosome)

  region = PrimerRegion.new
  offset = 0
  parental.size.times do |col|
    next if chromosome_seq[col] == '-' && parental[col] == '-'

    case mask[col]
    when '&'
      # The marked SNP itself; the parental base is kept.
      region.snp_pos = offset
      region.homoeologous = false
    when ':'
      # The marked SNP, flagged as homoeologous.
      region.snp_pos = offset
      region.homoeologous = true
    when '-'
      # No variation flagged by the mask.
      parental[col] = chromosome_seq[col] unless Bio::NucleicAcid::is_unambiguous(parental[col])
    when /[[:upper:]]/
      # Candidate position; accepted only when the parental base is a gap
      # or is compatible with the mask base.
      if parental[col] == '-'
        parental[col] = mask[col]
        region.crhomosome_specific_intron << offset
      elsif Bio::NucleicAcid.is_valid(parental[col], mask[col])
        parental[col] = mask[col]
        region.chromosome_specific << offset
      end
    when /[[:lower:]]/
      # Weaker candidate that still adds some specificity.
      if parental[col] == '-'
        parental[col] = mask[col]
        region.almost_crhomosome_specific_intron << offset
      elsif Bio::NucleicAcid.is_valid(parental[col], mask[col])
        parental[col] = mask[col].upcase
        region.almost_chromosome_specific << offset
      end
    end

    offset += 1
  end

  region.sequence=parental.gsub('-','')
  region
end
|
314
|
+
|
315
|
+
# Returns the reverse complement of a nucleotide string as a new string.
# IUPAC ambiguity codes are complemented and letter case is preserved;
# characters outside the table (e.g. '-') are left unchanged.
def reverse_complement_string(sequenc_str)
  bases       = 'atgcrymkdhvbswnATGCRYMKDHVBSWN'
  complements = 'tacgyrkmhdbvswnTACGYRKMHDBVSWN'
  sequenc_str.tr(bases, complements).reverse
end
|
319
|
+
|
320
|
+
# Formats one or two Primer3 input records for a forced primer position.
#
# opts keys read: :name, :sequence, :left_pos, :right_pos, :extra,
# :extra_f, :extra_r.
#
# With :right_pos a single record is written. If left_pos > right_pos, both
# positions are mirrored, the template is reverse-complemented and the
# record is labelled "reverse". When @variation_free_region is positive,
# nil is returned if the bases after the right position are not all lower
# case.
# Without :right_pos two records are written: a forward one, then one on
# the reverse complement with the left position mirrored.
def return_primer_3_string(opts={})
  forced_left = opts[:left_pos]
  forced_right = opts[:right_pos]
  template = opts[:sequence]
  extra = opts[:extra]
  strand = "forward"

  if opts[:right_pos]
    if forced_left > forced_right
      forced_left = template.size - forced_left - 1
      forced_right = template.size - forced_right - 1
      template = reverse_complement_string(template)
      strand = "reverse"
    end
    if @variation_free_region > 0
      downstream = template[forced_right+1, @variation_free_region]
      return nil if downstream != downstream.downcase
    end
  end

  record = "SEQUENCE_ID=#{opts[:name]} #{strand} \n"
  record << "SEQUENCE_FORCE_LEFT_END=#{forced_left}\n" unless opts[:extra_f]
  record << "SEQUENCE_FORCE_RIGHT_END=#{forced_right}\n" if opts[:right_pos]
  record << extra if extra
  record << opts[:extra_f] if opts[:extra_f]
  record << "SEQUENCE_TEMPLATE=#{template}\n"
  record << "=\n"
  return record if opts[:right_pos]

  # No right primer was given, so the opposite orientation is written too.
  template = opts[:sequence]
  forced_left = template.size - forced_left - 1
  strand = "reverse"
  template = reverse_complement_string(template)
  record << "SEQUENCE_ID=#{opts[:name]} #{strand}\n"
  record << "SEQUENCE_FORCE_LEFT_END=#{forced_left}\n" unless opts[:extra_r]
  record << opts[:extra_r] if opts[:extra_r]
  record << "SEQUENCE_TEMPLATE=#{template}\n"
  record << extra if extra
  record << "=\n"
  record
end
|
372
|
+
|
373
|
+
|
374
|
+
# Returns the list of Primer3 input records for this marker, one pair
# (original allele, SNP allele) per candidate position plus a final
# non-specific pair.
#
# An empty list is returned, and a message is added to errors, when the
# primer region is shorter than primer_3_min_seq_length or when hit_count
# exceeds max_hits. In the latter case the marker is also flagged as
# repetitive. The flag used to be assigned to a throw-away local variable,
# so it was never actually set.
#
# Candidate positions are skipped altogether when there are more than
# max_specific_primers of them. Entries may be nil when
# return_primer_3_string rejects a candidate. Sets @snp_type as a side
# effect.
def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20 )
  pr = primer_region(target_chromosome, parental )
  primer_3_propertes = Array.new

  seq_original = String.new(pr.sequence)

  if seq_original.size < primer_3_min_seq_length
    errors << "The sequence (#{seq_original.size}) is shorter than #{primer_3_min_seq_length}"
    return primer_3_propertes
  end

  if self.hit_count > self.max_hits
    errors << "The marker maps to #{self.hit_count} positions (max_hits: #{self.max_hits}). "
    @repetitive = true
    return primer_3_propertes
  end

  seq_original[pr.snp_pos] = self.original
  seq_snp = String.new(pr.sequence)
  seq_snp[pr.snp_pos] = self.snp

  @snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  marker_id = "#{gene}:#{original}#{position}#{snp}"
  suffix = "#{@snp_type} #{chromosome}"

  # Appends the record for the original allele, then the one for the SNP
  # allele, both sharing the same forced positions.
  append_pair = lambda do |positions, original_label, snp_label|
    primer_3_propertes << return_primer_3_string(positions.merge(
      :name => "#{marker_id} #{original_name} #{original_label} #{suffix}",
      :sequence => seq_original))
    primer_3_propertes << return_primer_3_string(positions.merge(
      :name => "#{marker_id} #{snp_in} #{snp_label} #{suffix}",
      :sequence => seq_snp))
  end

  # [candidate positions, label for the original allele, label for the SNP allele]
  # NOTE(review): the SNP-allele label says "exon" even for the intron
  # lists. This is reproduced as-is because the names are emitted at
  # runtime; confirm whether it is intentional.
  candidate_groups = [
    [pr.chromosome_specific,               "chromosome_specific exon",       "chromosome_specific exon"],
    [pr.crhomosome_specific_intron,        "chromosome_specific intron",     "chromosome_specific exon"],
    [pr.almost_chromosome_specific,        "chromosome_semispecific exon",   "chromosome_semispecific exon"],
    [pr.almost_crhomosome_specific_intron, "chromosome_semispecific intron", "chromosome_semispecific exon"]
  ]

  total_candidates = candidate_groups.sum { |positions, _, _| positions.size }
  skip_specific = total_candidates > max_specific_primers

  unless skip_specific
    candidate_groups.each do |positions, original_label, snp_label|
      positions.each do |pos|
        append_pair.call({ :left_pos => pr.snp_pos, :right_pos => pos }, original_label, snp_label)
      end
    end
  end

  append_pair.call({ :left_pos => pr.snp_pos }, "chromosome_nonspecific all", "chromosome_nonspecific all")
  primer_3_propertes
end
|
456
|
+
|
457
|
+
def to_s
|
458
|
+
"#{gene}:#{original}#{position}#{snp}#{chromosome}"
|
459
|
+
end
|
460
|
+
|
461
|
+
# Returns "<original><position><snp>" in upper case.
def short_s
  format("%s%s%s", original, position, snp).upcase
end
|
464
|
+
|
465
|
+
# Returns every Primer3 record from primer_3_all_strings concatenated into
# a single string.
def primer_3_string(target_chromosome, parental, max_specific_primers: 20)
  primer_3_all_strings(target_chromosome, parental, max_specific_primers: max_specific_primers).join
end
|
469
|
+
|
470
|
+
# Returns the exon_list entry for the chromosome, printing a notice to
# standard output when there is none.
def exon_for_chromosome (chromosome)
  exon_list[chromosome].tap do |found|
    puts "No exon with chromosome #{chromosome} for #{gene}" unless found
  end
end
|
475
|
+
|
476
|
+
# Returns (and caches) one sequence per parent over the covered region.
#
# A parent with a BAM gets the consensus with ambiguities. A parent
# without one gets the gene model sequence, with the original allele
# written at the SNP position unless it is the snp_in parent. In every
# case the base at the SNP position is upper-cased, and the snp_in parent
# then receives the SNP allele there.
def parental_sequences
  return @parental_sequences if @parental_sequences
  region = self.covered_region
  snp_index = self.local_position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    seq =
      if bam
        bam.consensus_with_ambiguities({:region=>region}).to_s
      else
        model = container.gene_model_sequence(region)
        model[snp_index] = self.original unless name == self.snp_in
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = self.snp if name == self.snp_in
    @parental_sequences [name] = seq
  end
  @parental_sequences
end
|
499
|
+
|
500
|
+
|
501
|
+
|
502
|
+
|
503
|
+
# Returns (and caches) one sequence per parent, cut and padded to the
# primer region around the SNP.
#
# A parent with a BAM gets the consensus with ambiguities. A parent
# without one gets the gene model sequence, with the SNP allele (snp_in
# parent) or the original allele (any other parent) written at the SNP
# position. In every case the base at the SNP position is upper-cased, and
# the snp_in parent then receives the SNP allele there.
def surrounding_parental_sequences
  return @surrounding_parental_sequences if @surrounding_parental_sequences
  region = self.covered_region
  snp_index = self.local_position

  @surrounding_parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, bam|
    seq =
      if bam
        bam.consensus_with_ambiguities({:region=>region}).to_s
      else
        model = container.gene_model_sequence(region)
        model[snp_index] = (name == self.snp_in) ? self.snp : self.original
        model
      end

    seq[snp_index] = seq[snp_index].upcase
    seq[snp_index] = self.snp if name == self.snp_in
    @surrounding_parental_sequences [name] = cut_and_pad_sequence_to_primer_region(seq)
  end
  @surrounding_parental_sequences
end
|
531
|
+
|
532
|
+
# Returns the part of sequence within flanking_size of the local SNP
# position, clipped to the bounds of the sequence (no padding).
def cut_sequence_to_primer_region(sequence)
  centre = self.local_position
  first = [centre - flanking_size, 0].max
  last = centre + flanking_size
  last = sequence.size - 1 if last > sequence.size
  sequence[first..last]
end
|
540
|
+
|
541
|
+
# Returns the window of sequence within flanking_size of the local SNP
# position, padded with '-' so the result is always 2 * flanking_size + 1
# characters long with the SNP at index flanking_size.
#
# The right-hand padding used to be measured against sequence.size instead
# of the last valid index (sequence.size - 1), which left windows that ran
# past the end of the sequence one character short. It is now measured
# against the last index.
def cut_and_pad_sequence_to_primer_region(sequence)
  centre = self.local_position
  ideal_min = centre - flanking_size
  ideal_max = centre + flanking_size
  last_index = sequence.size - 1
  left_pad = 0
  right_pad = 0

  if ideal_min < 0
    left_pad = -ideal_min
    ideal_min = 0
  end
  if ideal_max > last_index
    right_pad = ideal_max - last_index
    ideal_max = last_index
  end

  ("-" * left_pad) << sequence[ideal_min..ideal_max] << ("-" * right_pad)
end
|
557
|
+
|
558
|
+
# Returns (and caches) the surrounding parental sequences merged with the
# surrounding exon sequences.
def sequences_to_align
  @sequences_to_align ||= surrounding_parental_sequences.merge(surrounding_exon_sequences)
end
|
562
|
+
|
563
|
+
# Returns (and caches) the MAFFT alignment of sequences_to_align.
# When there is nothing to align, a fresh empty Hash is returned and
# nothing is cached.
def aligned_sequences
  return @aligned_sequences if @aligned_sequences
  return Hash.new if sequences_to_align.size == 0

  aligner = Bio::MAFFT.new( "mafft" , ['--maxiterate', '1000', '--localpair', '--quiet'])
  @aligned_sequences = aligner.query_align(sequences_to_align).alignment
end
|
577
|
+
|
578
|
+
# Returns a FASTA dump of the alignment: one record per aligned sequence
# (header "<marker>-<name>"), a "MASK <chromosome>" record, and the FASTA
# of the primer region for this chromosome and snp_in.
def aligned_sequences_fasta
  fasta = ""
  label = self.to_s
  aligned_sequences.each_pair { |name, seq| fasta << ">#{label}-#{name}\n#{seq}\n" }
  fasta << ">MASK #{chromosome}\n#{mask_aligned_chromosomal_snp(chromosome)}\n"
  fasta << primer_region(chromosome, snp_in ).to_fasta
  fasta
end
|
590
|
+
|
591
|
+
|
592
|
+
# Returns the index of the SNP after the sequence has been cut to
# flanking_size bases on its left: local_position minus the start of the
# cut, where that start is floored at 0.
def get_snp_position_after_trim
  centre = self.local_position
  cut_start = [centre - flanking_size, 0].max
  centre - cut_start
end
|
603
|
+
|
604
|
+
# Returns (and caches) the alignment column that holds the SNP, or -1 when
# it cannot be located.
#
# The column is found by walking the first parental sequence in the
# alignment and counting non-gap characters until the trimmed SNP position
# is reached. Warnings go to standard error when there are not exactly two
# parental sequences, or when both parentals carry the same base at the
# column found.
def aligned_snp_position
  return @aligned_snp_position if @aligned_snp_position

  column = -1
  parental_strings = parental_sequences.keys.map { |par| aligned_sequences[par] }
  $stderr.puts "WARN: #{self.to_s} #{parental_sequences.keys} is not of size 2 (#{parental_strings.size})" if parental_strings.size != 2

  remaining = get_snp_position_after_trim
  i = 0
  while i < parental_strings[0].size do
    is_base = parental_strings[0][i] != "-"
    if remaining == 0 && is_base
      column = i
      if parental_strings[0][i] == parental_strings[1][i]
        $stderr.puts "WARN: #{self.to_s} doesn't have a SNP in the marked place (#{i})! \n#{parental_strings[0]}\n#{parental_strings[1]}"
      end
    end
    # Only real bases count towards the SNP offset.
    remaining -= 1 if is_base
    i += 1
  end

  @aligned_snp_position = column
  return column
end
|
630
|
+
|
631
|
+
# Picks the best-scoring sequence name.
#
# Only names made of exactly three "_"-separated fields are considered; the
# third field is read as a Float score. The first name with the strictly
# highest positive score wins.
#
# names      - Enumerable of String names.
# chromosome - value returned when no name qualifies.
#
# Returns the winning name, or +chromosome+.
def get_target_sequence(names, chromosome)
  top_score = 0
  names.reduce(chromosome) do |winner, candidate|
    fields = candidate.split("_")
    next winner unless fields.length == 3

    candidate_score = fields[2].to_f
    next winner unless candidate_score > top_score

    top_score = candidate_score
    candidate
  end
end
|
647
|
+
|
648
|
+
|
649
|
+
|
650
|
+
# Builds a per-column mask string for the alignment, relative to the best
# non-parental target sequence (see +get_target_sequence+).
#
# The mask starts as the lower-cased target alignment (or all "-" when the
# target has no aligned sequence) and each column is then rewritten, in this
# order (later rules override earlier ones):
#
#   "-"  no other covering sequence differs from the target
#   "-"  exactly one non-parental sequence covers the column
#   "-"  any covering sequence has N/n in the column
#   "*"  no non-parental sequence covers the column
#   upper case  the number of differing sequences equals names.size - 1 and
#               the number of distinct chr[1] values among covering names
#               whose chr[0] equals +chromosome_group+ equals +genomes_count+
#   "&"  the column is +aligned_snp_position+
#   ":"  SNP column where the code built by Bio::NucleicAcid.to_IUAPC from
#        the non-parental bases passes Bio::NucleicAcid.is_valid for both
#        +original+ and +snp+ (project helpers; exact semantics live there)
#
# chromosome - fallback target name passed to +get_target_sequence+.
#
# Returns the mask String (same length as the target alignment).
def mask_aligned_chromosomal_snp(chromosome)
  names = aligned_sequences.keys - parental_sequences.keys
  snp_column = aligned_snp_position

  best_target = get_target_sequence(names, chromosome)
  target_alignment = aligned_sequences[best_target]
  masked_snps = if target_alignment
                  target_alignment.downcase
                else
                  "-" * aligned_sequences.values[0].size
                end
  # TODO: Make this chromosome specific, even when we have more than one
  # alignment going to the region we want.
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    n_count = 0
    genomes_seen = []

    names.each do |chr|
      sequence = aligned_sequences[chr]
      next unless sequence

      base = sequence[i]
      next if base == "-"

      cov += 1
      # Maybe fix this to use ambiguity codes instead.
      n_count += 1 if base == "N" || base == "n"
      genomes_seen << chr[1] if chr[0] == chromosome_group && !genomes_seen.include?(chr[1])
      different += 1 if chr != best_target && masked_snps[i].upcase != base.upcase
    end

    masked_snps[i] = "-" if different == 0
    masked_snps[i] = "-" if cov == 1
    masked_snps[i] = "-" if n_count > 0
    masked_snps[i] = "*" if cov == 0
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps && genomes_seen.size == genomes_count

    next unless i == snp_column

    masked_snps[i] = "&"
    bases = ""
    names.each do |chr|
      sequence = aligned_sequences[chr]
      bases << sequence[i] if sequence && sequence[i] != "-"
    end

    code_reference = bases == "" ? "n" : Bio::NucleicAcid.to_IUAPC(bases)
    if Bio::NucleicAcid.is_valid(code_reference, original) && Bio::NucleicAcid.is_valid(code_reference, snp)
      masked_snps[i] = ":"
    end
  end

  masked_snps
end
|
723
|
+
|
724
|
+
|
725
|
+
# Builds a per-position mask from +surrounding_exon_sequences+, relative to
# the sequence stored under +chromosome+.
#
# The mask starts as that sequence with "-" replaced by "+", or as
# flanking_size * 2 dashes when there is no sequence under +chromosome+.
# Each position is then rewritten, in this order:
#
#   "-"  no covering sequence differs from the reference (and it is not "+")
#   "-"  fewer than two sequences cover the position ("-", "N", "n" do not
#        count as coverage)
#   upper case  more than one sequence differs from the reference
#   "&"  the position equals +flanking_size+
#
# The original implementation began with a one-argument call to
# +get_target_sequence+, which takes two arguments, so it could only raise
# ArgumentError; its result was never used, so the call has been removed.
#
# chromosome - key into +surrounding_exon_sequences+ used as the reference.
#
# Returns the mask String.
def surrounding_masked_chromosomal_snps(chromosome)
  chromosomes = surrounding_exon_sequences
  names = chromosomes.keys

  reference = chromosomes[chromosome]
  masked_snps = reference ? reference.tr("-", "+") : "-" * (flanking_size * 2)
  local_pos_in_gene = flanking_size

  masked_snps.size.times do |i|
    different = 0
    cov = 0

    names.each do |chr|
      base = chromosomes[chr][i]
      next if base == "-" || base == "N" || base == "n"

      cov += 1
      different += 1 if chr != chromosome && masked_snps[i] != "+" && masked_snps[i] != base
    end

    masked_snps[i] = "-" if different == 0 && masked_snps[i] != "+"
    masked_snps[i] = "-" if cov < 2
    masked_snps[i] = masked_snps[i].upcase if different > 1
    masked_snps[i] = "&" if i == local_pos_in_gene
  end

  masked_snps
end
|
756
|
+
|
757
|
+
# Sequences flanking the SNP for every exon hit (memoized).
#
# For each exon in +exon_list+ the flanking region is obtained with
# +target_flanking_region_from_position(position, flanking_size)+ and, unless
# the exon reports +snp_in_gap+, its sequence is fetched from +container+ and
# stored under "<chromosome>_<flanking region start>_<record score>".
#
# Returns a Bio::Alignment::SequenceHash.
def surrounding_exon_sequences
  return @surrounding_exon_sequences if @surrounding_exon_sequences

  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
  self.exon_list.each do |chromosome, exon_arr|
    exon_arr.each do |exon|
      flanking_region = exon.target_flanking_region_from_position(position, flanking_size)
      # TODO: Pad when the exon goes over the regions...
      # Exons whose SNP falls in a gap are ignored.
      next if exon.snp_in_gap

      exon_seq = container.chromosome_sequence(flanking_region)
      @surrounding_exon_sequences["#{chromosome}_#{flanking_region.start}_#{exon.record.score}"] = exon_seq
    end
  end
  @surrounding_exon_sequences
end
|
776
|
+
|
777
|
+
|
778
|
+
# Exon sequences laid out in gene coordinates (memoized).
#
# Each exon sequence is left-padded with "-" by the distance between the
# exon's query start and the start of +covered_region+. Unless the exon
# reports +snp_in_gap+, the character at +local_position+ is upper-cased, the
# string is right-padded with "-" and stored under
# "<chromosome>_<query start>_<record score>". If nothing was stored under
# @chromosome, an all-gap string of the region size is added for it.
#
# Returns a Bio::Alignment::SequenceHash.
def exon_sequences
  return @exon_sequences if @exon_sequences

  span = self.covered_region
  snp_index = self.local_position
  @exon_sequences = Bio::Alignment::SequenceHash.new

  self.exon_list.each do |chromosome, exons|
    exons.each do |exon|
      leading_gap = exon.query_region.start - span.start
      padded = "-" * leading_gap
      padded << container.chromosome_sequence(exon.target_region).to_s
      next if exon.snp_in_gap

      padded[snp_index] = padded[snp_index].upcase
      trailing_gap = span.size - padded.size + 1
      padded << "-" * trailing_gap
      @exon_sequences["#{chromosome}_#{exon.query_region.start}_#{exon.record.score}"] = padded
    end
  end

  @exon_sequences[@chromosome] ||= "-" * span.size
  @exon_sequences
end
|
803
|
+
end
|
804
|
+
end
|