full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,78 +1,116 @@
|
|
1
1
|
|
2
2
|
module CommonFunctions
|
3
3
|
|
4
|
-
def contenidos_en_prot(
|
5
|
-
|
6
|
-
|
7
|
-
q_index_start =
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
fr_index_start = full_prot.index(simliar_fragment)
|
27
|
-
|
28
|
-
if (q_index_start == 9999)
|
29
|
-
q_index_start = fr_index_start
|
4
|
+
# Locates where the aligned hit sequence (key_seq) starts inside the
# translated protein (full_prot).
#
# Gaps ('-') and masked residues ('X'/'x') are all normalised to '-' in both
# sequences before comparing. The strategies below are tried in order and the
# first one that yields a position wins; when every strategy fails the
# method falls back to 0.
#
# Returns the 0-based start index in full_prot.
def contenidos_en_prot(key_seq, full_prot)
  masked_full = full_prot.gsub(/[\-Xx]/,'-')
  masked_key = key_seq.gsub(/[\-Xx]/,'-')

  # 1) Exact match between the hit sequence and the protein.
  exact_start = masked_full.index(masked_key)
  return exact_start unless exact_start.nil?

  # 2) The hit sequence carries gaps that prevent the exact match.
  start = match_with_ungapped_reference(masked_full, masked_key)

  if start.nil? && masked_full.include?('-')
    key_length = masked_key.length
    offset = masked_full.length - key_length

    # 3) Position-by-position comparison anchored at the beginning ...
    start = 0 if scan_sequences(masked_full.split(''), masked_key.split('')) == key_length

    # ... and anchored at the end (this result overrides the previous one).
    if offset > 0 && scan_sequences(masked_full.split(''), masked_key.split(''), offset) == key_length
      start = offset
    end

    # 4) Last resort: use the ungapped fragments of the protein as seeds.
    start = match_with_gapped_reference(masked_full, masked_key) if start.nil?
  end

  start.nil? ? 0 : start
end
|
59
31
|
|
60
32
|
|
33
|
+
# Looks for the start of compare_prot inside full_prot when full_prot
# contains gaps ('-').
#
# full_prot is cut at its gap runs and every fragment longer than 4 residues
# is searched in compare_prot, first complete and then reduced to its first
# 5 residues. The first fragment that is found fixes the position; when it is
# not the first fragment of the split, the match is extended towards the
# beginning with extend_match.
#
# Returns the index in full_prot, or nil when no fragment matches.
def match_with_gapped_reference(full_prot, compare_prot)
  full_prot.split(/\-+/).each_with_index do |fragment, fragment_number|
    next unless fragment.length > 4

    position_in_compare = compare_prot.index(fragment)
    if position_in_compare.nil? # the complete fragment does not match: retry with a short seed
      fragment = fragment[0..4]
      position_in_compare = compare_prot.index(fragment)
    end
    next if position_in_compare.nil?

    position_in_full = full_prot.index(fragment)
    if fragment_number > 0
      position_in_full, position_in_compare = extend_match(full_prot, compare_prot, position_in_full, position_in_compare)
    end
    return position_in_full
  end
  nil
end
|
61
54
|
|
55
|
+
# Extends a seed match towards the beginning of both sequences.
#
# full_prot          - String, reference protein.
# compare_prot       - String, protein being located in full_prot.
# q_index_start      - Integer, position of the seed in full_prot.
# compare_prot_index - Integer, position of the seed in compare_prot.
#
# The residues that precede the seed in each sequence are reversed and
# compared with scan_sequences; both positions are moved back by the number
# of compatible residues found.
#
# Returns [q_index_start, compare_prot_index] after the extension.
def extend_match(full_prot, compare_prot, q_index_start, compare_prot_index)
  # Use [0, length] instead of [0..index-1]: with index 0 the range form
  # becomes 0..-1 and selects the WHOLE string instead of an empty prefix,
  # which produced bogus extensions and negative positions.
  upstream_full = full_prot[0, q_index_start].reverse.split('')
  upstream_compare = compare_prot[0, compare_prot_index].reverse.split('')
  extension = scan_sequences(upstream_full, upstream_compare)
  return q_index_start - extension, compare_prot_index - extension
end
|
62
63
|
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
64
|
+
# Counts how many consecutive positions of ref_seq are compatible with
# compare_seq.
#
# ref_seq     - Array of one-character Strings (or a String) used as reference.
# compare_seq - Array of one-character Strings (or a String) to compare.
# diff        - Integer, number of leading positions of ref_seq to skip
#               before the comparison starts (default 0).
#
# Two positions are compatible when the characters are equal or when either
# of them is a gap ('-'). The scan stops at the first incompatible position
# or when compare_seq is exhausted.
#
# Returns the number of compatible positions found.
def scan_sequences(ref_seq, compare_seq, diff = 0)
  # Accept plain strings too, so callers are not forced to split them first.
  ref_chars = ref_seq.respond_to?(:chars) ? ref_seq.chars : ref_seq
  compare_chars = compare_seq.respond_to?(:chars) ? compare_seq.chars : compare_seq

  matched = 0
  ref_chars.each_with_index do |char, i|
    next if i < diff

    compare_char = compare_chars[matched]
    break if compare_char.nil? # compare_seq is exhausted

    compatible = char == compare_char || char == '-' || compare_char == '-'
    break unless compatible

    matched += 1
  end
  matched
end
|
69
77
|
|
70
|
-
|
78
|
+
# Looks for the start of compare_prot inside full_prot when compare_prot
# contains gaps ('-') that prevent a direct match.
#
# compare_prot is cut at its gap runs and every fragment longer than 4
# residues is searched in full_prot. The first fragment that is actually
# found fixes the position; when it is not the first fragment of the split,
# refine_match shifts the position back to where compare_prot would start.
#
# The previous implementation stopped at the first long fragment even when it
# was not present in full_prot, so later fragments were never tried.
#
# Returns the index in full_prot, or nil when no fragment is found.
def match_with_ungapped_reference(full_prot, compare_prot)
  compare_prot.split(/\-+/).each_with_index do |seq, i|
    next unless seq.length > 4

    q_index_start = full_prot.index(seq)
    next if q_index_start.nil? # this fragment is not in full_prot: try the next one

    # Correction needed when the matching fragment is not the first one.
    q_index_start = refine_match(seq, compare_prot, q_index_start) if i > 0
    return q_index_start
  end
  nil
end
|
71
92
|
|
72
|
-
|
73
|
-
|
93
|
+
# Moves a match position back to the point where the whole sequence would
# start.
#
# subseq        - String, fragment of seq that was located in the reference.
# seq           - String, complete (gapped) sequence the fragment comes from.
# q_index_start - Integer, position where subseq was found in the reference.
#
# The position is reduced by the offset of subseq inside seq, not counting
# the gap characters ('-') found up to that offset.
#
# Returns the corrected position; q_index_start is returned untouched when
# subseq is not part of seq (previously this raised because of a nil offset).
def refine_match(subseq, seq, q_index_start)
  location_seq = seq.index(subseq)
  return q_index_start if location_seq.nil?

  gaps_on_location = seq[0..location_seq].count('-')
  q_index_start - (location_seq - gaps_on_location)
end
|
75
99
|
|
100
|
+
# Turns a hit found on the reverse strand into a forward-strand hit.
#
# query_fasta - String with the nucleotide sequence of the query.
# hit         - hit object; its q_frame, q_beg, q_end and reversed attributes
#               are updated in place. For ExoBlastHit instances the positions
#               stored in q_frameshift are mirrored as well.
#
# Returns the sequence produced by String#complementary_dna (according to the
# original author's note, that call really yields the reverse complement).
def reverse_seq(query_fasta, hit)
  last_index = query_fasta.length - 1
  hit.q_frame = -hit.q_frame
  hit.q_end = last_index - hit.q_end
  hit.q_beg = last_index - hit.q_beg
  hit.reversed = true # the TRUE constant no longer exists in Ruby >= 3.2
  query_fasta = query_fasta.complementary_dna
  if hit.class.to_s == 'ExoBlastHit'
    hit.q_frameshift.map! do |position, num_nts|
      [query_fasta.length - 1 - position, num_nts]
    end
  end
  query_fasta
end
|
76
114
|
|
77
115
|
|
78
116
|
def corrige_frame(ref_frame,ref_start,ref_end)
|
@@ -89,6 +127,10 @@ module CommonFunctions
|
|
89
127
|
|
90
128
|
end
|
91
129
|
|
92
|
-
|
93
|
-
|
94
|
-
|
130
|
+
# Tells how far the aligned region of the query is from being a whole number
# of codons.
#
# hit - object responding to q_beg and q_end (inclusive nucleotide
#       coordinates of the alignment on the query).
#
# Returns the remainder (0, 1 or 2) of the aligned length divided by 3;
# 0 means that the region holds complete codons only.
def check_frame_shift(hit)
  (hit.q_end - hit.q_beg + 1) % 3
end
|
136
|
+
end
|
@@ -0,0 +1,258 @@
|
|
1
|
+
# Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# 'Software'), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
require 'blast_query'
|
23
|
+
require 'blast_hit'
|
24
|
+
require 'fl_string_utils.rb'
|
25
|
+
|
26
|
+
OPERATION = 0
|
27
|
+
QUERY = 1
|
28
|
+
TARGET = 2
|
29
|
+
|
30
|
+
# BlastHit built from an exonerate alignment. On top of the inherited
# attributes it carries two lists with the frameshifts found in the
# alignment: q_frameshift and s_frameshift, both empty on creation.
class ExoBlastHit < BlastHit
  attr_accessor :q_frameshift, :s_frameshift

  # The four coordinates are handed over to BlastHit unchanged.
  def initialize(start_target, ends_target, start_query, ends_query)
    super
    @q_frameshift = []
    @s_frameshift = []
  end
end
|
38
|
+
|
39
|
+
# Extracts the alignments reported by exonerate ("vulgar:" lines) and uses
# them to create instances of BlastQuery and ExoBlastHit, so that exonerate
# results can be handled like blast results.
#
# Note that the roles are swapped with respect to exonerate: the exonerate
# target (unigene) becomes the BlastQuery and the exonerate query (protein)
# becomes the subject of the hit.
class ExonerateResult
  # BlastQuery objects created by the parser.
  attr_accessor :querys

  # Parser initialization.
  #
  # input      - path of an exonerate output file, or Array of paths.
  # seqs       - collection indexed by target id with the nucleotide
  #              sequences (unigenes), or nil.
  # query_seqs - collection indexed by query id with the protein sequences,
  #              or nil.
  # all        - when true every vulgar line is kept; otherwise only the best
  #              scoring line of each target is kept.
  #
  # The aligned sequences of the hits are only rebuilt when both seqs and
  # query_seqs are given.
  def initialize(input, seqs= nil, query_seqs = nil, all = true)
    @querys = []
    @seqs = seqs # unigenes
    @prot_seqs = query_seqs # proteins

    files = input.is_a?(Array) ? input : [input]
    # File.readlines closes the file; File.open(...).readlines left it open.
    files.each { |file| parse_file(File.readlines(file), all) }
  end

  # Collects the vulgar lines found in lines and converts them into hits.
  #
  # lines - Array of Strings (lines of an exonerate output).
  # all   - see #initialize.
  def parse_file(lines, all)
    lines_parsed = all ? [] : {}
    lines.each do |line|
      next unless line =~ /^vulgar:/

      fields = line.chomp.split(' ', 11)
      features = {
        'query_id' => fields[1],
        'query_start_align' => fields[2].to_i,
        'query_end_align' => fields[3].to_i,
        'query_strand' => fields[4],
        'target_id' => fields[5],
        'target_start_align' => fields[6].to_i,
        'target_end_align' => fields[7].to_i,
        'target_strand' => fields[8],
        'score' => fields[9].to_i,
        'align_data' => fields[10]
      }
      if all
        lines_parsed << features
      else
        # Keep a single line per target: the first one seen, replaced only
        # by a later line with a higher score.
        best = lines_parsed[features['target_id']]
        lines_parsed[features['target_id']] = features if best.nil? || features['score'] > best['score']
      end
    end
    convert_parsed_lines(lines_parsed)
  end

  # Creates the BlastQuery/ExoBlastHit objects for the parsed lines.
  #
  # lines_parsed - Array of feature hashes, or Hash of target id => features
  #                (see #parse_file).
  #
  # A new BlastQuery is opened every time the target id differs from the one
  # of the previous line. A line that raises while being converted is
  # reported on standard output and skipped.
  def convert_parsed_lines(lines_parsed)
    last_query = nil
    query = nil
    from_array = lines_parsed.is_a?(Array)
    lines_parsed.each do |line|
      # Hash entries arrive as [target_id, features] pairs.
      features = from_array ? line : line[1]
      begin
        tags = features['align_data'].scan(/([MFG53S]) ([0-9]+) ([0-9]+)/)
        tags.map! { |tag| [tag[0], tag[1].to_i, tag[2].to_i] }
        if features['target_id'] != last_query
          last_query = features['target_id']
          query = BlastQuery.new(features['target_id'])
          @querys << query
        end
        hiting(features, tags, query)
      rescue
        puts "Result: #{features['target_id']} => #{features['query_id']} hasn't been parsed\n#{line}"
      end
    end
  end

  # Converts the relative coordinates of exonerate into absolute blast-like
  # coordinates, defining only the hit, and adds the hit to query.
  #
  # This method only works fine with the --model protein2dna parameter of
  # exonerate.
  #
  # features - Hash built by #parse_file for one vulgar line.
  # tags     - Array of [operation, query_length, target_length].
  # query    - BlastQuery that receives the new hit.
  def hiting(features, tags, query)
    do_align = !@prot_seqs.nil? && !@seqs.nil?
    start_target = features['target_start_align'] # unigene
    start_query = features['query_start_align'] # protein
    ends_target = features['target_end_align']
    ends_query = features['query_end_align']-1 # exonerate does not report the ends as 0-based positions
    if features['target_strand'] == '-'
      start_target -= 1 # start target is the end target when the match is on the reverse complementary strand
    else
      ends_target -= 1
    end
    hit = ExoBlastHit.new(start_target+1, ends_target+1, start_query+1, ends_query+1)
    define_hit_parameters(hit, features, tags)
    query.add_hit(hit)

    # Define alignment and blast like parameters
    target_alignment = ''
    query_alignment = ''
    counter_target = start_target
    counter_query = start_query
    query_seq = nil
    target_seq = nil
    if do_align # get seqs
      query_seq = @prot_seqs[features['query_id']]
      target_seq = @seqs[features['target_id']]
    end
    # NOTE(review): on the '-' strand the target sequence is needed even when
    # no alignment is rebuilt; without sequences the next call raises and the
    # line is reported as not parsed by convert_parsed_lines.
    counter_target, target_seq = do_reverso_complementary(counter_target, target_seq) if features['target_strand'] == '-'
    query_frameshift = []
    target_frameshift = []
    gap_shift = 0
    tags.each_with_index do |tag, n_operation|
      if do_align
        gap_shift = 0 if tag[OPERATION] != 'G'
        query_alignment << query_seq[counter_query, tag[QUERY]]
        target_alignment << target_seq[counter_target, tag[TARGET]].translate
      end
      if tag[OPERATION] == 'F'
        if tag[TARGET] > 0 && tag[TARGET] < 3 # true frameshift
          gap_shift += 1
          # Some frameshifts are not followed by a gap operation, so the gap
          # is inserted here. A frameshift that closes the alignment has no
          # next operation at all (this used to raise on nil).
          next_tag = tags[n_operation+1]
          if next_tag.nil? || next_tag[OPERATION] != 'G'
            query_alignment << '-' if do_align
          end
        else
          query_alignment << '-' * (tag[TARGET]/3.0).ceil if do_align
        end
        query_frameshift << counter_query
        fs_counter_target = counter_target
        # Workaround kept from the original author: reverse hits should be
        # parsed by decreasing the exonerate counter instead of increasing it.
        fs_counter_target = target_seq.length - counter_target if features['target_strand'] == '-'
        if tag[TARGET] > 3
          real_fs = tag[TARGET]%3
          real_gap = tag[TARGET] - real_fs
          fs = [fs_counter_target + real_gap, real_fs]
        else
          fs = [fs_counter_target, tag[TARGET]]
        end
        target_frameshift << fs
      elsif tag[OPERATION] == 'G'
        query_alignment << '-' * (tag[TARGET]/3.0).ceil if do_align
        diff = tag[QUERY] - gap_shift
        target_alignment << '-' * diff if do_align && diff > 0
        gap_shift = 0
      end
      counter_query += tag[QUERY]
      counter_target += tag[TARGET]
    end
    # Exonerate query (protein) is the blast subject and vice versa.
    hit.s_frameshift = query_frameshift
    hit.q_frameshift = target_frameshift

    if do_align
      hit.q_seq = target_alignment
      hit.s_seq = query_alignment
      hit.align_len = query_alignment.length
      hit.ident = set_ident(target_alignment,query_alignment)
    end
  end

  # Mirrors counter_target with respect to the length of target_seq and
  # returns it together with target_seq.complementary_dna.
  def do_reverso_complementary(counter_target, target_seq)
    counter_target = target_seq.length - 1 - counter_target
    target_seq = target_seq.complementary_dna
    return counter_target, target_seq
  end

  # Percentage (two decimals) of positions of target_alignment holding the
  # same character as query_alignment. Returns 0.0 for an empty alignment.
  def set_ident(target_alignment, query_alignment)
    return 0.0 if target_alignment.empty?

    matchs = 0
    target_alignment.each_char.with_index do |char, position|
      matchs += 1 if char == query_alignment[position]
    end
    ('%.2f' % (matchs*100.0/target_alignment.length)).to_f
  end

  # Fills the blast-like attributes of hit that can be taken straight from
  # the vulgar line; attributes exonerate does not provide are set to neutral
  # values (0, nil or '').
  def define_hit_parameters(hit, features, tags)
    hit.gaps = tags.count { |aln| aln[0] == 'G' }
    hit.reversed = false
    hit.align_len =(features['query_end_align'] - features['query_start_align']).abs+1
    hit.mismatches=0
    hit.e_val=0
    hit.bit_score=0
    hit.score = features['score']
    hit.s_frame = nil
    strand = features['target_strand'] == '-' ? -1 : 1
    hit.q_frame = (((features['target_start_align']) % 3) +1) *strand
    hit.subject_id = features['query_id']
    hit.full_subject_length=0
    hit.definition=''
    hit.acc=features['query_id']
    hit.q_seq=''
    hit.s_seq=''
  end

  # inspect results
  def inspect
    header = "Exonerate results:\n#{'-'*20}\nQuerys: #{@querys.count}\n"
    header + @querys.map { |q| q.inspect+"\n" }.join
  end

  # find query by name; returns nil when there is none
  def find_query(querys,name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # check if there are querys
  def empty?
    @querys.empty?
  end

  # get query count
  def size
    @querys.size
  end
end
|
@@ -1,688 +1,297 @@
|
|
1
|
-
|
1
|
+
require 'types'
|
2
2
|
require 'une_los_hit'
|
3
3
|
|
4
4
|
module FlAnalysis
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
#
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
|
16
|
-
|
6
|
+
$global_warnings = []
|
7
|
+
|
8
|
+
# Analyses the ORF of a sequence against its hit(s) and, when the predicted
# protein is long enough, stores the resulting annotation.
#
# seq     - sequence object (responds to seq_fasta and show_alignment).
# hit     - collection with the hits of the sequence.
# options - Hash; :distance, :exonerate and :subject_coverage are read here.
# db_name - name of the database, passed on to save_annotations.
#
# Steps: merge several hits into one (UneLosHit) or flip a reverse-strand
# single hit, optionally repair frameshifts reported by exonerate, translate,
# look for the start and end of the protein, decide the status and save the
# annotation when the protein has at least 25 residues and covers the subject
# by at least options[:subject_coverage].
def analiza_orf_y_fl(seq, hit, options, db_name)
  # Upcase to prevent complications with masked sequences, dup to discard changes.
  query_fasta = seq.seq_fasta.upcase.dup

  if hit.count > 1
    # More than one hit: frames are checked and fixed to get a single hit.
    joined_hits = UneLosHit.new(hit, query_fasta)
    full_prot = joined_hits.full_prot
    query_fasta = joined_hits.output_seq # repaired fasta
    final_hit = joined_hits.final_hit # single hit
    $global_warnings += joined_hits.msgs # warning messages
  else
    final_hit = hit.first # single hit
    # A sequence on the reverse strand is turned around.
    query_fasta = reverse_seq(query_fasta, final_hit) if final_hit.q_frame < 0
  end

  if options[:exonerate]
    query_fasta = exonerate_fix_frame_shift(query_fasta, hit)
  end

  full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
  original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
  if $verbose > 2 ## VERBOSE
    seq.show_alignment(final_hit, query_fasta, show_nts)
  end
  atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
  end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)

  if $verbose > 1 ## VERBOSE
    puts "\n------------------- POST EXTENSION---------------------"
    seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates)
    puts "ATG: #{atg_status} STOP: #{end_status}" if $verbose > 2
  end

  # Decide the sequence status (Complete, Putative Complete, Internal,
  # N-terminus, Putative N-terminus, C-terminus).
  type, status = determine_status(atg_status, end_status)
  status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)

  # Minimum protein length of 25 aa and minimum coverage of the subject.
  return unless final_prot.length >= 25

  subject_coverage = final_prot.length.to_f/final_hit.full_subject_length
  return unless subject_coverage >= options[:subject_coverage]

  save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
end
|
117
39
|
|
118
|
-
# to look for the beginning of the protein
|
119
|
-
(m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)
|
120
40
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
41
|
+
# Determines the state of the protein start and trims the protein to it.
#
# final_hit   - hit being analysed; q_beg is updated when a start is searched.
# distance    - Integer, number of residues after the alignment start that
#               are included in the start search.
# full_prot   - String, translation of the query in the frame of the hit.
# query_fasta - String, nucleotide sequence (only used for verbose output).
#
# Three cases are told apart:
# * the alignment begins at the first subject position with 'M' on both
#   sequences: 'complete', the protein is cut at the alignment start;
# * the alignment start plus distance reaches the subject start: find_start
#   looks for the start in the amino side of the protein;
# * otherwise: 'incomplete', the protein is returned untouched and a warning
#   is recorded when there is a stop codon before the alignment.
#
# Returns [atg_status, protein].
def set_start_codon(final_hit, distance, full_prot, query_fasta)
  align_start = contenidos_en_prot(final_hit.q_seq, full_prot)
  search_limit = align_start + distance

  # There is M in query and subject at the first position of the alignment
  # and the subject's M is in its first position.
  if final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M'
    return 'complete', full_prot[align_start..full_prot.length]
  end

  unless search_limit >= final_hit.s_beg
    $global_warnings << 'UnexpStopBegSeq' if full_prot[0, align_start].rindex('*')
    return 'incomplete', full_prot
  end

  amino_side = full_prot[0, search_limit] # amino part of the protein
  carboxyl_side = full_prot[search_limit, full_prot.length - search_limit] # carboxyl part up to the end of the sequence
  uncut_length = amino_side.length
  amino_side, atg_status = find_start(final_hit.s_beg, amino_side, distance) # look for the beginning of the protein
  merged_prot = "#{amino_side}#{carboxyl_side}"

  new_q_beg = final_hit.q_frame-1 + (uncut_length - amino_side.length) * 3
  modify_5p_align(new_q_beg, final_hit, query_fasta) if $verbose > 1 ## VERBOSE, modify query align
  final_hit.q_beg = new_q_beg # start_ORF index

  return atg_status, merged_prot
end
|
240
66
|
|
241
67
|
|
242
|
-
def
|
243
|
-
#
|
244
|
-
|
245
|
-
if
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
else
|
253
|
-
seq.annotate(:apply_tcode,'')
|
254
|
-
end
|
255
|
-
else
|
256
|
-
warnings = "Coding sequence with some errors, #{warnings}"
|
257
|
-
tmp_annot = seq.sec_desc.sub('my_warning',"#{warnings}")
|
258
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
259
|
-
end
|
260
|
-
else
|
261
|
-
save_last_db_annotations(seq)
|
262
|
-
end
|
263
|
-
else
|
264
|
-
if (seq.sec_desc.empty?)
|
265
|
-
if (!q.hits[0].definition.nil?)
|
266
|
-
warnings = "Coding sequence with some errors, #{warnings}"
|
267
|
-
seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
|
268
|
-
end
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
end
|
273
|
-
|
274
|
-
|
275
|
-
def save_last_db_annotations(seq)
|
276
|
-
|
277
|
-
# puts "sequence not complete! recovering annotations from previous database! sldba!!"
|
278
|
-
(q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
|
279
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
280
|
-
|
281
|
-
(name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
|
282
|
-
if (final_hit.reversed)
|
283
|
-
(kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
|
284
|
-
end
|
285
|
-
|
286
|
-
seq.annotate(:protein,seq.get_annotations(:tmp_annotation).first[:message][1])
|
287
|
-
seq.annotate(:alignment,seq.get_annotations(:tmp_annotation).first[:message][2])
|
288
|
-
tmp_annot = "#{name}\t#{fasta_length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
|
289
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
290
|
-
|
291
|
-
end
|
292
|
-
|
293
|
-
|
294
|
-
def find_start(subject_start, substring, fiable, aas_n_end)
|
295
|
-
|
296
|
-
tmp_prot = ''
|
297
|
-
msgs = ''
|
298
|
-
atg_status = 'incomplete' # complete, incomplete or putative
|
299
|
-
|
300
|
-
# puts "\nsubstring (#{substring.length} aas):\n#{substring}"
|
301
|
-
stop_codon = substring.rindex('*')
|
302
|
-
|
303
|
-
# marcamos la distancia al s_beg desde el principio del substring
|
304
|
-
# s_beg_distance = (substring.length) - subject_start
|
305
|
-
s_beg_distance = (substring.length - 10) - subject_start
|
306
|
-
# marcamos la distancia al s_beg desde el final del substring
|
307
|
-
atg_distance = (subject_start + 1) - (substring.length - 10)
|
308
|
-
if (atg_distance <= 0)
|
309
|
-
atg_distance = 0
|
310
|
-
else
|
311
|
-
# puts "expected atg_distance = 0, your sequence atg_distance = #{atg_distance}; limit (1-15)"
|
312
|
-
msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, "
|
313
|
-
end
|
314
|
-
|
315
|
-
# puts "s_beg_distance:#{s_beg_distance}, stop_codon: #{stop_codon}, subject_start: #{subject_start + 1}, atg_distance: #{atg_distance}"
|
316
|
-
#----------------------------------------------------------------------------------------------------------
|
317
|
-
# tenemos un codon de parada en el substring 5 prima
|
318
|
-
if (stop_codon)
|
319
|
-
stop_codon += 1
|
320
|
-
# ahora vamos a ver si el stop esta antes o despues del s_beg
|
321
|
-
if (stop_codon <= s_beg_distance) # esta antes
|
322
|
-
substring = substring[stop_codon, substring.length - stop_codon]
|
323
|
-
# puts "\nhay un codon de parada en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon +1}\n#{substring}\n\n"
|
324
|
-
|
325
|
-
first_m = substring.index('M')
|
326
|
-
|
327
|
-
if (first_m) # tenemos M y stop ---------------------------------------------------------------------------
|
328
|
-
substring = substring[first_m, substring.length - first_m]
|
329
|
-
|
68
|
+
def find_start(subject_start, amine_seq, distance)
|
69
|
+
atg_status = 'putative' # complete, incomplete or putative
|
70
|
+
stop_codon = amine_seq.rindex('*')
|
71
|
+
if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
|
72
|
+
_5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
|
73
|
+
amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
|
74
|
+
first_m = amine_seq.index('M')
|
75
|
+
if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
|
76
|
+
if first_m # tenemos M
|
77
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1]
|
330
78
|
atg_status = 'complete'
|
331
|
-
else # con STOP pero sin M
|
332
|
-
|
333
|
-
# puts "there is not a start codon near the expected beginning of your sequence, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
|
334
|
-
msgs += "W1: There is no M at the beginning, "
|
79
|
+
else # con STOP pero sin M
|
80
|
+
$global_warnings << 'noM1'
|
335
81
|
end
|
336
|
-
#----------------------------------------------------------------------------------------------------------
|
337
82
|
else # esta despues, un cambio de fase impide analizar el principio
|
338
|
-
|
339
|
-
|
340
|
-
if (first_m) # tenemos M y unexpected stop # comentar?
|
341
|
-
substring = substring[first_m, substring.length - first_m] # comentar?
|
342
|
-
end # comentar?
|
343
|
-
# TODO esto se puede cambiar!
|
344
|
-
atg_status = 'putative'
|
345
|
-
msgs += " Unexpected STOP codon in 5 prime region, "
|
346
|
-
# puts "\nhay un codon de parada inesperado en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon}, s_beg_distance: #{s_beg_distance +1}, atg_distance: #{atg_distance}"
|
83
|
+
$global_warnings << 'UnexpSTOP5p'
|
84
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
|
347
85
|
end
|
348
|
-
#---------------------------------------------------------------------------------------------------------------
|
349
86
|
else # no hay stop codon
|
350
|
-
first_m =
|
351
|
-
if
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
if (m_distance > aas_n_end*2) # sin STOP, con atg pero muy lejos del inicio que marca el subject ---------------
|
357
|
-
# puts "No stop codon before M and M found is too far from subject M, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
|
358
|
-
msgs += "No stop codon before M and M found is too far from subject M, "
|
87
|
+
first_m = amine_seq.index('M')
|
88
|
+
if first_m # tenemos M
|
89
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1]
|
90
|
+
m_distance = (subject_start - amine_seq.length).abs - 10
|
91
|
+
if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
|
92
|
+
$global_warnings << 'NoStopMfar'
|
359
93
|
atg_status = 'incomplete'
|
360
|
-
else
|
361
|
-
|
362
|
-
# msgs += "No stop codon before M but high homology subject, "
|
363
|
-
atg_status = 'complete'
|
364
|
-
else # Tenemos M pero no tenemos stop y el ortologo no es fiable -------------------------------------------
|
365
|
-
# puts "No stop codon before M and low homology subject, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
|
366
|
-
msgs += "No stop codon before M and low homology subject, "
|
367
|
-
atg_status = 'putative'
|
368
|
-
end
|
94
|
+
else # Tenemos M
|
95
|
+
atg_status = 'complete'
|
369
96
|
end
|
370
|
-
else # sin M
|
371
|
-
|
372
|
-
# puts "your sequence has the subject beginning but there is not start codon at the beginning, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
|
373
|
-
msgs += "W2: There is no M at the beginning, "
|
97
|
+
else # sin M
|
98
|
+
$global_warnings << 'noM2'
|
374
99
|
end
|
375
100
|
end
|
376
|
-
|
377
|
-
return [substring, atg_status, msgs]
|
378
|
-
|
101
|
+
return amine_seq, atg_status
|
379
102
|
end
|
380
103
|
|
381
104
|
|
382
|
-
def find_end(final_hit,
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
# si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
|
395
|
-
if (sq_end_distance + aas_n_end < 0)
|
105
|
+
def find_end(final_hit, max_distance, tmp_prot, query_fasta)
|
106
|
+
frame_shift = check_frame_shift(final_hit)
|
107
|
+
beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
|
108
|
+
atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
|
109
|
+
end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
|
110
|
+
#puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3 - max_distance: #{max_distance}\e[0m"
|
111
|
+
#puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
|
112
|
+
#puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
|
113
|
+
if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene
|
114
|
+
atg_substring = tmp_prot
|
115
|
+
end_substring = ''
|
396
116
|
end_status = 'incomplete'
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
# if (@verbose)
|
406
|
-
# puts "#{db_name} -- #{q.query_def} --> Distance to subject end: #{sq_end_distance.abs} aas"
|
407
|
-
# end
|
408
|
-
end
|
117
|
+
else
|
118
|
+
end_status = 'putative'
|
119
|
+
putative_end = end_substring.index('*')
|
120
|
+
end_substring = end_substring[0 .. putative_end] if putative_end
|
121
|
+
|
122
|
+
s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
|
123
|
+
q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo
|
124
|
+
sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5'
|
409
125
|
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
#
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
126
|
+
if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment
|
127
|
+
end_status = 'complete'
|
128
|
+
elsif sq_end_distance < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
|
129
|
+
end_status = 'incomplete'
|
130
|
+
if putative_end
|
131
|
+
$global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
|
132
|
+
else
|
133
|
+
$global_warnings << ['DistSubj', sq_end_distance.abs]
|
134
|
+
end
|
135
|
+
else # tenemos suficiente secuencia
|
136
|
+
if putative_end # tenemos un stop
|
137
|
+
#beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
|
138
|
+
stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
|
139
|
+
if stop_q_s.abs <= max_distance #Stop codon is in search region
|
140
|
+
end_status = 'complete'
|
141
|
+
elsif stop_q_s < 0
|
142
|
+
$global_warnings << 'UnexpSTOP3p'
|
143
|
+
elsif stop_q_s > 0
|
144
|
+
end_status = 'complete'
|
145
|
+
$global_warnings << 'QueryTooLong'
|
146
|
+
end
|
147
|
+
else # no tenemos codon de parada pero tenemos suficiente secuencia
|
148
|
+
end_status = 'incomplete'
|
149
|
+
$global_warnings << 'ProtFusion'
|
431
150
|
end
|
432
|
-
end_substring = end_substring[0, putative_end+1]
|
433
|
-
|
434
|
-
else # no tenemos codon de parada pero tenemos suficiente secuencia
|
435
|
-
end_status = 'putative'
|
436
|
-
warnings += " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
|
437
|
-
# if (@verbose)
|
438
|
-
# puts "#{db_name} -- #{q.query_def} --> STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas"
|
439
|
-
# end
|
440
151
|
end
|
441
|
-
|
442
152
|
end
|
443
|
-
|
444
|
-
|
153
|
+
final_prot = atg_substring + end_substring
|
154
|
+
end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
|
155
|
+
new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
|
156
|
+
modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if $verbose > 1
|
157
|
+
final_hit.q_end = new_q_end
|
158
|
+
return end_status, final_prot
|
445
159
|
end
|
446
160
|
|
447
161
|
|
448
|
-
def determine_status(atg_status,end_status)
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
final_status = 'N-terminus'
|
458
|
-
elsif (atg_status == 'putative') && (end_status == 'incomplete') # puede que tengamos el principio de la proteina
|
459
|
-
final_status = 'Putative N-terminus'
|
460
|
-
elsif (atg_status == 'incomplete') && (end_status == 'complete') # tenemos el final de la proteina
|
461
|
-
final_status = 'C-terminus'
|
462
|
-
elsif (atg_status == 'incomplete') && (end_status == 'putative') # puede que tengamos el final de la proteina
|
463
|
-
final_status = 'Putative C-terminus'
|
162
|
+
def determine_status(atg_status, end_status)
|
163
|
+
if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
|
164
|
+
type = COMPLETE
|
165
|
+
elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
|
166
|
+
type = INTERNAL
|
167
|
+
elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
|
168
|
+
type = N_TERMINAL
|
169
|
+
elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
|
170
|
+
type = C_TERMINAL
|
464
171
|
end
|
465
172
|
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
|
471
|
-
name_diff = q.query_def.length - final_hit.acc.length
|
472
|
-
if (name_diff > 0)
|
473
|
-
spnum = ' '*name_diff.to_i
|
173
|
+
if atg_status == 'putative' || end_status == 'putative'
|
174
|
+
status = FALSE # Putative
|
474
175
|
else
|
475
|
-
|
176
|
+
status = TRUE # Sure
|
476
177
|
end
|
477
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
478
|
-
# if the sequence is Complete will be printed --------------------------------------------------------------------
|
479
|
-
if (final_status == 'Complete')
|
480
|
-
seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
|
481
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
482
|
-
|
483
|
-
if (final_hit.reversed)
|
484
|
-
(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
|
485
|
-
end
|
486
|
-
seq.annotate(:complete,"#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}")
|
487
|
-
seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
|
488
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
489
|
-
else # la proteina no esta completa -------------------------------------------------------------------------
|
490
|
-
if (!seq.get_annotations(:tmp_annotation).empty?) && (!seq.get_annotations(:tmp_annotation).nil?) # ---> trae informacion de una bd anterior
|
491
|
-
if (db_name =~/^tr_/) # ---> estamos usando el trembl, se dejan las anotaciones que trae
|
492
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: sequence not complete! recovering annotations from previous database!"
|
493
|
-
(kk1, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
|
494
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
495
178
|
|
496
|
-
|
497
|
-
|
498
|
-
(kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
|
499
|
-
end
|
500
|
-
|
501
|
-
my_prot = seq.get_annotations(:tmp_annotation).first[:message][1]
|
502
|
-
seq.annotate(:protein,my_prot)
|
503
|
-
my_align = seq.get_annotations(:tmp_annotation).first[:message][2]
|
504
|
-
seq.annotate(:alignment,my_align)
|
505
|
-
|
506
|
-
tmp_annot = "#{name}\t#{query_fasta.length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
|
507
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
508
|
-
#-----------------------------------------------------------------------------------------------------------------------------
|
509
|
-
# elsif (db_name =~ /^sp_/) # ---> estamos usando el sp, se dejan las anotaciones que trae
|
510
|
-
|
511
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: Mantenemos las anotaciones de la BD de usuario y pasamos la secuencia al trembl"
|
512
|
-
end
|
513
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
514
|
-
elsif (seq.get_annotations(:tmp_annotation).empty?) # ---> NO trae informacion de una bd anterior
|
515
|
-
if (db_name =~ /^tr_/) # ---> estamos usando el trembl
|
516
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: #{q.query_def} is not complete!! se anota con trembl"
|
517
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
179
|
+
return type, status
|
180
|
+
end
|
518
181
|
|
519
|
-
if (final_hit.reversed)
|
520
|
-
(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
|
521
|
-
end
|
522
182
|
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
|
533
|
-
seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
|
534
|
-
|
535
|
-
# puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
|
536
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD"
|
183
|
+
def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
|
184
|
+
if final_prot.length - 2 * distance > final_hit.s_len
|
185
|
+
$global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
|
186
|
+
elsif final_prot.length + 2 * distance < final_hit.s_len
|
187
|
+
$global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
|
188
|
+
if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len
|
189
|
+
if type == COMPLETE
|
190
|
+
status = FALSE
|
191
|
+
$global_warnings << 'VeryShorter'
|
537
192
|
end
|
538
193
|
end
|
539
194
|
end
|
195
|
+
return status
|
540
196
|
end
|
541
197
|
|
542
198
|
|
543
|
-
def
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
# puts "empieza en el borde de la seq"
|
199
|
+
def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
|
200
|
+
# if the sequence is Complete or it hasn't previous info will be saved
|
201
|
+
if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
|
202
|
+
seq.type = type
|
203
|
+
seq.status = status
|
204
|
+
seq.db_name = db_name
|
205
|
+
seq.seq_fasta = query_fasta
|
206
|
+
seq.seq_aa = final_prot
|
207
|
+
seq.hit = final_hit
|
208
|
+
seq.warnings($global_warnings)
|
209
|
+
$global_warnings = [] # Clean all warnings for current sequence
|
210
|
+
seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
|
211
|
+
if type == COMPLETE
|
212
|
+
seq.ignore = TRUE
|
558
213
|
end
|
559
|
-
|
560
|
-
atg_found = my_seq_n.index(/ATG/i)
|
561
|
-
atg_found_rv = my_seq_n.rindex(/ATG/i)
|
562
|
-
my_atg_index = nil
|
563
214
|
end
|
564
|
-
|
565
|
-
|
566
|
-
if (beg5)
|
567
|
-
|
568
|
-
my_seq_n.sub!(/ATG/i,'_-_ATG')
|
569
|
-
my_atg_index = atg_found
|
570
|
-
my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1]
|
571
|
-
|
572
|
-
elsif (atg_found == atg_found_rv)
|
573
|
-
|
574
|
-
my_seq_n.sub!(/ATG/i,'_-_ATG')
|
575
|
-
my_atg_index = final_hit.q_beg - 5 + atg_found
|
576
|
-
|
577
|
-
my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
|
578
|
-
|
579
|
-
# puts "my_seq despues de encontrar el atg: #{my_seq}"
|
580
|
-
elsif (atg_found == 5) || (atg_found_rv == 5)
|
581
|
-
|
582
|
-
my_seq_n = my_seq_n[0..4]+'_-_'+my_seq_n[5..10]
|
583
|
-
my_atg_index = final_hit.q_beg - 5 + atg_found
|
584
|
-
my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
|
585
|
-
|
586
|
-
else
|
587
|
-
|
588
|
-
# puts "#{q.query_def} tiene mas de un ATG my_seq_n: #{my_seq_n}"
|
589
|
-
bad_atg = true
|
590
|
-
my_seq = query_fasta
|
591
|
-
end
|
592
|
-
|
593
|
-
else
|
594
|
-
|
595
|
-
bad_atg = true
|
596
|
-
# puts "#{q.query_def} NO TIENE ATG my_seq_n: #{my_seq_n}"
|
597
|
-
my_seq = query_fasta
|
598
|
-
|
215
|
+
if $verbose > 2
|
216
|
+
puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
|
599
217
|
end
|
600
|
-
|
601
|
-
stop_c = nil
|
602
|
-
if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
|
218
|
+
end
|
603
219
|
|
604
|
-
if (bad_atg == true)
|
605
|
-
stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
|
606
|
-
stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
|
607
|
-
else
|
608
|
-
stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
|
609
|
-
stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
|
610
|
-
end
|
611
220
|
|
221
|
+
def mark_nt_seqs(final_hit, query_fasta)
|
222
|
+
atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
|
223
|
+
mark_atg = nil
|
224
|
+
if atg == 'ATG'
|
225
|
+
mark_atg = '_-_'
|
612
226
|
end
|
227
|
+
stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
|
228
|
+
mark_stop = nil
|
229
|
+
if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
|
230
|
+
mark_stop = '___'
|
231
|
+
end
|
232
|
+
seq5p = query_fasta[0..final_hit.q_beg-1]
|
233
|
+
orf = query_fasta[final_hit.q_beg..final_hit.q_end]
|
234
|
+
seq3p = query_fasta[final_hit.q_end..query_fasta.length]
|
235
|
+
nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
|
236
|
+
return nt_seq
|
237
|
+
end
|
613
238
|
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
if
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
my_prot = my_prot.sub(/___\w+/,'')
|
627
|
-
my_prot = my_prot.translate
|
628
|
-
my_prot = my_prot.sub(/x$/,'')
|
629
|
-
|
630
|
-
simliar_fragment = final_prot.lcs(my_prot)
|
631
|
-
|
632
|
-
if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length)
|
633
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
|
634
|
-
else
|
635
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
636
|
-
# puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors"
|
239
|
+
def exonerate_fix_frame_shift(query_fasta, hit)
|
240
|
+
frame_shifts = []
|
241
|
+
added_nts = 0
|
242
|
+
hit.each_with_index do |hsp, num|
|
243
|
+
if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
|
244
|
+
if !hsp.q_frameshift.empty? #There is frameshift
|
245
|
+
hsp.q_frameshift.each do |position, num_nts|
|
246
|
+
local_add = 3 - num_nts
|
247
|
+
fs_final_position = position + num_nts
|
248
|
+
$global_warnings << ['ExFrameS', fs_final_position]
|
249
|
+
frame_shifts << [fs_final_position, local_add]
|
250
|
+
added_nts += local_add
|
637
251
|
end
|
638
|
-
|
639
252
|
end
|
253
|
+
end
|
254
|
+
hsp.q_beg += added_nts if num > 0
|
255
|
+
hsp.q_end += added_nts
|
256
|
+
end
|
257
|
+
add = 0
|
258
|
+
frame_shifts.each do |position, num_nts|
|
259
|
+
query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
|
260
|
+
add += num_nts
|
261
|
+
end
|
262
|
+
return query_fasta
|
263
|
+
end
|
640
264
|
|
641
|
-
else
|
642
|
-
if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
|
643
265
|
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
end
|
651
|
-
|
652
|
-
if (!stop_c.nil?)
|
653
|
-
if (stop_c.translate == '*')
|
654
|
-
final_hit.q_end = final_hit.q_end + 3
|
655
|
-
if (bad_atg == true)
|
656
|
-
my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
|
657
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
658
|
-
else
|
659
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
660
|
-
end
|
661
|
-
else
|
662
|
-
if (bad_atg == true)
|
663
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
664
|
-
# puts "find nt end: NO ATG, NO exact STOP"
|
665
|
-
else
|
666
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
667
|
-
# puts "find nt end: GOOD ATG, NO exact STOP"
|
668
|
-
end
|
669
|
-
end
|
670
|
-
end
|
671
|
-
end
|
672
|
-
|
266
|
+
## VERBOSE METHODS
|
267
|
+
def show_nts
|
268
|
+
show = FALSE
|
269
|
+
show = TRUE if $verbose && $verbose > 3
|
270
|
+
return show
|
271
|
+
end
|
673
272
|
|
674
|
-
end
|
675
273
|
|
676
|
-
|
274
|
+
def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
|
275
|
+
if new_q_end > final_hit.q_end #There is an align extension
|
276
|
+
extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
|
277
|
+
final_hit.q_seq = final_hit.q_seq + extend_align
|
278
|
+
elsif new_q_end < final_hit.q_end #The align is cutted
|
279
|
+
upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
|
280
|
+
final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
|
281
|
+
end
|
282
|
+
end
|
677
283
|
|
678
|
-
if (bad_atg == true)
|
679
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
|
680
|
-
else
|
681
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
|
682
|
-
end
|
683
284
|
|
285
|
+
def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
|
286
|
+
if new_q_beg < final_hit.q_beg #There is an align extension
|
287
|
+
extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
|
288
|
+
final_hit.q_seq = extend_align + final_hit.q_seq
|
289
|
+
elsif new_q_beg > final_hit.q_beg #The align is cut
|
290
|
+
seq_cut = (new_q_beg - final_hit.q_beg)/3
|
291
|
+
gaps = final_hit.q_seq[0..seq_cut].count('-')
|
292
|
+
seq_cut += gaps
|
293
|
+
final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
|
684
294
|
end
|
685
|
-
|
686
295
|
end
|
687
296
|
|
688
297
|
end
|