full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,78 +1,116 @@
|
|
1
1
|
|
2
2
|
module CommonFunctions
|
3
3
|
|
4
|
-
def contenidos_en_prot(
|
5
|
-
|
6
|
-
|
7
|
-
q_index_start =
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
fr_index_start = full_prot.index(simliar_fragment)
|
27
|
-
|
28
|
-
if (q_index_start == 9999)
|
29
|
-
q_index_start = fr_index_start
|
4
|
+
def contenidos_en_prot(key_seq, full_prot)
|
5
|
+
full_prot = full_prot.gsub(/[\-Xx]/,'-')
|
6
|
+
compare_prot = key_seq.gsub(/[\-Xx]/,'-')
|
7
|
+
q_index_start = full_prot.index(compare_prot) #Full match between hit.q_seq and full_prot (unigene)
|
8
|
+
if q_index_start.nil? #There is gaps that unables the full match
|
9
|
+
q_index_start = match_with_ungapped_reference(full_prot, compare_prot)
|
10
|
+
if q_index_start.nil? && full_prot.include?('-')
|
11
|
+
diff = full_prot.length - compare_prot.length
|
12
|
+
if scan_sequences(full_prot.split(''), compare_prot.split('')) == compare_prot.length
|
13
|
+
q_index_start = 0
|
14
|
+
end
|
15
|
+
|
16
|
+
if diff >0 && scan_sequences(full_prot.split(''), compare_prot.split(''), diff) == compare_prot.length
|
17
|
+
q_index_start = diff
|
18
|
+
end
|
19
|
+
|
20
|
+
if q_index_start.nil?
|
21
|
+
q_index_start = match_with_gapped_reference(full_prot, compare_prot)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
if q_index_start.nil?
|
25
|
+
q_index_start = 0
|
30
26
|
end
|
31
|
-
full_prot = full_prot[(fr_index_start + simliar_fragment.length)..full_prot.length]
|
32
|
-
end
|
33
|
-
|
34
|
-
simliar_fragment = full_prot.lcs(compare_prot)
|
35
|
-
|
36
|
-
# if ($verbose)
|
37
|
-
# puts "#{q.query_def}-------------------------------------#{suma_fragments} de #{compare_prot.length}"
|
38
|
-
# puts "#{q.query_def}-------------------------------------#{suma_fragments + masked_x} >= #{compare_prot.length * 0.7}"
|
39
|
-
# puts "\nfull: #{full_prot}\ncomp: #{compare_prot}\nsimliar_fragment: #{simliar_fragment}"
|
40
|
-
# end
|
41
|
-
|
42
|
-
if (suma_fragments + masked_x >= compare_prot.length * 0.7)
|
43
|
-
is_ok = true
|
44
|
-
# puts "OK -- encontramos suficiente similitud entre query y subject -- OK"
|
45
|
-
else
|
46
|
-
is_ok = false
|
47
|
-
# puts "\nfull: #{full_prot}\ncomp: #{compare_prot}"
|
48
|
-
# puts "Warning!: no match comparing proteins"
|
49
|
-
end
|
50
27
|
|
51
|
-
min_index_start = [min_index_start, q_index_start].min
|
52
|
-
|
53
|
-
if (min_index_start == 9999)
|
54
|
-
min_index_start = 0
|
55
28
|
end
|
56
|
-
|
57
|
-
return [is_ok, min_index_start]
|
29
|
+
return q_index_start
|
58
30
|
end
|
59
31
|
|
60
32
|
|
33
|
+
# Locates compare_prot inside a reference (full_prot) that contains gaps.
#
# full_prot is cut at every run of '-'; fragments of 4 residues or fewer are
# ignored. Each remaining fragment is searched in compare_prot, first whole and,
# if that fails, by its first five residues only. The first fragment that is
# found decides the result: its position in full_prot, moved upstream by
# extend_match when the fragment is not the first piece of the split.
#
# Returns the start index in full_prot, or nil when no fragment matches.
def match_with_gapped_reference(full_prot, compare_prot)
  full_prot.split(/\-+/).each_with_index do |fragment, position|
    next unless fragment.length > 4

    probe = fragment
    found_in_compare = compare_prot.index(probe)
    if found_in_compare.nil? # In cases that no match by gaps
      probe = fragment[0..4]
      found_in_compare = compare_prot.index(probe)
    end
    next if found_in_compare.nil?

    start = full_prot.index(probe)
    if position > 0
      start, found_in_compare = extend_match(full_prot, compare_prot, start, found_in_compare)
    end
    return start
  end
  nil
end
|
61
54
|
|
55
|
+
# Extends a match upstream (towards index 0) on both sequences.
#
# full_prot          - reference protein string.
# compare_prot       - protein string being located in the reference.
# q_index_start      - index in full_prot where the current match starts.
# compare_prot_index - index in compare_prot where the current match starts.
#
# The text that precedes each index is reversed and handed to scan_sequences,
# which reports how many positions agree when walking backwards from the match.
# Both indexes are moved back by that amount.
#
# Returns [q_index_start, compare_prot_index] after the extension.
def extend_match(full_prot, compare_prot, q_index_start, compare_prot_index)
  # Length-based slices: with an index of 0 they give '' (nothing upstream).
  # The former 0..index-1 range turned into 0..-1 in that case, i.e. the whole
  # string, which produced bogus extensions and negative indexes.
  upstream_ref = full_prot[0, q_index_start].reverse.split('')
  upstream_cmp = compare_prot[0, compare_prot_index].reverse.split('')
  extension = scan_sequences(upstream_ref, upstream_cmp)
  return q_index_start - extension, compare_prot_index - extension
end
|
62
63
|
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
64
|
+
# Counts how many consecutive positions of two sequences agree.
#
# ref_seq     - reference residues, as an Array of one-character strings or as
#               a plain String.
# compare_seq - residues to compare against, Array or String.
# diff        - number of leading positions of ref_seq to skip before the
#               comparison starts (default 0).
#
# Starting at ref_seq[diff] and compare_seq[0], both sequences are walked in
# parallel. A position agrees when the residues are equal or when either of
# them is a gap ('-'). The walk stops at the first disagreement or when
# compare_seq runs out.
#
# Returns the number of agreeing positions.
def scan_sequences(ref_seq, compare_seq, diff = 0)
  # Strings have no each_with_index; accept them by splitting into characters.
  ref_seq = ref_seq.chars if ref_seq.is_a?(String)
  matched = 0
  ref_seq.each_with_index do |char, i|
    next if i < diff

    compare_char = compare_seq[matched]
    break if compare_char.nil? || (char != compare_char && char != '-' && compare_char != '-')

    matched += 1
  end
  matched
end
|
69
77
|
|
70
|
-
|
78
|
+
# Locates compare_prot inside full_prot when compare_prot itself has gaps.
#
# compare_prot is cut at every run of '-'. The first fragment longer than
# 4 residues is searched in full_prot; when it is found and it is not the first
# piece of the split, refine_match shifts the index back to compensate for the
# residues that precede the fragment in compare_prot.
#
# Returns the start index in full_prot, or nil when that fragment is not found
# or no fragment is long enough.
#
# NOTE(review): only the first fragment longer than 4 residues is ever tried;
# the search ends there even when it is not found. Confirm that later fragments
# are not meant to be attempted as well.
def match_with_ungapped_reference(full_prot, compare_prot)
  compare_prot.split(/\-+/).each_with_index do |fragment, position|
    next unless fragment.length > 4

    found_at = full_prot.index(fragment)
    if position > 0 && !found_at.nil?
      found_at = refine_match(fragment, compare_prot, found_at) # Correction if first seq isn't enough large
    end
    return found_at
  end
  nil
end
|
71
92
|
|
72
|
-
|
73
|
-
|
93
|
+
# Moves a match index back to where the whole sequence would start.
#
# subseq        - fragment of seq that was located in the reference.
# seq           - complete (possibly gapped) sequence subseq comes from.
# q_index_start - index in the reference where subseq was found.
#
# The index is reduced by the number of non-gap characters that precede subseq
# in seq (position of subseq minus the '-' characters counted up to it).
#
# Returns the corrected index, or q_index_start unchanged when subseq does not
# occur in seq (previously this case raised NoMethodError on nil).
def refine_match(subseq, seq, q_index_start)
  location_seq = seq.index(subseq)
  return q_index_start if location_seq.nil?

  gaps_on_location = seq[0..location_seq].count('-')
  q_index_start - (location_seq - gaps_on_location) # Correction if first seq isn't enough large
end
|
75
99
|
|
100
|
+
# Flips a hit and its nucleotide sequence to the opposite strand.
#
# query_fasta - nucleotide String; must respond to complementary_dna.
# hit         - hit object, modified in place: q_frame changes sign, q_beg and
#               q_end are mirrored against the sequence length, and reversed is
#               set to true. When the hit's class is named 'ExoBlastHit' the
#               positions stored in q_frameshift are mirrored as well.
#
# Returns the value of query_fasta.complementary_dna (per the original author's
# note this is in fact the reverse complement).
def reverse_seq(query_fasta, hit)
  last_index = query_fasta.length - 1
  hit.q_frame = -hit.q_frame
  hit.q_end = last_index - hit.q_end
  hit.q_beg = last_index - hit.q_beg
  # Literal true: the TRUE constant no longer exists in Ruby >= 3.2.
  hit.reversed = true
  query_fasta = query_fasta.complementary_dna # This actually produces the reverse complement.
  if hit.class.to_s == 'ExoBlastHit'
    hit.q_frameshift.map! do |position, num_nts|
      [query_fasta.length - 1 - position, num_nts]
    end
  end
  query_fasta
end
|
76
114
|
|
77
115
|
|
78
116
|
def corrige_frame(ref_frame,ref_start,ref_end)
|
@@ -89,6 +127,10 @@ module CommonFunctions
|
|
89
127
|
|
90
128
|
end
|
91
129
|
|
92
|
-
|
93
|
-
|
94
|
-
|
130
|
+
# Returns the remainder (0, 1 or 2) of the hit's query span, measured as
# q_end - q_beg + 1, divided by 3. A non-zero value means the span is not a
# whole number of codons.
#
# hit - object responding to q_beg and q_end.
def check_frame_shift(hit)
  (hit.q_end - hit.q_beg + 1) % 3
end
|
136
|
+
end
|
@@ -0,0 +1,258 @@
|
|
1
|
+
# Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# 'Software'), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
require 'blast_query'
|
23
|
+
require 'blast_hit'
|
24
|
+
require 'fl_string_utils.rb'
|
25
|
+
|
26
|
+
OPERATION = 0
|
27
|
+
QUERY = 1
|
28
|
+
TARGET = 2
|
29
|
+
|
30
|
+
# BlastHit subclass used for exonerate results. On top of what BlastHit
# provides it carries two frameshift lists, q_frameshift and s_frameshift,
# which both start out empty.
class ExoBlastHit < BlastHit
  attr_accessor :q_frameshift, :s_frameshift

  # The four coordinates are forwarded untouched to BlastHit#initialize.
  def initialize(start_target, ends_target, start_query, ends_query)
    super
    @q_frameshift = []
    @s_frameshift = []
  end
end
|
38
|
+
|
39
|
+
# Extracts results from exonerate output ("vulgar:" lines) and uses them to
# create instances of BlastQuery and ExoBlastHit.
#
# Naming is swapped with respect to exonerate: one BlastQuery is created per
# exonerate *target* id, and on every hit the q_* fields describe the exonerate
# target while the s_* fields describe the exonerate query.
class ExonerateResult
  attr_accessor :querys

  # Parser initialization.
  #
  # input      - path of an exonerate output file, or an Array of paths.
  # seqs       - optional collection indexed by target id (the original code
  #              labels it "unigenes").
  # query_seqs - optional collection indexed by query id (labelled "prot").
  #              Alignments are rebuilt only when both collections are given.
  # all        - true keeps every vulgar line; false keeps, for each file and
  #              target id, only the line with the highest score.
  def initialize(input, seqs = nil, query_seqs = nil, all = true)
    @querys = []
    @seqs = seqs # unigenes
    @prot_seqs = query_seqs # prot

    files = input.is_a?(Array) ? input : [input]
    # File.readlines closes the file, unlike File.open(...).readlines.
    files.each { |file| parse_file(File.readlines(file), all) }
  end

  # Turns the "vulgar:" lines found in lines into feature hashes and hands
  # them to convert_parsed_lines. Any other line is ignored.
  def parse_file(lines, all)
    lines_parsed = all ? [] : {}
    lines.each do |line|
      next unless line =~ /^vulgar:/

      fields = line.chomp.split(' ', 11)
      features = {
        'query_id' => fields[1],
        'query_start_align' => fields[2].to_i,
        'query_end_align' => fields[3].to_i,
        'query_strand' => fields[4],
        'target_id' => fields[5],
        'target_start_align' => fields[6].to_i,
        'target_end_align' => fields[7].to_i,
        'target_strand' => fields[8],
        'score' => fields[9].to_i,
        'align_data' => fields[10]
      }
      if all
        lines_parsed << features
      else
        # Store the first line seen for a target; replace it only when a
        # later line for the same target has a higher score.
        best = lines_parsed[features['target_id']]
        lines_parsed[features['target_id']] = features if best.nil? || features['score'] > best['score']
      end
    end
    convert_parsed_lines(lines_parsed)
  end

  # Builds queries and hits from parsed lines.
  #
  # lines_parsed - Array of feature hashes, or Hash of target id => features.
  #
  # A new BlastQuery is started whenever the target id differs from the one of
  # the previous line. A line that raises while being converted is reported on
  # stdout and skipped.
  def convert_parsed_lines(lines_parsed)
    last_query = nil
    query = nil
    from_array = lines_parsed.is_a?(Array)
    lines_parsed.each do |line|
      # Iterating a Hash yields [key, value] pairs. Resolved outside the
      # begin block so the error message below can always use it.
      features = from_array ? line : line[1]
      begin
        tags = features['align_data'].scan(/([MFG53S]) ([0-9]+) ([0-9]+)/)
        tags.map! { |tag| [tag[0], tag[1].to_i, tag[2].to_i] }
        if features['target_id'] != last_query
          last_query = features['target_id']
          query = BlastQuery.new(features['target_id'])
          @querys << query
        end
        hiting(features, tags, query)
      rescue StandardError
        puts "Result: #{features['target_id']} => #{features['query_id']} hasn't been parsed\n#{line}"
      end
    end
  end

  # Converts exonerate's relative coordinates into absolute, BLAST-like ones,
  # defining only the hits. Creates an ExoBlastHit, adds it to query, records
  # the frameshift positions and, when sequences are available, rebuilds both
  # aligned sequences.
  #
  # This method only works fine with the --model protein2dna parameter of
  # exonerate.
  #
  # NOTE(review): for a '-' target strand the target sequence is required even
  # when no alignment is rebuilt; without seqs the call on nil raises and is
  # reported by convert_parsed_lines after the hit has already been added.
  def hiting(features, tags, query)
    do_align = !@prot_seqs.nil? && !@seqs.nil?
    reverse_strand = features['target_strand'] == '-'
    start_target = features['target_start_align'] # unigene
    start_query = features['query_start_align'] # protein
    ends_target = features['target_end_align']
    ends_query = features['query_end_align'] - 1 # exonerate does not give 0-based end positions
    if reverse_strand
      start_target -= 1 # start target is end target when the match is on the reverse complementary strand
    else
      ends_target -= 1
    end
    hit = ExoBlastHit.new(start_target + 1, ends_target + 1, start_query + 1, ends_query + 1)
    define_hit_parameters(hit, features, tags)
    query.add_hit(hit)

    # Define alignment and blast like parameters
    target_alignment = ''
    query_alignment = ''
    counter_target = start_target
    counter_query = start_query
    if do_align # get seqs
      query_seq = @prot_seqs[features['query_id']]
      target_seq = @seqs[features['target_id']]
    end
    counter_target, target_seq = do_reverso_complementary(counter_target, target_seq) if reverse_strand
    query_frameshift = []
    target_frameshift = []
    gap_shift = 0
    tags.each_with_index do |tag, n_operation|
      if do_align
        gap_shift = 0 if tag[OPERATION] != 'G'
        query_alignment << query_seq[counter_query, tag[QUERY]]
        target_alignment << target_seq[counter_target, tag[TARGET]].translate
      end
      if tag[OPERATION] == 'F'
        if tag[TARGET] > 0 && tag[TARGET] < 3 # true frameshift
          gap_shift += 1
          # Some frameshifts are not followed by a gap operation; the gap is
          # inserted here. A frameshift that is the last operation has no
          # follower at all and is handled the same way.
          next_tag = tags[n_operation + 1]
          if next_tag.nil? || next_tag[OPERATION] != 'G'
            query_alignment << '-' if do_align
          end
        else
          query_alignment << '-' * (tag[TARGET] / 3.0).ceil if do_align
        end
        query_frameshift << counter_query
        fs_counter_target = counter_target
        # Workaround kept from the original: reverse hits should rather be
        # parsed by decreasing the counter instead of mirroring it here.
        fs_counter_target = target_seq.length - counter_target if reverse_strand
        if tag[TARGET] > 3
          real_fs = tag[TARGET] % 3
          real_gap = tag[TARGET] - real_fs
          fs = [fs_counter_target + real_gap, real_fs]
        else
          fs = [fs_counter_target, tag[TARGET]]
        end
        target_frameshift << fs
      elsif tag[OPERATION] == 'G'
        query_alignment << '-' * (tag[TARGET] / 3.0).ceil if do_align
        diff = tag[QUERY] - gap_shift
        target_alignment << '-' * diff if do_align && diff > 0
        gap_shift = 0
      end
      counter_query += tag[QUERY]
      counter_target += tag[TARGET]
    end
    hit.s_frameshift = query_frameshift
    hit.q_frameshift = target_frameshift

    return unless do_align

    hit.q_seq = target_alignment
    hit.s_seq = query_alignment
    hit.align_len = query_alignment.length
    hit.ident = set_ident(target_alignment, query_alignment)
  end

  # Mirrors counter_target against the length of target_seq and returns it
  # together with target_seq.complementary_dna.
  def do_reverso_complementary(counter_target, target_seq)
    counter_target = target_seq.length - 1 - counter_target
    return counter_target, target_seq.complementary_dna
  end

  # Percentage (two decimals) of positions of target_alignment whose character
  # equals the character at the same position of query_alignment.
  def set_ident(target_alignment, query_alignment)
    matchs = 0
    target_alignment.each_char.with_index do |char, position|
      matchs += 1 if char == query_alignment[position]
    end
    ('%.2f' % (matchs * 100.0 / target_alignment.length)).to_f
  end

  # Fills the BLAST-like fields of hit from the parsed line. gaps is the
  # number of 'G' operations; values exonerate does not provide are set to
  # 0, nil or ''.
  def define_hit_parameters(hit, features, tags)
    hit.gaps = tags.count { |aln| aln[0] == 'G' }
    hit.reversed = false
    hit.align_len = (features['query_end_align'] - features['query_start_align']).abs + 1
    hit.mismatches = 0
    hit.e_val = 0
    hit.bit_score = 0
    hit.score = features['score']
    hit.s_frame = nil
    strand = features['target_strand'] == '-' ? -1 : 1
    hit.q_frame = ((features['target_start_align'] % 3) + 1) * strand
    hit.subject_id = features['query_id']
    hit.full_subject_length = 0
    hit.definition = ''
    hit.acc = features['query_id']
    hit.q_seq = ''
    hit.s_seq = ''
  end

  # inspect results
  def inspect
    header = "Exonerate results:\n" + '-' * 20 + "\nQuerys: #{@querys.count}\n"
    @querys.reduce(header) { |text, q| text + q.inspect + "\n" }
  end

  # find query by name; returns nil when there is none
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # check if there are querys
  def empty?
    @querys.empty?
  end

  # get query count
  def size
    @querys.size
  end
end
|
@@ -1,688 +1,297 @@
|
|
1
|
-
|
1
|
+
require 'types'
|
2
2
|
require 'une_los_hit'
|
3
3
|
|
4
4
|
module FlAnalysis
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
#
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
|
16
|
-
|
6
|
+
$global_warnings = []
|
7
|
+
|
8
|
+
def analiza_orf_y_fl(seq, hit, options, db_name)
|
9
|
+
query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
|
10
|
+
if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
|
11
|
+
seq_unida = UneLosHit.new(hit, query_fasta)
|
12
|
+
full_prot = seq_unida.full_prot
|
13
|
+
query_fasta = seq_unida.output_seq # repaired fasta
|
14
|
+
final_hit = seq_unida.final_hit # single hit
|
15
|
+
$global_warnings += seq_unida.msgs # warning messages
|
16
|
+
else
|
17
|
+
query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
|
18
|
+
final_hit = hit.first # single hit
|
17
19
|
end
|
18
|
-
|
19
|
-
q=blast_query
|
20
|
-
msgs = ''
|
21
|
-
atg_status = ''
|
22
|
-
end_status = ''
|
23
|
-
final_status = ''
|
24
|
-
|
25
|
-
# the fasta sequence is saved
|
26
|
-
query_fasta = seq.seq_fasta
|
20
|
+
query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate]
|
27
21
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
if (seq.get_annotations(:tmp_annotation).empty?)
|
34
|
-
if (seq.sec_desc.empty?)
|
35
|
-
seq.annotate(:apply_tcode,'')
|
36
|
-
else
|
37
|
-
seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
|
38
|
-
end
|
39
|
-
else
|
40
|
-
save_last_db_annotations(seq)
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
return
|
45
|
-
end
|
46
|
-
#----------------------------------------------------------------------------------------------------------
|
47
|
-
warnings = ''
|
48
|
-
errors = ''
|
49
|
-
wrong_seq = false
|
22
|
+
full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
|
23
|
+
original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
|
24
|
+
seq.show_alignment(final_hit, query_fasta, show_nts) if $verbose > 2 ## VERBOSE
|
25
|
+
atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
|
26
|
+
end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)
|
50
27
|
|
51
|
-
|
52
|
-
if
|
53
|
-
|
54
|
-
seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)
|
55
|
-
|
56
|
-
wrong_seq = seq_unida.wrong_seq
|
57
|
-
is_ok = seq_unida.is_ok
|
58
|
-
q_index_start = seq_unida.q_index_start
|
59
|
-
full_prot = seq_unida.full_prot
|
60
|
-
|
61
|
-
query_fasta = seq_unida.output_seq # repaired fasta
|
62
|
-
|
63
|
-
final_hit = seq_unida.final_hit # single hit
|
64
|
-
msgs = seq_unida.msgs # warning messages
|
65
|
-
x_number = seq_unida.number_x # number of nucleotides used to fix frame errors
|
66
|
-
|
67
|
-
else # if there is only one hit
|
28
|
+
puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
|
29
|
+
seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if $verbose > 1 ## VERBOSE
|
30
|
+
puts "ATG: #{atg_status} STOP: #{end_status}" if $verbose > 2 ## VERBOSE
|
68
31
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
final_hit = q.hits[0] # single hit
|
75
|
-
x_number = 0 # number of nucleotides used to fix frame errors
|
76
|
-
|
77
|
-
full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
|
78
|
-
(is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
|
79
|
-
end
|
80
|
-
# test_final_hit(final_hit, query_fasta)
|
81
|
-
#----------------------------------------------------------------------------------------------------------
|
82
|
-
if wrong_seq
|
83
|
-
warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
|
84
|
-
# puts "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence"
|
85
|
-
errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#1\tcontains sense and antisense hits!!!, putative chimeric sequence, "
|
86
|
-
error_log(q, seq, warnings, db_name)
|
87
|
-
return
|
88
|
-
end
|
89
|
-
#----------------------------------------------------------------------------------------------------------
|
90
|
-
warnings += msgs
|
91
|
-
msgs = ''
|
92
|
-
#----------------------------------------------------------------------------------------------------------
|
93
|
-
if (x_number < 0)
|
94
|
-
warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
|
95
|
-
# puts "ERROR#2, unexpected negative index in x_number"
|
96
|
-
errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#2\tunexpected negative index in x_number, "
|
97
|
-
error_log(q, seq, warnings, db_name)
|
98
|
-
return
|
99
|
-
end
|
100
|
-
#----------------------------------------------------------------------------------------------------------
|
101
|
-
if (!is_ok)
|
102
|
-
warnings = "ERROR#3, very serious frame error, " + warnings
|
103
|
-
# puts "#{q.query_def} ERROR#3, hit was NOT found in the protein"
|
104
|
-
errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#3\thit was NOT found in the protein, "
|
105
|
-
# error_log(q, seq, warnings, db_name)
|
106
|
-
# return
|
107
|
-
end
|
108
|
-
#----------------------------------------------------------------------------------------------------------
|
109
|
-
fiable = false
|
110
|
-
if ((final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold))
|
111
|
-
fiable = true
|
32
|
+
# decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
|
33
|
+
type, status = determine_status(atg_status, end_status)
|
34
|
+
status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
|
35
|
+
if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
|
36
|
+
save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
|
112
37
|
end
|
113
|
-
|
114
|
-
if (final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i)
|
115
|
-
substring = full_prot[0, q_index_start + 10]
|
116
|
-
resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]
|
38
|
+
end
|
117
39
|
|
118
|
-
# to look for the beginning of the protein
|
119
|
-
(m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)
|
120
40
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
41
|
+
def set_start_codon(final_hit, distance, full_prot, query_fasta)
|
42
|
+
q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot)
|
43
|
+
atg_status = nil
|
44
|
+
_5prima = q_index_start + distance
|
45
|
+
|
46
|
+
if final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
|
47
|
+
atg_status = 'complete'
|
48
|
+
tmp_prot = full_prot[q_index_start..full_prot.length]
|
49
|
+
elsif _5prima >= final_hit.s_beg
|
50
|
+
amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
|
51
|
+
carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
|
52
|
+
length_before_cut = amine_seq.length
|
53
|
+
amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
|
54
|
+
tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
|
55
|
+
new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
|
56
|
+
modify_5p_align(new_q_beg, final_hit, query_fasta) if $verbose > 1 ## VERBOSE, Modify query align
|
57
|
+
final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
|
125
58
|
else
|
126
|
-
|
127
|
-
# puts "beginning too short!"
|
128
|
-
# end
|
129
|
-
|
59
|
+
$global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
|
130
60
|
atg_status = 'incomplete'
|
131
|
-
substring = full_prot[0, q_index_start]
|
132
|
-
distance_s_atg = (final_hit.s_beg.to_i - final_hit.q_beg/3) + 1
|
133
|
-
|
134
|
-
if (substring.rindex('*'))
|
135
|
-
warnings += "Unexpected stop codon in the beginning of your sequence, "
|
136
|
-
# if (@verbose)
|
137
|
-
# puts "#{db_name} -- #{q.query_def} --> Unexpected stop codon in the beginning of your sequence"
|
138
|
-
# end
|
139
|
-
end
|
140
|
-
|
141
|
-
final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
|
142
61
|
tmp_prot = full_prot
|
143
62
|
end
|
144
|
-
#----------------------------------------------------------------------------------------------------------
|
145
|
-
# look for the end of the protein
|
146
|
-
(resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
|
147
|
-
#----------------------------------------------------------------------------------------------------------
|
148
|
-
final_prot = "#{resto_substring}#{end_substring}"
|
149
|
-
|
150
|
-
warnings += msgs
|
151
|
-
|
152
|
-
# to get the value of the end_ORF index
|
153
|
-
if (atg_status == 'complete')
|
154
|
-
final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
|
155
|
-
else
|
156
|
-
if (putative_end)
|
157
|
-
final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
#--------------------------------------------------------------------------------------------------------------
|
162
|
-
# decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
|
163
|
-
final_status = determine_status(atg_status,end_status)
|
164
|
-
#----------------------------------------------------------------------------------------------------------
|
165
|
-
if (final_prot.length - 2*aas_n_end > final_hit.full_subject_length)
|
166
|
-
warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
|
167
63
|
|
168
|
-
|
169
|
-
warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
|
170
|
-
if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
|
171
|
-
|
172
|
-
if (final_status == 'Complete')
|
173
|
-
final_status = 'Putative Complete'
|
174
|
-
warnings += ". Was predicted as Complete, but is very much shorter than de subject"
|
175
|
-
# if (@verbose)
|
176
|
-
# puts "#{db_name} -- #{q.query_def} --> your sequence is 100 aas shorter than the subject or shorter than the half length of the subject"
|
177
|
-
# end
|
178
|
-
end
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
# test_final_hit(final_hit, query_fasta)
|
183
|
-
print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
|
184
|
-
|
185
|
-
end
|
186
|
-
|
187
|
-
|
188
|
-
def test_blast_hits(q)
|
189
|
-
|
190
|
-
puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"
|
191
|
-
|
192
|
-
q.hits.each do |h|
|
193
|
-
puts "\t subject_id: #{h.acc}"
|
194
|
-
puts "\t acc: #{h.acc}"
|
195
|
-
puts "\t full_subject_length: #{h.full_subject_length}"
|
196
|
-
puts "\t q_beg: #{h.q_beg + 1}"
|
197
|
-
puts "\t q_end: #{h.q_end + 1}"
|
198
|
-
puts "\t q_frame: #{h.q_frame}"
|
199
|
-
puts "\t s_beg: #{h.s_beg + 1}"
|
200
|
-
puts "\t s_end: #{h.s_end + 1}"
|
201
|
-
puts "\t s_frame: #{h.s_frame}"
|
202
|
-
puts "\t align_len: #{h.align_len}"
|
203
|
-
puts "\t gaps: #{h.gaps}"
|
204
|
-
puts "\t mismatches: #{h.mismatches}"
|
205
|
-
puts "\t reversed: #{h.reversed}"
|
206
|
-
puts "\t score: #{h.score}"
|
207
|
-
puts "\t bit_score: #{h.bit_score}"
|
208
|
-
puts "\t ident: #{h.ident}"
|
209
|
-
puts "\t e_val: #{h.e_val}"
|
210
|
-
puts "\t definition: #{h.definition}"
|
211
|
-
puts "\t q_seq: #{h.q_seq}"
|
212
|
-
puts "\t s_seq: #{h.s_seq}"
|
213
|
-
|
214
|
-
end
|
215
|
-
|
216
|
-
end
|
217
|
-
|
218
|
-
|
219
|
-
def test_final_hit(final_hit, query_fasta)
|
220
|
-
|
221
|
-
puts "\t acc: #{final_hit.acc}"
|
222
|
-
puts "\t full_subject_length: #{final_hit.full_subject_length}"
|
223
|
-
|
224
|
-
puts "\n\t q_frame: #{final_hit.q_frame}"
|
225
|
-
puts "\t reversed: #{final_hit.reversed}"
|
226
|
-
|
227
|
-
puts "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}"
|
228
|
-
puts "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}"
|
229
|
-
|
230
|
-
puts "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}"
|
231
|
-
|
232
|
-
puts "\n\t definition: #{final_hit.definition}"
|
233
|
-
puts "\t q_seq: #{final_hit.q_seq}"
|
234
|
-
puts "\t s_seq: #{final_hit.s_seq}"
|
235
|
-
|
236
|
-
puts "\nnt q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end]}"
|
237
|
-
puts "\n\nprot q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end].translate}"
|
238
|
-
|
64
|
+
return atg_status, tmp_prot
|
239
65
|
end
|
240
66
|
|
241
67
|
|
242
|
-
def
|
243
|
-
#
|
244
|
-
|
245
|
-
if
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
else
|
253
|
-
seq.annotate(:apply_tcode,'')
|
254
|
-
end
|
255
|
-
else
|
256
|
-
warnings = "Coding sequence with some errors, #{warnings}"
|
257
|
-
tmp_annot = seq.sec_desc.sub('my_warning',"#{warnings}")
|
258
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
259
|
-
end
|
260
|
-
else
|
261
|
-
save_last_db_annotations(seq)
|
262
|
-
end
|
263
|
-
else
|
264
|
-
if (seq.sec_desc.empty?)
|
265
|
-
if (!q.hits[0].definition.nil?)
|
266
|
-
warnings = "Coding sequence with some errors, #{warnings}"
|
267
|
-
seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
|
268
|
-
end
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
end
|
273
|
-
|
274
|
-
|
275
|
-
def save_last_db_annotations(seq)
|
276
|
-
|
277
|
-
# puts "sequence not complete! recovering annotations from previous database! sldba!!"
|
278
|
-
(q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
|
279
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
280
|
-
|
281
|
-
(name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
|
282
|
-
if (final_hit.reversed)
|
283
|
-
(kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
|
284
|
-
end
|
285
|
-
|
286
|
-
seq.annotate(:protein,seq.get_annotations(:tmp_annotation).first[:message][1])
|
287
|
-
seq.annotate(:alignment,seq.get_annotations(:tmp_annotation).first[:message][2])
|
288
|
-
tmp_annot = "#{name}\t#{fasta_length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
|
289
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
290
|
-
|
291
|
-
end
|
292
|
-
|
293
|
-
|
294
|
-
def find_start(subject_start, substring, fiable, aas_n_end)
|
295
|
-
|
296
|
-
tmp_prot = ''
|
297
|
-
msgs = ''
|
298
|
-
atg_status = 'incomplete' # complete, incomplete or putative
|
299
|
-
|
300
|
-
# puts "\nsubstring (#{substring.length} aas):\n#{substring}"
|
301
|
-
stop_codon = substring.rindex('*')
|
302
|
-
|
303
|
-
# marcamos la distancia al s_beg desde el principio del substring
|
304
|
-
# s_beg_distance = (substring.length) - subject_start
|
305
|
-
s_beg_distance = (substring.length - 10) - subject_start
|
306
|
-
# marcamos la distancia al s_beg desde el final del substring
|
307
|
-
atg_distance = (subject_start + 1) - (substring.length - 10)
|
308
|
-
if (atg_distance <= 0)
|
309
|
-
atg_distance = 0
|
310
|
-
else
|
311
|
-
# puts "expected atg_distance = 0, your sequence atg_distance = #{atg_distance}; limit (1-15)"
|
312
|
-
msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, "
|
313
|
-
end
|
314
|
-
|
315
|
-
# puts "s_beg_distance:#{s_beg_distance}, stop_codon: #{stop_codon}, subject_start: #{subject_start + 1}, atg_distance: #{atg_distance}"
|
316
|
-
#----------------------------------------------------------------------------------------------------------
|
317
|
-
# tenemos un codon de parada en el substring 5 prima
|
318
|
-
if (stop_codon)
|
319
|
-
stop_codon += 1
|
320
|
-
# ahora vamos a ver si el stop esta antes o despues del s_beg
|
321
|
-
if (stop_codon <= s_beg_distance) # esta antes
|
322
|
-
substring = substring[stop_codon, substring.length - stop_codon]
|
323
|
-
# puts "\nhay un codon de parada en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon +1}\n#{substring}\n\n"
|
324
|
-
|
325
|
-
first_m = substring.index('M')
|
326
|
-
|
327
|
-
if (first_m) # tenemos M y stop ---------------------------------------------------------------------------
|
328
|
-
substring = substring[first_m, substring.length - first_m]
|
329
|
-
|
68
|
+
def find_start(subject_start, amine_seq, distance)
|
69
|
+
atg_status = 'putative' # complete, incomplete or putative
|
70
|
+
stop_codon = amine_seq.rindex('*')
|
71
|
+
if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
|
72
|
+
_5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
|
73
|
+
amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
|
74
|
+
first_m = amine_seq.index('M')
|
75
|
+
if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
|
76
|
+
if first_m # tenemos M
|
77
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1]
|
330
78
|
atg_status = 'complete'
|
331
|
-
else # con STOP pero sin M
|
332
|
-
|
333
|
-
# puts "there is not a start codon near the expected beginning of your sequence, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
|
334
|
-
msgs += "W1: There is no M at the beginning, "
|
79
|
+
else # con STOP pero sin M
|
80
|
+
$global_warnings << 'noM1'
|
335
81
|
end
|
336
|
-
#----------------------------------------------------------------------------------------------------------
|
337
82
|
else # esta despues, un cambio de fase impide analizar el principio
|
338
|
-
|
339
|
-
|
340
|
-
if (first_m) # tenemos M y unexpected stop # comentar?
|
341
|
-
substring = substring[first_m, substring.length - first_m] # comentar?
|
342
|
-
end # comentar?
|
343
|
-
# TODO esto se puede cambiar!
|
344
|
-
atg_status = 'putative'
|
345
|
-
msgs += " Unexpected STOP codon in 5 prime region, "
|
346
|
-
# puts "\nhay un codon de parada inesperado en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon}, s_beg_distance: #{s_beg_distance +1}, atg_distance: #{atg_distance}"
|
83
|
+
$global_warnings << 'UnexpSTOP5p'
|
84
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
|
347
85
|
end
|
348
|
-
#---------------------------------------------------------------------------------------------------------------
|
349
86
|
else # no hay stop codon
|
350
|
-
first_m =
|
351
|
-
if
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
if (m_distance > aas_n_end*2) # sin STOP, con atg pero muy lejos del inicio que marca el subject ---------------
|
357
|
-
# puts "No stop codon before M and M found is too far from subject M, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
|
358
|
-
msgs += "No stop codon before M and M found is too far from subject M, "
|
87
|
+
first_m = amine_seq.index('M')
|
88
|
+
if first_m # tenemos M
|
89
|
+
amine_seq = amine_seq[first_m .. amine_seq.length - 1]
|
90
|
+
m_distance = (subject_start - amine_seq.length).abs - 10
|
91
|
+
if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
|
92
|
+
$global_warnings << 'NoStopMfar'
|
359
93
|
atg_status = 'incomplete'
|
360
|
-
else
|
361
|
-
|
362
|
-
# msgs += "No stop codon before M but high homology subject, "
|
363
|
-
atg_status = 'complete'
|
364
|
-
else # Tenemos M pero no tenemos stop y el ortologo no es fiable -------------------------------------------
|
365
|
-
# puts "No stop codon before M and low homology subject, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
|
366
|
-
msgs += "No stop codon before M and low homology subject, "
|
367
|
-
atg_status = 'putative'
|
368
|
-
end
|
94
|
+
else # Tenemos M
|
95
|
+
atg_status = 'complete'
|
369
96
|
end
|
370
|
-
else # sin M
|
371
|
-
|
372
|
-
# puts "your sequence has the subject beginning but there is not start codon at the beginning, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
|
373
|
-
msgs += "W2: There is no M at the beginning, "
|
97
|
+
else # sin M
|
98
|
+
$global_warnings << 'noM2'
|
374
99
|
end
|
375
100
|
end
|
376
|
-
|
377
|
-
return [substring, atg_status, msgs]
|
378
|
-
|
101
|
+
return amine_seq, atg_status
|
379
102
|
end
|
380
103
|
|
381
104
|
|
382
|
-
def find_end(final_hit,
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
# si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
|
395
|
-
if (sq_end_distance + aas_n_end < 0)
|
105
|
+
def find_end(final_hit, max_distance, tmp_prot, query_fasta)
|
106
|
+
frame_shift = check_frame_shift(final_hit)
|
107
|
+
beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
|
108
|
+
atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
|
109
|
+
end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
|
110
|
+
#puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3 - max_distance: #{max_distance}\e[0m"
|
111
|
+
#puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
|
112
|
+
#puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
|
113
|
+
if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene
|
114
|
+
atg_substring = tmp_prot
|
115
|
+
end_substring = ''
|
396
116
|
end_status = 'incomplete'
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
# if (@verbose)
|
406
|
-
# puts "#{db_name} -- #{q.query_def} --> Distance to subject end: #{sq_end_distance.abs} aas"
|
407
|
-
# end
|
408
|
-
end
|
117
|
+
else
|
118
|
+
end_status = 'putative'
|
119
|
+
putative_end = end_substring.index('*')
|
120
|
+
end_substring = end_substring[0 .. putative_end] if putative_end
|
121
|
+
|
122
|
+
s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
|
123
|
+
q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo
|
124
|
+
sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5'
|
409
125
|
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
#
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
126
|
+
if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment
|
127
|
+
end_status = 'complete'
|
128
|
+
elsif sq_end_distance < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
|
129
|
+
end_status = 'incomplete'
|
130
|
+
if putative_end
|
131
|
+
$global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
|
132
|
+
else
|
133
|
+
$global_warnings << ['DistSubj', sq_end_distance.abs]
|
134
|
+
end
|
135
|
+
else # tenemos suficiente secuencia
|
136
|
+
if putative_end # tenemos un stop
|
137
|
+
#beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
|
138
|
+
stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
|
139
|
+
if stop_q_s.abs <= max_distance #Stop codon is in search region
|
140
|
+
end_status = 'complete'
|
141
|
+
elsif stop_q_s < 0
|
142
|
+
$global_warnings << 'UnexpSTOP3p'
|
143
|
+
elsif stop_q_s > 0
|
144
|
+
end_status = 'complete'
|
145
|
+
$global_warnings << 'QueryTooLong'
|
146
|
+
end
|
147
|
+
else # no tenemos codon de parada pero tenemos suficiente secuencia
|
148
|
+
end_status = 'incomplete'
|
149
|
+
$global_warnings << 'ProtFusion'
|
431
150
|
end
|
432
|
-
end_substring = end_substring[0, putative_end+1]
|
433
|
-
|
434
|
-
else # no tenemos codon de parada pero tenemos suficiente secuencia
|
435
|
-
end_status = 'putative'
|
436
|
-
warnings += " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
|
437
|
-
# if (@verbose)
|
438
|
-
# puts "#{db_name} -- #{q.query_def} --> STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas"
|
439
|
-
# end
|
440
151
|
end
|
441
|
-
|
442
152
|
end
|
443
|
-
|
444
|
-
|
153
|
+
final_prot = atg_substring + end_substring
|
154
|
+
end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
|
155
|
+
new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
|
156
|
+
modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if $verbose > 1
|
157
|
+
final_hit.q_end = new_q_end
|
158
|
+
return end_status, final_prot
|
445
159
|
end
|
446
160
|
|
447
161
|
|
448
|
-
def determine_status(atg_status,end_status)
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
final_status = 'N-terminus'
|
458
|
-
elsif (atg_status == 'putative') && (end_status == 'incomplete') # puede que tengamos el principio de la proteina
|
459
|
-
final_status = 'Putative N-terminus'
|
460
|
-
elsif (atg_status == 'incomplete') && (end_status == 'complete') # tenemos el final de la proteina
|
461
|
-
final_status = 'C-terminus'
|
462
|
-
elsif (atg_status == 'incomplete') && (end_status == 'putative') # puede que tengamos el final de la proteina
|
463
|
-
final_status = 'Putative C-terminus'
|
162
|
+
def determine_status(atg_status, end_status)
|
163
|
+
if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
|
164
|
+
type = COMPLETE
|
165
|
+
elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
|
166
|
+
type = INTERNAL
|
167
|
+
elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
|
168
|
+
type = N_TERMINAL
|
169
|
+
elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
|
170
|
+
type = C_TERMINAL
|
464
171
|
end
|
465
172
|
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
|
471
|
-
name_diff = q.query_def.length - final_hit.acc.length
|
472
|
-
if (name_diff > 0)
|
473
|
-
spnum = ' '*name_diff.to_i
|
173
|
+
if atg_status == 'putative' || end_status == 'putative'
|
174
|
+
status = FALSE # Putative
|
474
175
|
else
|
475
|
-
|
176
|
+
status = TRUE # Sure
|
476
177
|
end
|
477
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
478
|
-
# if the sequence is Complete will be printed --------------------------------------------------------------------
|
479
|
-
if (final_status == 'Complete')
|
480
|
-
seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
|
481
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
482
|
-
|
483
|
-
if (final_hit.reversed)
|
484
|
-
(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
|
485
|
-
end
|
486
|
-
seq.annotate(:complete,"#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}")
|
487
|
-
seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
|
488
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
489
|
-
else # la proteina no esta completa -------------------------------------------------------------------------
|
490
|
-
if (!seq.get_annotations(:tmp_annotation).empty?) && (!seq.get_annotations(:tmp_annotation).nil?) # ---> trae informacion de una bd anterior
|
491
|
-
if (db_name =~/^tr_/) # ---> estamos usando el trembl, se dejan las anotaciones que trae
|
492
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: sequence not complete! recovering annotations from previous database!"
|
493
|
-
(kk1, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
|
494
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
495
178
|
|
496
|
-
|
497
|
-
|
498
|
-
(kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
|
499
|
-
end
|
500
|
-
|
501
|
-
my_prot = seq.get_annotations(:tmp_annotation).first[:message][1]
|
502
|
-
seq.annotate(:protein,my_prot)
|
503
|
-
my_align = seq.get_annotations(:tmp_annotation).first[:message][2]
|
504
|
-
seq.annotate(:alignment,my_align)
|
505
|
-
|
506
|
-
tmp_annot = "#{name}\t#{query_fasta.length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
|
507
|
-
seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
|
508
|
-
#-----------------------------------------------------------------------------------------------------------------------------
|
509
|
-
# elsif (db_name =~ /^sp_/) # ---> estamos usando el sp, se dejan las anotaciones que trae
|
510
|
-
|
511
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: Mantenemos las anotaciones de la BD de usuario y pasamos la secuencia al trembl"
|
512
|
-
end
|
513
|
-
#-------------------------------------------------------------------------------------------------------------------------------------
|
514
|
-
elsif (seq.get_annotations(:tmp_annotation).empty?) # ---> NO trae informacion de una bd anterior
|
515
|
-
if (db_name =~ /^tr_/) # ---> estamos usando el trembl
|
516
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: #{q.query_def} is not complete!! se anota con trembl"
|
517
|
-
print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
|
179
|
+
return type, status
|
180
|
+
end
|
518
181
|
|
519
|
-
if (final_hit.reversed)
|
520
|
-
(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
|
521
|
-
end
|
522
182
|
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
|
533
|
-
seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
|
534
|
-
|
535
|
-
# puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
|
536
|
-
# puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD"
|
183
|
+
def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
|
184
|
+
if final_prot.length - 2 * distance > final_hit.s_len
|
185
|
+
$global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
|
186
|
+
elsif final_prot.length + 2 * distance < final_hit.s_len
|
187
|
+
$global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
|
188
|
+
if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len
|
189
|
+
if type == COMPLETE
|
190
|
+
status = FALSE
|
191
|
+
$global_warnings << 'VeryShorter'
|
537
192
|
end
|
538
193
|
end
|
539
194
|
end
|
195
|
+
return status
|
540
196
|
end
|
541
197
|
|
542
198
|
|
543
|
-
def
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
# puts "empieza en el borde de la seq"
|
199
|
+
def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
|
200
|
+
# if the sequence is Complete or it hasn't previous info will be saved
|
201
|
+
if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
|
202
|
+
seq.type = type
|
203
|
+
seq.status = status
|
204
|
+
seq.db_name = db_name
|
205
|
+
seq.seq_fasta = query_fasta
|
206
|
+
seq.seq_aa = final_prot
|
207
|
+
seq.hit = final_hit
|
208
|
+
seq.warnings($global_warnings)
|
209
|
+
$global_warnings = [] # Clean all warnings for current sequence
|
210
|
+
seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
|
211
|
+
if type == COMPLETE
|
212
|
+
seq.ignore = TRUE
|
558
213
|
end
|
559
|
-
|
560
|
-
atg_found = my_seq_n.index(/ATG/i)
|
561
|
-
atg_found_rv = my_seq_n.rindex(/ATG/i)
|
562
|
-
my_atg_index = nil
|
563
214
|
end
|
564
|
-
|
565
|
-
|
566
|
-
if (beg5)
|
567
|
-
|
568
|
-
my_seq_n.sub!(/ATG/i,'_-_ATG')
|
569
|
-
my_atg_index = atg_found
|
570
|
-
my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1]
|
571
|
-
|
572
|
-
elsif (atg_found == atg_found_rv)
|
573
|
-
|
574
|
-
my_seq_n.sub!(/ATG/i,'_-_ATG')
|
575
|
-
my_atg_index = final_hit.q_beg - 5 + atg_found
|
576
|
-
|
577
|
-
my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
|
578
|
-
|
579
|
-
# puts "my_seq despues de encontrar el atg: #{my_seq}"
|
580
|
-
elsif (atg_found == 5) || (atg_found_rv == 5)
|
581
|
-
|
582
|
-
my_seq_n = my_seq_n[0..4]+'_-_'+my_seq_n[5..10]
|
583
|
-
my_atg_index = final_hit.q_beg - 5 + atg_found
|
584
|
-
my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
|
585
|
-
|
586
|
-
else
|
587
|
-
|
588
|
-
# puts "#{q.query_def} tiene mas de un ATG my_seq_n: #{my_seq_n}"
|
589
|
-
bad_atg = true
|
590
|
-
my_seq = query_fasta
|
591
|
-
end
|
592
|
-
|
593
|
-
else
|
594
|
-
|
595
|
-
bad_atg = true
|
596
|
-
# puts "#{q.query_def} NO TIENE ATG my_seq_n: #{my_seq_n}"
|
597
|
-
my_seq = query_fasta
|
598
|
-
|
215
|
+
if $verbose > 2
|
216
|
+
puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
|
599
217
|
end
|
600
|
-
|
601
|
-
stop_c = nil
|
602
|
-
if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
|
218
|
+
end
|
603
219
|
|
604
|
-
if (bad_atg == true)
|
605
|
-
stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
|
606
|
-
stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
|
607
|
-
else
|
608
|
-
stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
|
609
|
-
stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
|
610
|
-
end
|
611
220
|
|
221
|
+
def mark_nt_seqs(final_hit, query_fasta)
|
222
|
+
atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
|
223
|
+
mark_atg = nil
|
224
|
+
if atg == 'ATG'
|
225
|
+
mark_atg = '_-_'
|
612
226
|
end
|
227
|
+
stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
|
228
|
+
mark_stop = nil
|
229
|
+
if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
|
230
|
+
mark_stop = '___'
|
231
|
+
end
|
232
|
+
seq5p = query_fasta[0..final_hit.q_beg-1]
|
233
|
+
orf = query_fasta[final_hit.q_beg..final_hit.q_end]
|
234
|
+
seq3p = query_fasta[final_hit.q_end..query_fasta.length]
|
235
|
+
nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
|
236
|
+
return nt_seq
|
237
|
+
end
|
613
238
|
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
if
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
my_prot = my_prot.sub(/___\w+/,'')
|
627
|
-
my_prot = my_prot.translate
|
628
|
-
my_prot = my_prot.sub(/x$/,'')
|
629
|
-
|
630
|
-
simliar_fragment = final_prot.lcs(my_prot)
|
631
|
-
|
632
|
-
if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length)
|
633
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
|
634
|
-
else
|
635
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
636
|
-
# puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors"
|
239
|
+
def exonerate_fix_frame_shift(query_fasta, hit)
|
240
|
+
frame_shifts = []
|
241
|
+
added_nts = 0
|
242
|
+
hit.each_with_index do |hsp, num|
|
243
|
+
if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
|
244
|
+
if !hsp.q_frameshift.empty? #There is frameshift
|
245
|
+
hsp.q_frameshift.each do |position, num_nts|
|
246
|
+
local_add = 3 - num_nts
|
247
|
+
fs_final_position = position + num_nts
|
248
|
+
$global_warnings << ['ExFrameS', fs_final_position]
|
249
|
+
frame_shifts << [fs_final_position, local_add]
|
250
|
+
added_nts += local_add
|
637
251
|
end
|
638
|
-
|
639
252
|
end
|
253
|
+
end
|
254
|
+
hsp.q_beg += added_nts if num > 0
|
255
|
+
hsp.q_end += added_nts
|
256
|
+
end
|
257
|
+
add = 0
|
258
|
+
frame_shifts.each do |position, num_nts|
|
259
|
+
query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
|
260
|
+
add += num_nts
|
261
|
+
end
|
262
|
+
return query_fasta
|
263
|
+
end
|
640
264
|
|
641
|
-
else
|
642
|
-
if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
|
643
265
|
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
end
|
651
|
-
|
652
|
-
if (!stop_c.nil?)
|
653
|
-
if (stop_c.translate == '*')
|
654
|
-
final_hit.q_end = final_hit.q_end + 3
|
655
|
-
if (bad_atg == true)
|
656
|
-
my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
|
657
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
658
|
-
else
|
659
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
660
|
-
end
|
661
|
-
else
|
662
|
-
if (bad_atg == true)
|
663
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
664
|
-
# puts "find nt end: NO ATG, NO exact STOP"
|
665
|
-
else
|
666
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
|
667
|
-
# puts "find nt end: GOOD ATG, NO exact STOP"
|
668
|
-
end
|
669
|
-
end
|
670
|
-
end
|
671
|
-
end
|
672
|
-
|
266
|
+
## VERBOSE METHODS
|
267
|
+
def show_nts
|
268
|
+
show = FALSE
|
269
|
+
show = TRUE if $verbose && $verbose > 3
|
270
|
+
return show
|
271
|
+
end
|
673
272
|
|
674
|
-
end
|
675
273
|
|
676
|
-
|
274
|
+
def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
|
275
|
+
if new_q_end > final_hit.q_end #There is an align extension
|
276
|
+
extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
|
277
|
+
final_hit.q_seq = final_hit.q_seq + extend_align
|
278
|
+
elsif new_q_end < final_hit.q_end #The align is cutted
|
279
|
+
upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
|
280
|
+
final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
|
281
|
+
end
|
282
|
+
end
|
677
283
|
|
678
|
-
if (bad_atg == true)
|
679
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
|
680
|
-
else
|
681
|
-
seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
|
682
|
-
end
|
683
284
|
|
285
|
+
def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
|
286
|
+
if new_q_beg < final_hit.q_beg #There is an align extension
|
287
|
+
extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
|
288
|
+
final_hit.q_seq = extend_align + final_hit.q_seq
|
289
|
+
elsif new_q_beg > final_hit.q_beg #The align is cut
|
290
|
+
seq_cut = (new_q_beg - final_hit.q_beg)/3
|
291
|
+
gaps = final_hit.q_seq[0..seq_cut].count('-')
|
292
|
+
seq_cut += gaps
|
293
|
+
final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
|
684
294
|
end
|
685
|
-
|
686
295
|
end
|
687
296
|
|
688
297
|
end
|