full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,78 +1,116 @@
1
1
 
2
2
  module CommonFunctions
3
3
 
4
- def contenidos_en_prot(hit, full_prot, q)
5
-
6
- is_ok = false
7
- q_index_start = 9999
8
- fr_index_start = 0
9
- min_index_start = 9999
10
- aas_parecidos = 0
11
- masked_x = 0
12
- suma_fragments = 0
13
-
14
- masked_x = hit.q_seq.count('X')
15
- masked_x = masked_x + hit.q_seq.count('-')
16
-
17
- full_prot = full_prot.gsub(/[\-Xx]+/,'')
18
- compare_prot = hit.q_seq.gsub(/[\-Xx]+/,'-')
19
- fragments_array = compare_prot.split(/\-+/)
20
-
21
- fragments_array.each do |seq|
22
- # puts "seq: #{seq}\nfull_prot: #{full_prot}"
23
- simliar_fragment = full_prot.lcs(seq)
24
- suma_fragments += simliar_fragment.length
25
-
26
- fr_index_start = full_prot.index(simliar_fragment)
27
-
28
- if (q_index_start == 9999)
29
- q_index_start = fr_index_start
4
# Locates where the aligned query fragment (key_seq) starts inside the
# translated protein (full_prot).
#
# Both sequences are normalised first: every '-', 'X' or 'x' becomes '-'.
# The search then falls back through progressively looser strategies:
#   1. exact substring match;
#   2. match_with_ungapped_reference (anchor on a gap-free fragment of key_seq);
#   3. only when the reference itself contains '-': a gap-tolerant scan from
#      position 0, then a scan starting at the length difference (this second
#      result wins when both succeed), then match_with_gapped_reference.
#
# key_seq   - String, aligned query sequence of the hit.
# full_prot - String, translation of the whole sequence.
#
# Returns the Integer start index, or 0 when nothing matched.
def contenidos_en_prot(key_seq, full_prot)
  reference = full_prot.gsub(/[\-Xx]/,'-')
  target = key_seq.gsub(/[\-Xx]/,'-')

  exact = reference.index(target) # full match between the hit query sequence and the protein
  return exact unless exact.nil?

  # Gaps prevent the full match: try anchoring on fragments instead.
  start = match_with_ungapped_reference(reference, target)
  if start.nil? && reference.include?('-')
    ref_chars = reference.split('')
    target_chars = target.split('')
    tail_offset = reference.length - target.length

    start = 0 if scan_sequences(ref_chars, target_chars) == target.length
    if tail_offset > 0 && scan_sequences(ref_chars, target_chars, tail_offset) == target.length
      start = tail_offset
    end
    start = match_with_gapped_reference(reference, target) if start.nil?
  end

  start.nil? ? 0 : start
end
59
31
 
60
32
 
33
# Finds the start of compare_prot inside a reference that contains gaps.
#
# The reference is cut at every run of '-'. The first fragment longer than
# four residues that can be found in compare_prot (whole, or failing that
# its first five residues) is used as the anchor. When the anchor is not
# the first fragment, the match is extended backwards with extend_match.
#
# full_prot    - String, gapped reference protein.
# compare_prot - String, sequence to locate.
#
# Returns the Integer start index in full_prot, or nil when no fragment anchors.
def match_with_gapped_reference(full_prot, compare_prot)
  full_prot.split(/\-+/).each_with_index do |fragment, position|
    next unless fragment.length > 4

    probe = fragment
    hit_at = compare_prot.index(probe)
    if hit_at.nil? # the whole fragment does not match because of gaps: retry with its head
      probe = fragment[0..4]
      hit_at = compare_prot.index(probe)
    end
    next if hit_at.nil?

    ref_start = full_prot.index(probe)
    return ref_start if position.zero?

    extended_start, _compare_start = extend_match(full_prot, compare_prot, ref_start, hit_at)
    return extended_start
  end
  nil
end
61
54
 
55
# Extends an anchored match towards the N-terminus.
#
# The residues that precede the anchor in each sequence are compared
# backwards (closest residue first) with scan_sequences; both start
# indices are moved back by the number of residues that agree.
#
# full_prot          - String, reference protein.
# compare_prot       - String, sequence being located.
# q_index_start      - Integer, anchor position in full_prot.
# compare_prot_index - Integer, anchor position in compare_prot.
#
# Returns [q_index_start, compare_prot_index] after the extension.
def extend_match(full_prot, compare_prot, q_index_start, compare_prot_index)
  # Use (start, length) slicing: with an index of 0 the former
  # `str[0..index-1]` became `str[0..-1]`, i.e. the WHOLE string instead of
  # an empty prefix, which produced bogus extensions and negative indices.
  full_prot_prefix = full_prot[0, q_index_start].reverse.split('')
  compare_prot_prefix = compare_prot[0, compare_prot_index].reverse.split('')
  extension = scan_sequences(full_prot_prefix, compare_prot_prefix)
  return q_index_start - extension, compare_prot_index - extension
end
62
63
 
63
- def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
64
-
65
- q_frame = -h_qframe.to_i
66
-
67
- q_beg = query_fasta.length - h_qend - 1
68
- q_end = query_fasta.length - h_qstart - 1
64
# Counts how many consecutive positions of compare_seq agree with ref_seq.
#
# Reference positions below +diff+ are skipped; from there on the reference
# is walked in parallel with compare_seq (always starting at its first
# element). Two positions agree when the characters are equal or when
# either of them is a gap ('-'). The walk stops at the first disagreement
# or when compare_seq is exhausted.
#
# ref_seq     - Array of one-character Strings, or a String.
# compare_seq - Array of one-character Strings, or a String.
# diff        - Integer offset in ref_seq where the comparison starts (default 0).
#
# Returns the Integer number of agreeing positions.
def scan_sequences(ref_seq, compare_seq, diff = 0)
  # Callers pass character arrays; plain strings are accepted as well.
  ref_chars = ref_seq.is_a?(String) ? ref_seq.split('') : ref_seq
  compare_chars = compare_seq.is_a?(String) ? compare_seq.split('') : compare_seq

  matched = 0
  ref_chars.each_with_index do |char, i|
    next if i < diff
    compare_char = compare_chars[matched]
    break if compare_char.nil? # compare sequence exhausted
    break if char != compare_char && char != '-' && compare_char != '-'
    matched += 1
  end
  matched
end
69
77
 
70
- query_fasta = query_fasta.complementary_dna
78
# Locates compare_prot in full_prot by anchoring on a gap-free fragment.
#
# compare_prot is cut at every run of '-'. Only the FIRST fragment longer
# than four residues is tried: it is searched in full_prot and, when it is
# not the leading fragment, the position found is corrected with
# refine_match to account for the residues that precede it.
#
# full_prot    - String, reference protein.
# compare_prot - String, gapped sequence to locate.
#
# Returns the Integer start index, or nil when there is no usable fragment
# or the fragment is not present in full_prot.
def match_with_ungapped_reference(full_prot, compare_prot)
  fragments = compare_prot.split(/\-+/)
  position = fragments.index { |fragment| fragment.length > 4 }
  return nil if position.nil?

  anchor = fragments[position]
  found_at = full_prot.index(anchor)
  return found_at if found_at.nil? || position.zero?

  # Correction needed because the leading fragment(s) were too short to anchor on.
  refine_match(anchor, compare_prot, found_at)
end
71
92
 
72
- # el qend y el qstart estan al reves porque cuando la seq tiene frame negativo el blast los pone al reves
73
- return [query_fasta, q_frame, q_beg, q_end]
93
# Moves an anchor position back to where the whole sequence would start.
#
# subseq was found at q_index_start in the reference; inside seq it is
# preceded by some residues and some gap characters ('-'). The start is
# shifted back by the number of preceding residues (gaps do not count).
#
# subseq        - String, fragment of seq used as anchor.
# seq           - String, gapped sequence that contains subseq.
# q_index_start - Integer, position of subseq in the reference.
#
# Returns the corrected Integer start. It is never negative: a negative
# value would later be read as "count from the end" by String slicing, so
# it is clamped to 0. When subseq is not part of seq the position is
# returned unchanged.
def refine_match(subseq, seq, q_index_start)
  location_seq = seq.index(subseq)
  return q_index_start if location_seq.nil?

  gaps_on_location = seq[0, location_seq].count('-')
  residues_before = location_seq - gaps_on_location
  [q_index_start - residues_before, 0].max
end
75
99
 
100
# Turns a hit found on the antisense strand into a sense-strand hit.
#
# The hit is modified in place: the frame sign is inverted, q_beg/q_end are
# mirrored against the last index of the sequence and the hit is flagged
# as reversed. For ExoBlastHit objects the positions stored in
# q_frameshift ([position, num_nts] pairs) are mirrored as well.
#
# query_fasta - String with the nucleotide sequence.
# hit         - hit object exposing q_frame, q_beg, q_end and reversed.
#
# Returns the sequence produced by String#complementary_dna (per the
# original author's note this is really the reverse complement).
def reverse_seq(query_fasta, hit)
  hit.q_frame = -hit.q_frame
  hit.q_end = query_fasta.length - 1 - hit.q_end
  hit.q_beg = query_fasta.length - 1 - hit.q_beg
  hit.reversed = true # the TRUE constant no longer exists in Ruby >= 3.2
  query_fasta = query_fasta.complementary_dna # this actually builds the reverse complement
  if hit.class.to_s == 'ExoBlastHit'
    hit.q_frameshift.map! do |position, num_nts|
      [query_fasta.length - 1 - position, num_nts]
    end
  end
  query_fasta
end
76
114
 
77
115
 
78
116
  def corrige_frame(ref_frame,ref_start,ref_end)
@@ -89,6 +127,10 @@ module CommonFunctions
89
127
 
90
128
  end
91
129
 
92
-
93
-
94
- end
130
# Reports whether the query span of a hit is a whole number of codons.
#
# hit - object exposing q_beg and q_end (inclusive nucleotide coordinates).
#
# Returns the Integer remainder (0, 1 or 2) of the span length divided by
# three; anything other than 0 means the span does not cover complete codons.
def check_frame_shift(hit)
  prot_length_in_nts = hit.q_end - hit.q_beg + 1
  prot_length_in_nts % 3
end
136
+ end
@@ -0,0 +1,258 @@
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+ require 'blast_query'
23
+ require 'blast_hit'
24
+ require 'fl_string_utils.rb'
25
+
26
+ OPERATION = 0
27
+ QUERY = 1
28
+ TARGET = 2
29
+
30
# BlastHit specialisation for exonerate results: besides the regular hit
# data it keeps the frameshifts detected on the query (q_frameshift) and on
# the subject (s_frameshift). Both lists start empty.
class ExoBlastHit < BlastHit
  attr_accessor :q_frameshift, :s_frameshift

  # The four coordinates are handed to BlastHit#initialize untouched.
  def initialize(start_target, ends_target, start_query, ends_query)
    super
    @q_frameshift = []
    @s_frameshift = []
  end
end
38
+
39
# Parses exonerate output ("vulgar:" lines) and turns it into BLAST-like
# objects: one BlastQuery per target sequence, holding ExoBlastHit hits.
#
# Naming caveat kept from the original design: exonerate is run with the
# protein as query and the unigene as target, while the BLAST-like objects
# see the unigene as the query. Hence the exonerate *target* becomes the
# BlastQuery / q_* side and the exonerate *query* (protein) the subject /
# s_* side.
class ExonerateResult
  attr_accessor :querys

  # Parser initialization.
  #
  # input      - path, or Array of paths, of exonerate output files.
  # seqs       - Hash target_id => nucleotide sequence (unigenes), or nil.
  # query_seqs - Hash query_id => protein sequence, or nil.
  # all        - when true every vulgar line is kept; when false only the
  #              best scoring line of each target is kept.
  #
  # Alignments and identity are only rebuilt when both seqs and query_seqs
  # are given.
  def initialize(input, seqs = nil, query_seqs = nil, all = true)
    @querys = []
    @seqs = seqs            # unigenes
    @prot_seqs = query_seqs # proteins

    files = input.is_a?(Array) ? input : [input]
    # File.readlines closes the file; File.open(...).readlines leaked the handle.
    files.each { |file| parse_file(File.readlines(file), all) }
  end

  # Extracts the fields of every "vulgar:" line and converts them to hits.
  #
  # lines - Array of Strings (lines of one exonerate output file).
  # all   - see #initialize.
  def parse_file(lines, all)
    lines_parsed = all ? [] : {}
    lines.each do |raw_line|
      next unless raw_line =~ /^vulgar:/

      fields = raw_line.chomp.split(' ', 11)
      features = {
        'query_id'           => fields[1],
        'query_start_align'  => fields[2].to_i,
        'query_end_align'    => fields[3].to_i,
        'query_strand'       => fields[4],
        'target_id'          => fields[5],
        'target_start_align' => fields[6].to_i,
        'target_end_align'   => fields[7].to_i,
        'target_strand'      => fields[8],
        'score'              => fields[9].to_i,
        'align_data'         => fields[10]
      }
      if all
        lines_parsed << features
      else
        # Keep the first line seen for a target unless a later one scores higher.
        best = lines_parsed[features['target_id']]
        lines_parsed[features['target_id']] = features if best.nil? || features['score'] > best['score']
      end
    end
    convert_parsed_lines(lines_parsed)
  end

  # Builds the BlastQuery/ExoBlastHit objects from the parsed lines.
  #
  # lines_parsed - Array of feature hashes, or Hash target_id => features.
  #
  # Consecutive lines with the same target share one BlastQuery. A line
  # that raises while being converted is reported on stdout and skipped.
  def convert_parsed_lines(lines_parsed)
    last_query = nil
    query = nil
    lines_parsed.each do |line|
      begin
        # Iterating a Hash yields [key, value] pairs.
        features = lines_parsed.is_a?(Array) ? line : line[1]
        tags = features['align_data'].scan(/([MFG53S]) ([0-9]+) ([0-9]+)/)
        tags.map! { |tag| [tag[0], tag[1].to_i, tag[2].to_i] }
        if features['target_id'] != last_query
          last_query = features['target_id']
          query = BlastQuery.new(features['target_id'])
          @querys << query
        end
        hiting(features, tags, query)
      rescue StandardError
        puts "Result: #{features['target_id']} => #{features['query_id']} hasn't been parsed\n#{line}"
      end
    end
  end

  # Converts the relative exonerate coordinates into absolute, BLAST-like
  # ones, creates the hit, adds it to +query+ and records frameshifts (and
  # the alignment strings when the sequences are available).
  #
  # This method only works properly with exonerate's --model protein2dna.
  #
  # features - Hash built by #parse_file.
  # tags     - Array of [operation, query_length, target_length].
  # query    - BlastQuery that receives the hit.
  def hiting(features, tags, query)
    do_align = !@prot_seqs.nil? && !@seqs.nil?
    reversed_strand = features['target_strand'] == '-'

    start_target = features['target_start_align'] # unigene
    start_query = features['query_start_align']   # protein
    ends_target = features['target_end_align']
    ends_query = features['query_end_align'] - 1  # exonerate end coordinates are not 0-based
    if reversed_strand
      start_target -= 1 # on the reverse strand the "start" is really the end
    else
      ends_target -= 1
    end
    hit = ExoBlastHit.new(start_target + 1, ends_target + 1, start_query + 1, ends_query + 1)
    define_hit_parameters(hit, features, tags)
    query.add_hit(hit)

    # Define alignment and BLAST-like parameters
    target_alignment = ''
    query_alignment = ''
    counter_target = start_target
    counter_query = start_query
    query_seq = nil
    target_seq = nil
    if do_align
      query_seq = @prot_seqs[features['query_id']]
      target_seq = @seqs[features['target_id']]
    end
    # NOTE(review): a reverse-strand hit needs the target sequence; without
    # it this call raises and #convert_parsed_lines reports the line as not
    # parsed (behaviour unchanged from the original).
    counter_target, target_seq = do_reverso_complementary(counter_target, target_seq) if reversed_strand

    query_frameshift = []
    target_frameshift = []
    gap_shift = 0
    tags.each_with_index do |tag, n_operation|
      if do_align
        gap_shift = 0 if tag[OPERATION] != 'G'
        query_alignment << query_seq[counter_query, tag[QUERY]]
        target_alignment << target_seq[counter_target, tag[TARGET]].translate
      end

      if tag[OPERATION] == 'F'
        if tag[TARGET] > 0 && tag[TARGET] < 3 # true frameshift
          gap_shift += 1
          # Some frameshifts are not followed by a gap operation; insert
          # the gap here. A frameshift that is the LAST operation has no
          # successor at all (this used to raise on nil).
          next_tag = tags[n_operation + 1]
          if next_tag.nil? || next_tag[OPERATION] != 'G'
            query_alignment << '-' if do_align
          end
        else
          query_alignment << '-' * (tag[TARGET] / 3.0).ceil if do_align
        end
        query_frameshift << counter_query

        fs_counter_target = counter_target
        # Workaround: reverse hits are parsed by adding to the counter, so
        # the position has to be mirrored back.
        fs_counter_target = target_seq.length - counter_target if reversed_strand
        if tag[TARGET] > 3
          real_fs = tag[TARGET] % 3
          real_gap = tag[TARGET] - real_fs
          fs = [fs_counter_target + real_gap, real_fs]
        else
          fs = [fs_counter_target, tag[TARGET]]
        end
        target_frameshift << fs
      elsif tag[OPERATION] == 'G'
        query_alignment << '-' * (tag[TARGET] / 3.0).ceil if do_align
        diff = tag[QUERY] - gap_shift
        target_alignment << '-' * diff if do_align && diff > 0
        gap_shift = 0
      end

      counter_query += tag[QUERY]
      counter_target += tag[TARGET]
    end

    # exonerate query (protein) is the BLAST subject and vice versa
    hit.s_frameshift = query_frameshift
    hit.q_frameshift = target_frameshift

    if do_align
      hit.q_seq = target_alignment
      hit.s_seq = query_alignment
      hit.align_len = query_alignment.length
      hit.ident = set_ident(target_alignment, query_alignment)
    end
  end

  # Mirrors a position against the last index of target_seq and returns it
  # together with target_seq.complementary_dna.
  def do_reverso_complementary(counter_target, target_seq)
    counter_target = target_seq.length - 1 - counter_target
    target_seq = target_seq.complementary_dna
    return counter_target, target_seq
  end

  # Percentage (two decimals) of positions of target_alignment that hold
  # the same character as query_alignment. Returns 0.0 for an empty alignment.
  def set_ident(target_alignment, query_alignment)
    return 0.0 if target_alignment.empty?

    matchs = 0
    position = 0
    target_alignment.each_char do |char|
      matchs += 1 if char == query_alignment[position]
      position += 1
    end
    ('%.2f' % (matchs * 100.0 / target_alignment.length)).to_f
  end

  # Fills the BLAST-like attributes of +hit+ that can be derived from the
  # vulgar line alone. Values exonerate does not provide are zeroed/blanked.
  def define_hit_parameters(hit, features, tags)
    hit.gaps = tags.count { |aln| aln[0] == 'G' }
    hit.reversed = false
    hit.align_len = (features['query_end_align'] - features['query_start_align']).abs + 1
    hit.mismatches = 0
    hit.e_val = 0
    hit.bit_score = 0
    hit.score = features['score']
    hit.s_frame = nil
    strand = features['target_strand'] == '-' ? -1 : 1
    hit.q_frame = ((features['target_start_align'] % 3) + 1) * strand
    hit.subject_id = features['query_id']
    hit.full_subject_length = 0
    hit.definition = ''
    hit.acc = features['query_id']
    hit.q_seq = ''
    hit.s_seq = ''
  end

  # inspect results
  def inspect
    res = "Exonerate results:\n"
    res += '-' * 20
    res += "\nQuerys: #{@querys.count}\n"
    @querys.each { |q| res += q.inspect + "\n" }
    res
  end

  # find query by name; returns nil when there is none
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # check if there are querys
  def empty?
    @querys.empty?
  end

  # get query count
  def size
    @querys.size
  end
end
@@ -1,688 +1,297 @@
1
-
1
+ require 'types'
2
2
  require 'une_los_hit'
3
3
 
4
4
  module FlAnalysis
5
5
 
6
- def analiza_orf_y_fl(seq, blast_query, options, db_name)
7
- aas_n_end = options[:distance]
8
- pident_threshold = options[:ident]
9
- evalue_threshold = options[:evalue]
10
- # @verbose = options[:verbose]
11
-
12
- # test_blast_hits(blast_query)
13
-
14
- # used to detect if the sequence and the blast are from different query
15
- if seq.seq_name != blast_query.query_def
16
- raise "BLAST query name and sequence are different"
6
+ $global_warnings = []
7
+
8
+ def analiza_orf_y_fl(seq, hit, options, db_name)
9
+ query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
10
+ if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
11
+ seq_unida = UneLosHit.new(hit, query_fasta)
12
+ full_prot = seq_unida.full_prot
13
+ query_fasta = seq_unida.output_seq # repaired fasta
14
+ final_hit = seq_unida.final_hit # single hit
15
+ $global_warnings += seq_unida.msgs # warning messages
16
+ else
17
+ query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
18
+ final_hit = hit.first # single hit
17
19
  end
18
-
19
- q=blast_query
20
- msgs = ''
21
- atg_status = ''
22
- end_status = ''
23
- final_status = ''
24
-
25
- # the fasta sequence is saved
26
- query_fasta = seq.seq_fasta
20
+ query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate]
27
21
 
28
- if q.hits[0].nil? # There is no match in blast, the seq go to the next DB
29
- # puts "#{db_name} -- #{q.query_def} --> NO BLASTX match"
30
-
31
- # If the DB is trembl and the seq has annotations from other DB the annotations must be printed
32
- if (db_name =~ /^tr_/)
33
- if (seq.get_annotations(:tmp_annotation).empty?)
34
- if (seq.sec_desc.empty?)
35
- seq.annotate(:apply_tcode,'')
36
- else
37
- seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
38
- end
39
- else
40
- save_last_db_annotations(seq)
41
- end
42
- end
43
-
44
- return
45
- end
46
- #----------------------------------------------------------------------------------------------------------
47
- warnings = ''
48
- errors = ''
49
- wrong_seq = false
22
+ full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
23
+ original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
24
+ seq.show_alignment(final_hit, query_fasta, show_nts) if $verbose > 2 ## VERBOSE
25
+ atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
26
+ end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)
50
27
 
51
- # if the sequence has more than one hit, the frames are checked and fixed to get an single hit
52
- if (q.hits.count > 1)
53
-
54
- seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)
55
-
56
- wrong_seq = seq_unida.wrong_seq
57
- is_ok = seq_unida.is_ok
58
- q_index_start = seq_unida.q_index_start
59
- full_prot = seq_unida.full_prot
60
-
61
- query_fasta = seq_unida.output_seq # repaired fasta
62
-
63
- final_hit = seq_unida.final_hit # single hit
64
- msgs = seq_unida.msgs # warning messages
65
- x_number = seq_unida.number_x # number of nucleotides used to fix frame errors
66
-
67
- else # if there is only one hit
28
+ puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
29
+ seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if $verbose > 1 ## VERBOSE
30
+ puts "ATG: #{atg_status} STOP: #{end_status}" if $verbose > 2 ## VERBOSE
68
31
 
69
- if (q.hits[0].q_frame.to_i < 0) # si la secuencia esta al reves le damos la vuelta
70
- (query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end)
71
- q.hits[0].reversed = true
72
- end
73
-
74
- final_hit = q.hits[0] # single hit
75
- x_number = 0 # number of nucleotides used to fix frame errors
76
-
77
- full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
78
- (is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
79
- end
80
- # test_final_hit(final_hit, query_fasta)
81
- #----------------------------------------------------------------------------------------------------------
82
- if wrong_seq
83
- warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
84
- # puts "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence"
85
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#1\tcontains sense and antisense hits!!!, putative chimeric sequence, "
86
- error_log(q, seq, warnings, db_name)
87
- return
88
- end
89
- #----------------------------------------------------------------------------------------------------------
90
- warnings += msgs
91
- msgs = ''
92
- #----------------------------------------------------------------------------------------------------------
93
- if (x_number < 0)
94
- warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
95
- # puts "ERROR#2, unexpected negative index in x_number"
96
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#2\tunexpected negative index in x_number, "
97
- error_log(q, seq, warnings, db_name)
98
- return
99
- end
100
- #----------------------------------------------------------------------------------------------------------
101
- if (!is_ok)
102
- warnings = "ERROR#3, very serious frame error, " + warnings
103
- # puts "#{q.query_def} ERROR#3, hit was NOT found in the protein"
104
- errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#3\thit was NOT found in the protein, "
105
- # error_log(q, seq, warnings, db_name)
106
- # return
107
- end
108
- #----------------------------------------------------------------------------------------------------------
109
- fiable = false
110
- if ((final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold))
111
- fiable = true
32
+ # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
33
+ type, status = determine_status(atg_status, end_status)
34
+ status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
35
+ if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
36
+ save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
112
37
  end
113
- # if the query protein is large enough at the start of the sequence should have the start codon
114
- if (final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i)
115
- substring = full_prot[0, q_index_start + 10]
116
- resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]
38
+ end
117
39
 
118
- # to look for the beginning of the protein
119
- (m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)
120
40
 
121
- # pasting the substring sequence with the rest of the sequence
122
- tmp_prot = "#{m_substring}#{resto_substring}"
123
- # to get the value of the start_ORF index
124
- final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3)
41
+ def set_start_codon(final_hit, distance, full_prot, query_fasta)
42
+ q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot)
43
+ atg_status = nil
44
+ _5prima = q_index_start + distance
45
+
46
+ if final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
47
+ atg_status = 'complete'
48
+ tmp_prot = full_prot[q_index_start..full_prot.length]
49
+ elsif _5prima >= final_hit.s_beg
50
+ amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
51
+ carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
52
+ length_before_cut = amine_seq.length
53
+ amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
54
+ tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
55
+ new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
56
+ modify_5p_align(new_q_beg, final_hit, query_fasta) if $verbose > 1 ## VERBOSE, Modify query align
57
+ final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
125
58
  else
126
- # if (@verbose)
127
- # puts "beginning too short!"
128
- # end
129
-
59
+ $global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
130
60
  atg_status = 'incomplete'
131
- substring = full_prot[0, q_index_start]
132
- distance_s_atg = (final_hit.s_beg.to_i - final_hit.q_beg/3) + 1
133
-
134
- if (substring.rindex('*'))
135
- warnings += "Unexpected stop codon in the beginning of your sequence, "
136
- # if (@verbose)
137
- # puts "#{db_name} -- #{q.query_def} --> Unexpected stop codon in the beginning of your sequence"
138
- # end
139
- end
140
-
141
- final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
142
61
  tmp_prot = full_prot
143
62
  end
144
- #----------------------------------------------------------------------------------------------------------
145
- # look for the end of the protein
146
- (resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
147
- #----------------------------------------------------------------------------------------------------------
148
- final_prot = "#{resto_substring}#{end_substring}"
149
-
150
- warnings += msgs
151
-
152
- # to get the value of the end_ORF index
153
- if (atg_status == 'complete')
154
- final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
155
- else
156
- if (putative_end)
157
- final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
158
- end
159
- end
160
-
161
- #--------------------------------------------------------------------------------------------------------------
162
- # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
163
- final_status = determine_status(atg_status,end_status)
164
- #----------------------------------------------------------------------------------------------------------
165
- if (final_prot.length - 2*aas_n_end > final_hit.full_subject_length)
166
- warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
167
63
 
168
- elsif (final_prot.length + aas_n_end < final_hit.full_subject_length)
169
- warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
170
- if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
171
-
172
- if (final_status == 'Complete')
173
- final_status = 'Putative Complete'
174
- warnings += ". Was predicted as Complete, but is very much shorter than de subject"
175
- # if (@verbose)
176
- # puts "#{db_name} -- #{q.query_def} --> your sequence is 100 aas shorter than the subject or shorter than the half length of the subject"
177
- # end
178
- end
179
- end
180
- end
181
-
182
- # test_final_hit(final_hit, query_fasta)
183
- print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
184
-
185
- end
186
-
187
-
188
- def test_blast_hits(q)
189
-
190
- puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"
191
-
192
- q.hits.each do |h|
193
- puts "\t subject_id: #{h.acc}"
194
- puts "\t acc: #{h.acc}"
195
- puts "\t full_subject_length: #{h.full_subject_length}"
196
- puts "\t q_beg: #{h.q_beg + 1}"
197
- puts "\t q_end: #{h.q_end + 1}"
198
- puts "\t q_frame: #{h.q_frame}"
199
- puts "\t s_beg: #{h.s_beg + 1}"
200
- puts "\t s_end: #{h.s_end + 1}"
201
- puts "\t s_frame: #{h.s_frame}"
202
- puts "\t align_len: #{h.align_len}"
203
- puts "\t gaps: #{h.gaps}"
204
- puts "\t mismatches: #{h.mismatches}"
205
- puts "\t reversed: #{h.reversed}"
206
- puts "\t score: #{h.score}"
207
- puts "\t bit_score: #{h.bit_score}"
208
- puts "\t ident: #{h.ident}"
209
- puts "\t e_val: #{h.e_val}"
210
- puts "\t definition: #{h.definition}"
211
- puts "\t q_seq: #{h.q_seq}"
212
- puts "\t s_seq: #{h.s_seq}"
213
-
214
- end
215
-
216
- end
217
-
218
-
219
- def test_final_hit(final_hit, query_fasta)
220
-
221
- puts "\t acc: #{final_hit.acc}"
222
- puts "\t full_subject_length: #{final_hit.full_subject_length}"
223
-
224
- puts "\n\t q_frame: #{final_hit.q_frame}"
225
- puts "\t reversed: #{final_hit.reversed}"
226
-
227
- puts "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}"
228
- puts "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}"
229
-
230
- puts "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}"
231
-
232
- puts "\n\t definition: #{final_hit.definition}"
233
- puts "\t q_seq: #{final_hit.q_seq}"
234
- puts "\t s_seq: #{final_hit.s_seq}"
235
-
236
- puts "\nnt q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end]}"
237
- puts "\n\nprot q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end].translate}"
238
-
64
+ return atg_status, tmp_prot
239
65
  end
240
66
 
241
67
 
242
- def error_log(q, seq, warnings, db_name)
243
- # seq.annotate(:error,"#{q.query_def}\t#{warnings}\t#{q.hits[0].definition}")
244
-
245
- if (db_name =~ /^tr_/)
246
- if (seq.get_annotations(:tmp_annotation).empty?)
247
- if (seq.sec_desc.empty?)
248
- if (!q.hits[0].definition.nil?)
249
- warnings = "Coding sequence with some errors, #{warnings}"
250
- seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
251
- seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
252
- else
253
- seq.annotate(:apply_tcode,'')
254
- end
255
- else
256
- warnings = "Coding sequence with some errors, #{warnings}"
257
- tmp_annot = seq.sec_desc.sub('my_warning',"#{warnings}")
258
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
259
- end
260
- else
261
- save_last_db_annotations(seq)
262
- end
263
- else
264
- if (seq.sec_desc.empty?)
265
- if (!q.hits[0].definition.nil?)
266
- warnings = "Coding sequence with some errors, #{warnings}"
267
- seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
268
- end
269
- end
270
- end
271
-
272
- end
273
-
274
-
275
- def save_last_db_annotations(seq)
276
-
277
- # puts "sequence not complete! recovering annotations from previous database! sldba!!"
278
- (q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
279
- print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
280
-
281
- (name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
282
- if (final_hit.reversed)
283
- (kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
284
- end
285
-
286
- seq.annotate(:protein,seq.get_annotations(:tmp_annotation).first[:message][1])
287
- seq.annotate(:alignment,seq.get_annotations(:tmp_annotation).first[:message][2])
288
- tmp_annot = "#{name}\t#{fasta_length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
289
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
290
-
291
- end
292
-
293
-
294
- def find_start(subject_start, substring, fiable, aas_n_end)
295
-
296
- tmp_prot = ''
297
- msgs = ''
298
- atg_status = 'incomplete' # complete, incomplete or putative
299
-
300
- # puts "\nsubstring (#{substring.length} aas):\n#{substring}"
301
- stop_codon = substring.rindex('*')
302
-
303
- # marcamos la distancia al s_beg desde el principio del substring
304
- # s_beg_distance = (substring.length) - subject_start
305
- s_beg_distance = (substring.length - 10) - subject_start
306
- # marcamos la distancia al s_beg desde el final del substring
307
- atg_distance = (subject_start + 1) - (substring.length - 10)
308
- if (atg_distance <= 0)
309
- atg_distance = 0
310
- else
311
- # puts "expected atg_distance = 0, your sequence atg_distance = #{atg_distance}; limit (1-15)"
312
- msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, "
313
- end
314
-
315
- # puts "s_beg_distance:#{s_beg_distance}, stop_codon: #{stop_codon}, subject_start: #{subject_start + 1}, atg_distance: #{atg_distance}"
316
- #----------------------------------------------------------------------------------------------------------
317
- # tenemos un codon de parada en el substring 5 prima
318
- if (stop_codon)
319
- stop_codon += 1
320
- # ahora vamos a ver si el stop esta antes o despues del s_beg
321
- if (stop_codon <= s_beg_distance) # esta antes
322
- substring = substring[stop_codon, substring.length - stop_codon]
323
- # puts "\nhay un codon de parada en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon +1}\n#{substring}\n\n"
324
-
325
- first_m = substring.index('M')
326
-
327
- if (first_m) # tenemos M y stop ---------------------------------------------------------------------------
328
- substring = substring[first_m, substring.length - first_m]
329
-
68
+ def find_start(subject_start, amine_seq, distance)
69
+ atg_status = 'putative' # complete, incomplete or putative
70
+ stop_codon = amine_seq.rindex('*')
71
+ if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
72
+ _5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
73
+ amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
74
+ first_m = amine_seq.index('M')
75
+ if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
76
+ if first_m # tenemos M
77
+ amine_seq = amine_seq[first_m .. amine_seq.length - 1]
330
78
  atg_status = 'complete'
331
- else # con STOP pero sin M --------------------------------------------------------------------------------
332
- atg_status = 'putative'
333
- # puts "there is not a start codon near the expected beginning of your sequence, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
334
- msgs += "W1: There is no M at the beginning, "
79
+ else # con STOP pero sin M
80
+ $global_warnings << 'noM1'
335
81
  end
336
- #----------------------------------------------------------------------------------------------------------
337
82
  else # esta despues, un cambio de fase impide analizar el principio
338
- substring = substring[stop_codon, substring.length - stop_codon] # comentar?
339
- first_m = substring.index('M') # comentar?
340
- if (first_m) # tenemos M y unexpected stop # comentar?
341
- substring = substring[first_m, substring.length - first_m] # comentar?
342
- end # comentar?
343
- # TODO esto se puede cambiar!
344
- atg_status = 'putative'
345
- msgs += " Unexpected STOP codon in 5 prime region, "
346
- # puts "\nhay un codon de parada inesperado en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon}, s_beg_distance: #{s_beg_distance +1}, atg_distance: #{atg_distance}"
83
+ $global_warnings << 'UnexpSTOP5p'
84
+ amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
347
85
  end
348
- #---------------------------------------------------------------------------------------------------------------
349
86
  else # no hay stop codon
350
- first_m = substring.index('M')
351
- if (first_m) # tenemos M, sin stop
352
- m_distance = subject_start - (substring.length - 10 - first_m)
353
- substring = substring[first_m, substring.length - first_m]
354
- # m_distance = [first_m+1,s_beg_distance].max - [first_m+1,s_beg_distance].min
355
-
356
- if (m_distance > aas_n_end*2) # sin STOP, con atg pero muy lejos del inicio que marca el subject ---------------
357
- # puts "No stop codon before M and M found is too far from subject M, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
358
- msgs += "No stop codon before M and M found is too far from subject M, "
87
+ first_m = amine_seq.index('M')
88
+ if first_m # tenemos M
89
+ amine_seq = amine_seq[first_m .. amine_seq.length - 1]
90
+ m_distance = (subject_start - amine_seq.length).abs - 10
91
+ if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
92
+ $global_warnings << 'NoStopMfar'
359
93
  atg_status = 'incomplete'
360
- else
361
- if (fiable) # Tenemos M y aunque no hay STOP condon el ortologo es fiable ----------------------------------
362
- # msgs += "No stop codon before M but high homology subject, "
363
- atg_status = 'complete'
364
- else # Tenemos M pero no tenemos stop y el ortologo no es fiable -------------------------------------------
365
- # puts "No stop codon before M and low homology subject, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}"
366
- msgs += "No stop codon before M and low homology subject, "
367
- atg_status = 'putative'
368
- end
94
+ else # Tenemos M
95
+ atg_status = 'complete'
369
96
  end
370
- else # sin M ni STOP -------------------------------------------------------------------------------------------
371
- atg_status = 'putative'
372
- # puts "your sequence has the subject beginning but there is not start codon at the beginning, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}"
373
- msgs += "W2: There is no M at the beginning, "
97
+ else # sin M
98
+ $global_warnings << 'noM2'
374
99
  end
375
100
  end
376
-
377
- return [substring, atg_status, msgs]
378
-
101
+ return amine_seq, atg_status
379
102
  end
380
103
 
381
104
 
382
- def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
383
- # aqui vemos lo que queda sin similitud hasta el final
384
- s_end_resto = (final_hit.full_subject_length - (final_hit.s_end.to_i + 1)) # en el subject, numero de aas que necesito cubrir
385
- q_end_resto = (q.full_query_length.to_i - final_hit.q_end.to_i)/3 # en el query, numero de aas que tengo
386
- sq_end_distance = q_end_resto - s_end_resto
387
-
388
- cut_in_5p = full_prot.length - tmp_prot.length
389
-
390
- resto_substring = tmp_prot[0..final_hit.q_end/3 - cut_in_5p - 16]
391
- end_substring = tmp_prot[final_hit.q_end/3 - cut_in_5p - 15..tmp_prot.length]
392
- putative_end = end_substring.index('*')
393
-
394
- # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
395
- if (sq_end_distance + aas_n_end < 0)
105
+ def find_end(final_hit, max_distance, tmp_prot, query_fasta)
106
+ frame_shift = check_frame_shift(final_hit)
107
+ beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
108
+ atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
109
+ end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
110
+ #puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3 - max_distance: #{max_distance}\e[0m"
111
+ #puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
112
+ #puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
113
+ if beg_end_string < 0 || end_substring.nil? #Sequences whose homology reaches their very end, so the 3' part of the unigene doesn't exist
114
+ atg_substring = tmp_prot
115
+ end_substring = ''
396
116
  end_status = 'incomplete'
397
- if (putative_end)
398
- warnings += " Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas, "
399
- end_substring = end_substring[0, putative_end+1] # comentar?
400
- # if (@verbose)
401
- # puts "#{db_name} -- #{q.query_def} --> Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas"
402
- # end
403
- else
404
- warnings += "Distance to subject end: #{sq_end_distance.abs} aas, "
405
- # if (@verbose)
406
- # puts "#{db_name} -- #{q.query_def} --> Distance to subject end: #{sq_end_distance.abs} aas"
407
- # end
408
- end
117
+ else
118
+ end_status = 'putative'
119
+ putative_end = end_substring.index('*')
120
+ end_substring = end_substring[0 .. putative_end] if putative_end
121
+
122
+ s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
123
+ q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo
124
+ sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuencia este truncada por 5'
409
125
 
410
- else # tenemos suficiente secuencia
411
- if (putative_end) # tenemos un stop
412
- q_stop_resto = (putative_end - 15) # distancia entre el stop y el q_end, si es negativo el stop esta antes del q_end
413
- qs_stop_distance = q_stop_resto - s_end_resto # distancia entre los stops del q y el s
414
-
415
- # puts "putative_end: #{putative_end}, q_stop_resto: #{q_stop_resto}, qs_stop_distance: #{qs_stop_distance}"
416
-
417
- if (qs_stop_distance + aas_n_end >= 0) # si q_end esta a menos de 15 aas antes o esta despues del s_end; complete
418
- end_status = 'complete'
419
- elsif (qs_stop_distance + 2*aas_n_end < 0) # si q_end es mas de 30 aas menor que el s_end; putative/Putative chimeric seq
420
- end_status = 'putative'
421
- warnings += " query STOP codon too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence, "
422
- # if (@verbose)
423
- # puts "#{db_name} -- #{q.query_def} --> query STOP too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence"
424
- # end
425
- elsif (qs_stop_distance + aas_n_end < 0) # si q_end es mas de 15 aas menor pero menos de 30 que el s_end; putative
426
- end_status = 'putative'
427
- warnings += " query STOP codon is far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, "
428
- # if (@verbose)
429
- # puts "#{db_name} -- #{q.query_def} --> query STOP far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas"
430
- # end
126
+ if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance*2 is set by the margin of +-15 aa at the end of the alignment
127
+ end_status = 'complete'
128
+ elsif sq_end_distance < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
129
+ end_status = 'incomplete'
130
+ if putative_end
131
+ $global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
132
+ else
133
+ $global_warnings << ['DistSubj', sq_end_distance.abs]
134
+ end
135
+ else # tenemos suficiente secuencia
136
+ if putative_end # tenemos un stop
137
+ #beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
138
+ stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
139
+ if stop_q_s.abs <= max_distance #Stop codon is in search region
140
+ end_status = 'complete'
141
+ elsif stop_q_s < 0
142
+ $global_warnings << 'UnexpSTOP3p'
143
+ elsif stop_q_s > 0
144
+ end_status = 'complete'
145
+ $global_warnings << 'QueryTooLong'
146
+ end
147
+ else # no tenemos codon de parada pero tenemos suficiente secuencia
148
+ end_status = 'incomplete'
149
+ $global_warnings << 'ProtFusion'
431
150
  end
432
- end_substring = end_substring[0, putative_end+1]
433
-
434
- else # no tenemos codon de parada pero tenemos suficiente secuencia
435
- end_status = 'putative'
436
- warnings += " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
437
- # if (@verbose)
438
- # puts "#{db_name} -- #{q.query_def} --> STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas"
439
- # end
440
151
  end
441
-
442
152
  end
443
-
444
- return [resto_substring, end_substring, end_status, warnings, putative_end]
153
+ final_prot = atg_substring + end_substring
154
+ end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
155
+ new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
156
+ modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if $verbose > 1
157
+ final_hit.q_end = new_q_end
158
+ return end_status, final_prot
445
159
  end
446
160
 
447
161
 
448
- def determine_status(atg_status,end_status)
449
-
450
- if (atg_status == 'complete') && (end_status == 'complete') # proteina completa
451
- final_status = 'Complete'
452
- elsif (atg_status == 'putative' && end_status == 'complete') || (atg_status == 'complete' && end_status == 'putative') || (atg_status == 'putative' && end_status == 'putative') # comienzo y/o final putative
453
- final_status = 'Putative Complete'
454
- elsif (atg_status == 'incomplete') && (end_status == 'incomplete') # region intermedia
455
- final_status = 'Internal'
456
- elsif (atg_status == 'complete') && (end_status == 'incomplete') # tenemos el principio de la proteina
457
- final_status = 'N-terminus'
458
- elsif (atg_status == 'putative') && (end_status == 'incomplete') # puede que tengamos el principio de la proteina
459
- final_status = 'Putative N-terminus'
460
- elsif (atg_status == 'incomplete') && (end_status == 'complete') # tenemos el final de la proteina
461
- final_status = 'C-terminus'
462
- elsif (atg_status == 'incomplete') && (end_status == 'putative') # puede que tengamos el final de la proteina
463
- final_status = 'Putative C-terminus'
162
+ def determine_status(atg_status, end_status)
163
+ if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
164
+ type = COMPLETE
165
+ elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
166
+ type = INTERNAL
167
+ elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
168
+ type = N_TERMINAL
169
+ elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
170
+ type = C_TERMINAL
464
171
  end
465
172
 
466
- return final_status
467
- end
468
-
469
-
470
- def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
471
- name_diff = q.query_def.length - final_hit.acc.length
472
- if (name_diff > 0)
473
- spnum = ' '*name_diff.to_i
173
+ if atg_status == 'putative' || end_status == 'putative'
174
+ status = FALSE # Putative
474
175
  else
475
- spnum = ''
176
+ status = TRUE # Sure
476
177
  end
477
- #-------------------------------------------------------------------------------------------------------------------------------------
478
- # if the sequence is Complete will be printed --------------------------------------------------------------------
479
- if (final_status == 'Complete')
480
- seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
481
- print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
482
-
483
- if (final_hit.reversed)
484
- (kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
485
- end
486
- seq.annotate(:complete,"#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}")
487
- seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
488
- #-------------------------------------------------------------------------------------------------------------------------------------
489
- else # la proteina no esta completa -------------------------------------------------------------------------
490
- if (!seq.get_annotations(:tmp_annotation).empty?) && (!seq.get_annotations(:tmp_annotation).nil?) # ---> trae informacion de una bd anterior
491
- if (db_name =~/^tr_/) # ---> estamos usando el trembl, se dejan las anotaciones que trae
492
- # puts "#{db_name} -- #{q.query_def} --> print_annotations: sequence not complete! recovering annotations from previous database!"
493
- (kk1, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
494
- print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
495
178
 
496
- (name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
497
- if (final_hit.reversed)
498
- (kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
499
- end
500
-
501
- my_prot = seq.get_annotations(:tmp_annotation).first[:message][1]
502
- seq.annotate(:protein,my_prot)
503
- my_align = seq.get_annotations(:tmp_annotation).first[:message][2]
504
- seq.annotate(:alignment,my_align)
505
-
506
- tmp_annot = "#{name}\t#{query_fasta.length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
507
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
508
- #-----------------------------------------------------------------------------------------------------------------------------
509
- # elsif (db_name =~ /^sp_/) # ---> estamos usando el sp, se dejan las anotaciones que trae
510
-
511
- # puts "#{db_name} -- #{q.query_def} --> print_annotations: Mantenemos las anotaciones de la BD de usuario y pasamos la secuencia al trembl"
512
- end
513
- #-------------------------------------------------------------------------------------------------------------------------------------
514
- elsif (seq.get_annotations(:tmp_annotation).empty?) # ---> NO trae informacion de una bd anterior
515
- if (db_name =~ /^tr_/) # ---> estamos usando el trembl
516
- # puts "#{db_name} -- #{q.query_def} --> print_annotations: #{q.query_def} is not complete!! se anota con trembl"
517
- print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___)
179
+ return type, status
180
+ end
518
181
 
519
- if (final_hit.reversed)
520
- (kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
521
- end
522
182
 
523
- seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
524
- seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
525
- tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
526
- seq.annotate(:tmp_annotation,[tmp_annot, '','',''])
527
- #-------------------------------------------------------------------------------------------------------------------------------------
528
- else # cargamos anotaciones para la siguiente BD
529
- tmp_prot = ">#{q.query_def}\n#{final_prot}"
530
- tmp_align = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
531
- tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
532
- seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
533
- seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
534
-
535
- # puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
536
- # puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD"
183
+ def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
184
+ if final_prot.length - 2 * distance > final_hit.s_len
185
+ $global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
186
+ elsif final_prot.length + 2 * distance < final_hit.s_len
187
+ $global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
188
+ if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len
189
+ if type == COMPLETE
190
+ status = FALSE
191
+ $global_warnings << 'VeryShorter'
537
192
  end
538
193
  end
539
194
  end
195
+ return status
540
196
  end
541
197
 
542
198
 
543
- def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status)
544
-
545
- bad_atg = false
546
- #------------------------------------------------------------------------------------------------------------- ATG
547
-
548
- if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'Putative N-terminus') || (final_status == 'N-terminus')
549
- # puts "entra aqui, final_status: #{final_status}"
550
- my_seq_n = query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5]
551
-
552
- beg5 = false
553
- # ------------------------------------- si my_seq_n = nil puede ser porque q_beg sea < 5
554
- if (final_hit.q_beg < 6)
555
- my_seq_n = query_fasta[0..10]
556
- beg5 = true
557
- # puts "empieza en el borde de la seq"
199
+ def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
200
+ # The annotation is saved if the sequence is Complete or has no previous info
201
+ if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
202
+ seq.type = type
203
+ seq.status = status
204
+ seq.db_name = db_name
205
+ seq.seq_fasta = query_fasta
206
+ seq.seq_aa = final_prot
207
+ seq.hit = final_hit
208
+ seq.warnings($global_warnings)
209
+ $global_warnings = [] # Clean all warnings for current sequence
210
+ seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
211
+ if type == COMPLETE
212
+ seq.ignore = TRUE
558
213
  end
559
-
560
- atg_found = my_seq_n.index(/ATG/i)
561
- atg_found_rv = my_seq_n.rindex(/ATG/i)
562
- my_atg_index = nil
563
214
  end
564
-
565
- if (!atg_found.nil?)
566
- if (beg5)
567
-
568
- my_seq_n.sub!(/ATG/i,'_-_ATG')
569
- my_atg_index = atg_found
570
- my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1]
571
-
572
- elsif (atg_found == atg_found_rv)
573
-
574
- my_seq_n.sub!(/ATG/i,'_-_ATG')
575
- my_atg_index = final_hit.q_beg - 5 + atg_found
576
-
577
- my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
578
-
579
- # puts "my_seq despues de encontrar el atg: #{my_seq}"
580
- elsif (atg_found == 5) || (atg_found_rv == 5)
581
-
582
- my_seq_n = my_seq_n[0..4]+'_-_'+my_seq_n[5..10]
583
- my_atg_index = final_hit.q_beg - 5 + atg_found
584
- my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
585
-
586
- else
587
-
588
- # puts "#{q.query_def} tiene mas de un ATG my_seq_n: #{my_seq_n}"
589
- bad_atg = true
590
- my_seq = query_fasta
591
- end
592
-
593
- else
594
-
595
- bad_atg = true
596
- # puts "#{q.query_def} NO TIENE ATG my_seq_n: #{my_seq_n}"
597
- my_seq = query_fasta
598
-
215
+ if $verbose > 2
216
+ puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
599
217
  end
600
- #------------------------------------------------------------------------------------------------------------- STOP
601
- stop_c = nil
602
- if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
218
+ end
603
219
 
604
- if (bad_atg == true)
605
- stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
606
- stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
607
- else
608
- stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
609
- stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
610
- end
611
220
 
221
+ def mark_nt_seqs(final_hit, query_fasta)
222
+ atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
223
+ mark_atg = nil
224
+ if atg == 'ATG'
225
+ mark_atg = '_-_'
612
226
  end
227
+ stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
228
+ mark_stop = nil
229
+ if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
230
+ mark_stop = '___'
231
+ end
232
+ seq5p = query_fasta[0..final_hit.q_beg-1]
233
+ orf = query_fasta[final_hit.q_beg..final_hit.q_end]
234
+ seq3p = query_fasta[final_hit.q_end..query_fasta.length]
235
+ nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
236
+ return nt_seq
237
+ end
613
238
 
614
- if (!stop_c.nil?)
615
- # puts stop_c
616
- # puts stop_c_longer
617
- if (stop_c.translate == '*')
618
-
619
- if (bad_atg == true)
620
- my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
621
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
622
- else
623
-
624
- my_seq = my_seq[0..final_hit.q_end + 5] +'___'+ my_seq[final_hit.q_end + 6..my_seq.length + 1]
625
- my_prot = my_seq.sub(/\w+_\-_/,'')
626
- my_prot = my_prot.sub(/___\w+/,'')
627
- my_prot = my_prot.translate
628
- my_prot = my_prot.sub(/x$/,'')
629
-
630
- simliar_fragment = final_prot.lcs(my_prot)
631
-
632
- if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length)
633
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
634
- else
635
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
636
- # puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors"
239
+ def exonerate_fix_frame_shift(query_fasta, hit)
240
+ frame_shifts = []
241
+ added_nts = 0
242
+ hit.each_with_index do |hsp, num|
243
+ if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
244
+ if !hsp.q_frameshift.empty? #There is frameshift
245
+ hsp.q_frameshift.each do |position, num_nts|
246
+ local_add = 3 - num_nts
247
+ fs_final_position = position + num_nts
248
+ $global_warnings << ['ExFrameS', fs_final_position]
249
+ frame_shifts << [fs_final_position, local_add]
250
+ added_nts += local_add
637
251
  end
638
-
639
252
  end
253
+ end
254
+ hsp.q_beg += added_nts if num > 0
255
+ hsp.q_end += added_nts
256
+ end
257
+ add = 0
258
+ frame_shifts.each do |position, num_nts|
259
+ query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
260
+ add += num_nts
261
+ end
262
+ return query_fasta
263
+ end
640
264
 
641
- else
642
- if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')
643
265
 
644
- if (bad_atg == true)
645
- stop_c = my_seq[final_hit.q_end+1..final_hit.q_end+3]
646
- stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8]
647
- else
648
- stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9]
649
- stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13]
650
- end
651
-
652
- if (!stop_c.nil?)
653
- if (stop_c.translate == '*')
654
- final_hit.q_end = final_hit.q_end + 3
655
- if (bad_atg == true)
656
- my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
657
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
658
- else
659
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
660
- end
661
- else
662
- if (bad_atg == true)
663
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
664
- # puts "find nt end: NO ATG, NO exact STOP"
665
- else
666
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
667
- # puts "find nt end: GOOD ATG, NO exact STOP"
668
- end
669
- end
670
- end
671
- end
672
-
266
+ ## VERBOSE METHODS
267
+ def show_nts
268
+ show = FALSE
269
+ show = TRUE if $verbose && $verbose > 3
270
+ return show
271
+ end
673
272
 
674
- end
675
273
 
676
- else
274
+ def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
275
+ if new_q_end > final_hit.q_end #There is an align extension
276
+ extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
277
+ final_hit.q_seq = final_hit.q_seq + extend_align
278
+ elsif new_q_end < final_hit.q_end #The align is cut
279
+ upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
280
+ final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
281
+ end
282
+ end
677
283
 
678
- if (bad_atg == true)
679
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
680
- else
681
- seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
682
- end
683
284
 
285
+ def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
286
+ if new_q_beg < final_hit.q_beg #There is an align extension
287
+ extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
288
+ final_hit.q_seq = extend_align + final_hit.q_seq
289
+ elsif new_q_beg > final_hit.q_beg #The align is cut
290
+ seq_cut = (new_q_beg - final_hit.q_beg)/3
291
+ gaps = final_hit.q_seq[0..seq_cut].count('-')
292
+ seq_cut += gaps
293
+ final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
684
294
  end
685
-
686
295
  end
687
296
 
688
297
  end